Modern systems are complicated beasts with lots of interdependent activities between threads, programs and kernels. Figuring out some problems is nearly impossible without building some time machine and crystal ball mix that tells exactly what happened.
Did your cgroups CPU limit inject a sleep in the middle of mmap_sem acquisition by 'ps'? Is everyone waiting for a mutex that is held by someone who is waiting for a DNS response? Did you forget to lock your libnss.* libraries into memory and hence ended up stalling in an unexpected place under memory pressure?
I've grabbed Brendan Gregg's offcpu profiler and gutted it to the point where all it does is record the longest wait per stack trace, as well as the timestamp of the longest wait.
Some debugging sessions that would've taken hours, days or weeks before are now few-minute endeavors. It is still quite hacky, so I present it without too much additional polishing.
I do have plans in place to have a visualizing frontend for the output some day.
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bcc-py | |
# | |
# topwaits Show longest off-cpu waits per-stack | |
# | |
# Copyright 2019 Facebook, Inc. | |
# Copyright 2016 Netflix, Inc. | |
# Licensed under the Apache License, Version 2.0 (the "License") | |
# | |
# 13-Jan-2016 Brendan Gregg Wrote offcpu profiler | |
# 27-Nov-2019 Domas Mituzas Gutted most of profiling part and left stall detector | |
from __future__ import print_function | |
from bcc import BPF | |
from sys import stderr | |
from time import sleep, strftime | |
import argparse | |
import signal | |
import time | |
# Command-line interface. NOTE: the long-option strings in the original paste
# were smart-converted to en dashes ("–pid"); restored to ASCII "--" here.
parser = argparse.ArgumentParser(
    description="Find longest waiting stacks")
thread_group = parser.add_mutually_exclusive_group()
# -p filters by process (thread group) id; -t by a single thread id.
# Note the deliberate dest swap: the kernel calls the thread id "pid" and the
# process id "tgid", and the BPF filter below uses the kernel's naming.
thread_group.add_argument("-p", "--pid", metavar="PID", dest="tgid",
    help="trace this PID only", type=int)
thread_group.add_argument("-t", "--tid", metavar="TID", dest="pid",
    help="trace this TID only", type=int)
parser.add_argument("--stack-storage-size", default=65536,
    type=int,
    help="the number of unique stack traces that can be stored and "
    "displayed (default 65536)")
parser.add_argument("duration", nargs="?", default=99999999,
    type=int,
    help="duration of trace, in seconds")
parser.add_argument("-m", "--min-block-time", default=1,
    type=int,
    help="the amount of time in microseconds over which we " +
    "store traces (default 1)")
parser.add_argument("-M", "--max-block-time", default=(1 << 64) - 1,
    type=int,
    help="the amount of time in microseconds under which we " +
    "store traces (default U64_MAX)")
parser.add_argument("--state", type=int,
    help="filter on this thread state bitmask (eg, 2 == TASK_UNINTERRUPTIBLE" +
    ") see include/linux/sched.h")
args = parser.parse_args()
# The mutually-exclusive group already rejects "-p X -t Y"; keep the explicit
# check as a guard since the dests are swapped relative to the flag names.
if args.pid and args.tgid:
    parser.error("specify only one of -p and -t")
duration = int(args.duration)
# define BPF program (C source compiled by bcc at runtime; the *_VALUE,
# *_FILTER and STACK_STORAGE_SIZE placeholders are substituted before
# compilation). The original paste had an en dash in "ts – *tsp", which
# would not compile as C; restored to an ASCII minus.
bpf_text = """
#include <uapi/linux/ptrace.h>
#include <linux/sched.h>

#define MINBLOCK_US    MINBLOCK_US_VALUEULL
#define MAXBLOCK_US    MAXBLOCK_US_VALUEULL

struct key_t {
    u32 pid;
    int user_stack_id;
    int kernel_stack_id;
};
BPF_HASH(counts, struct key_t);
BPF_HASH(maximums, struct key_t);
BPF_HASH(timestamps, struct key_t);
BPF_HASH(start, u32);
BPF_STACK_TRACE(stack_traces, STACK_STORAGE_SIZE);

int oncpu(struct pt_regs *ctx, struct task_struct *prev) {
    u32 pid = prev->pid;
    u32 tgid = prev->tgid;
    u64 ts, *tsp, *valp;

    // record previous thread sleep time
    ts = bpf_ktime_get_ns();
    if ((THREAD_FILTER) && (STATE_FILTER)) {
        start.update(&pid, &ts);
    }

    // get the current thread's start time
    pid = bpf_get_current_pid_tgid();
    tgid = bpf_get_current_pid_tgid() >> 32;
    tsp = start.lookup(&pid);
    if (tsp == 0) {
        return 0;        // missed start or filtered
    }

    // calculate current thread's delta time in nanos
    u64 delta = ts - *tsp;
    start.delete(&pid);

    // nanos to micros
    delta = delta / 1000;
    if ((delta < MINBLOCK_US) || (delta > MAXBLOCK_US)) {
        return 0;
    }

    // create map key
    struct key_t key = {};
    key.pid = tgid;
    key.user_stack_id = stack_traces.get_stackid(ctx, BPF_F_USER_STACK);
    key.kernel_stack_id = stack_traces.get_stackid(ctx, 0);

    counts.increment(key, delta);

    valp = maximums.lookup(&key);
    if (valp == 0 || *valp < delta) {
        maximums.update(&key, &delta);
        timestamps.update(&key, &ts);
    }
    return 0;
}
"""
# Substitute runtime configuration into the BPF C source.
thread_filter = '1'  # default: no thread filtering, trace everything
# set thread filter
if args.tgid is not None:
    thread_filter = 'tgid == %d' % args.tgid
elif args.pid is not None:
    thread_filter = 'pid == %d' % args.pid

# --state 0 needs an explicit equality check (0 is falsy in Python, so the
# plain "elif args.state" bitmask branch would never see it).
if args.state == 0:
    state_filter = 'prev->state == 0'
elif args.state:
    # these states are sometimes bitmask checked
    state_filter = 'prev->state & %d' % args.state
else:
    state_filter = '1'
bpf_text = bpf_text.replace('THREAD_FILTER', thread_filter)
bpf_text = bpf_text.replace('STATE_FILTER', state_filter)

# set stack storage size and the block-time window (microseconds)
bpf_text = bpf_text.replace('STACK_STORAGE_SIZE', str(args.stack_storage_size))
bpf_text = bpf_text.replace('MINBLOCK_US_VALUE', str(args.min_block_time))
bpf_text = bpf_text.replace('MAXBLOCK_US_VALUE', str(args.max_block_time))
# initialize BPF and hook the scheduler's context-switch path; oncpu() fires
# on every switch with the task being switched out as "prev".
b = BPF(text=bpf_text)
b.attach_kprobe(event="finish_task_switch", fn_name="oncpu")
matched = b.num_open_kprobes()
if matched == 0:
    print("error: 0 functions traced. Exiting.", file=stderr)
    exit(1)

# Trace for the requested duration (default is effectively "forever").
try:
    sleep(duration)
except KeyboardInterrupt:
    # as cleanup can take many seconds, trap Ctrl-C:
    pass
signal.signal(signal.SIGINT, signal.SIG_IGN)

# Pull the maps populated by the BPF program.
counts = b.get_table("counts")
stack_traces = b.get_table("stack_traces")
maximums = b.get_table("maximums")
timestamps = b.get_table("timestamps")

# bpf_ktime_get_ns() timestamps are relative to boot; derive the epoch time
# of boot from /proc/uptime so waits can be reported as wall-clock times.
basetime = time.time() - float(open("/proc/uptime").read().split()[0])
def memoize(f):
    """Decorator: cache f's results keyed by its positional arguments.

    Symbol resolution is comparatively expensive and the same addresses
    recur across many stacks, so results are cached without bound for the
    life of the process.
    """
    class memodict(dict):
        def __init__(self, f):
            self.f = f

        def __call__(self, *args):
            # plain dict lookup; __missing__ computes and stores on a miss
            return self[args]

        def __missing__(self, key):
            ret = self[key] = self.f(*key)
            return ret
    return memodict(f)
@memoize
def get_sym(addr, pid):
    """Resolve user-space address *addr* in process *pid* to a symbol name.

    BPF.sym() returns bytes; decode and drop everything from the first "("
    so stacks print as bare function names. Memoized because the same
    (addr, pid) pairs repeat across stack traces.
    """
    val = b.sym(addr, pid)
    return val.decode().split("(")[0]
# Emit one line per unique (pid, user stack, kernel stack) key:
# cumulative wait (us), longest single wait (us), wall-clock time of that
# longest wait, kernel stack, user stack. A negative stack id means the
# stack could not be collected, so print an empty tuple for it.
for k, v in counts.items():
    user_stack = tuple([] if k.user_stack_id < 0 else
        [get_sym(x, k.pid) for x in stack_traces.walk(k.user_stack_id)])
    kernel_stack = tuple([] if k.kernel_stack_id < 0 else
        [b.ksym(x) for x in stack_traces.walk(k.kernel_stack_id)])
    # stored timestamp is ns since boot; convert to epoch seconds
    ts = timestamps[k].value / 1000000000 + basetime
    print(v.value, maximums[k].value, ts, kernel_stack, user_stack)