1 | /* $NetBSD: lockstat.c,v 1.24 2015/08/20 14:40:17 christos Exp $ */ |
2 | |
3 | /*- |
4 | * Copyright (c) 2006, 2007 The NetBSD Foundation, Inc. |
5 | * All rights reserved. |
6 | * |
7 | * This code is derived from software contributed to The NetBSD Foundation |
8 | * by Andrew Doran. |
9 | * |
10 | * Redistribution and use in source and binary forms, with or without |
11 | * modification, are permitted provided that the following conditions |
12 | * are met: |
13 | * 1. Redistributions of source code must retain the above copyright |
14 | * notice, this list of conditions and the following disclaimer. |
15 | * 2. Redistributions in binary form must reproduce the above copyright |
16 | * notice, this list of conditions and the following disclaimer in the |
17 | * documentation and/or other materials provided with the distribution. |
18 | * |
19 | * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS |
20 | * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED |
21 | * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR |
22 | * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS |
23 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
24 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
25 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
26 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
27 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
28 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE |
29 | * POSSIBILITY OF SUCH DAMAGE. |
30 | */ |
31 | |
32 | /* |
33 | * Lock statistics driver, providing kernel support for the lockstat(8) |
34 | * command. |
35 | * |
36 | * We use a global lock word (lockstat_lock) to track device opens. |
37 | * Only one thread can hold the device at a time, providing a global lock. |
38 | * |
39 | * XXX Timings for contention on sleep locks are currently incorrect. |
40 | */ |
41 | |
42 | #include <sys/cdefs.h> |
43 | __KERNEL_RCSID(0, "$NetBSD: lockstat.c,v 1.24 2015/08/20 14:40:17 christos Exp $" ); |
44 | |
45 | #include <sys/types.h> |
46 | #include <sys/param.h> |
47 | #include <sys/proc.h> |
48 | #include <sys/resourcevar.h> |
49 | #include <sys/systm.h> |
50 | #include <sys/kernel.h> |
51 | #include <sys/kmem.h> |
52 | #include <sys/conf.h> |
53 | #include <sys/cpu.h> |
54 | #include <sys/syslog.h> |
55 | #include <sys/atomic.h> |
56 | |
57 | #include <dev/lockstat.h> |
58 | |
59 | #include <machine/lock.h> |
60 | |
61 | #include "ioconf.h" |
62 | |
63 | #ifndef __HAVE_CPU_COUNTER |
64 | #error CPU counters not available |
65 | #endif |
66 | |
/*
 * Number of low-order key bits that LOCKSTAT_HASH() shifts away before
 * masking: 3 when long is 64 bits wide, 2 when it is 32 bits wide.
 */
#if LONG_BIT == 64
#define LOCKSTAT_HASH_SHIFT 3
#elif LONG_BIT == 32
#define LOCKSTAT_HASH_SHIFT 2
#endif

/*
 * Limits on the number of event buffers requested through
 * IOC_LOCKSTAT_ENABLE.  A request of zero selects LOCKSTAT_DEFBUFS; any
 * other value outside [LOCKSTAT_MINBUFS, LOCKSTAT_MAXBUFS] is rejected.
 */
#define LOCKSTAT_MINBUFS 1000
#define LOCKSTAT_DEFBUFS 10000
#define LOCKSTAT_MAXBUFS 1000000
76 | |
/*
 * Per-CPU hash table geometry.  The size must remain a power of two so that
 * LOCKSTAT_HASH_MASK selects a valid bucket index.
 */
#define	LOCKSTAT_HASH_SIZE	128
#define	LOCKSTAT_HASH_MASK	(LOCKSTAT_HASH_SIZE - 1)

/*
 * Map a key to a bucket index in [0, LOCKSTAT_HASH_SIZE).
 *
 * The argument is parenthesized because callers pass expressions such as
 * "lock ^ callsite": ">>" binds tighter than "^", so without the parentheses
 * only the right-hand operand would be shifted.
 */
#define	LOCKSTAT_HASH(key)	\
	(((key) >> LOCKSTAT_HASH_SHIFT) & LOCKSTAT_HASH_MASK)
81 | |
82 | typedef struct lscpu { |
83 | SLIST_HEAD(, lsbuf) lc_free; |
84 | u_int lc_overflow; |
85 | LIST_HEAD(lslist, lsbuf) lc_hash[LOCKSTAT_HASH_SIZE]; |
86 | } lscpu_t; |
87 | |
88 | typedef struct lslist lslist_t; |
89 | |
/* Tracing control helpers, defined later in this file. */
void lockstat_start(lsenable_t *);
int lockstat_alloc(lsenable_t *);
void lockstat_init_tables(lsenable_t *);
int lockstat_stop(lsdisable_t *);
void lockstat_free(void);

/* Character device entry points referenced by lockstat_cdevsw. */
dev_type_open(lockstat_open);
dev_type_close(lockstat_close);
dev_type_read(lockstat_read);
dev_type_ioctl(lockstat_ioctl);
100 | |
/*
 * Not written directly in this file; presumably maintained by
 * LOCKSTAT_ENABLED_UPDATE() -- TODO confirm against <dev/lockstat.h>.
 */
volatile u_int lockstat_enabled;
/* Event mask taken from le_mask while tracing; zero while stopped. */
volatile u_int lockstat_dev_enabled;
/* Inclusive callsite address range accepted by lockstat_event(). */
uintptr_t lockstat_csstart;
uintptr_t lockstat_csend;
/* ANDed with the callsite / lock address before lookup: all-ones or zero. */
uintptr_t lockstat_csmask;
uintptr_t lockstat_lamask;
/* Inclusive lock address range accepted by lockstat_event(). */
uintptr_t lockstat_lockstart;
uintptr_t lockstat_lockend;
/* Taken in lockstat_open() and released in lockstat_close(). */
__cpu_simple_lock_t lockstat_lock;
/* LWP that opened the device; NULL after close. */
lwp_t *lockstat_lwp;
/* Base and size in bytes of the event buffer array. */
lsbuf_t *lockstat_baseb;
size_t lockstat_sizeb;
/* NOTE(review): not referenced anywhere in this file. */
int lockstat_busy;
/* Time at which tracing was last started. */
struct timespec lockstat_stime;
115 | |
#ifdef KDTRACE_HOOKS
/*
 * NOTE(review): not referenced elsewhere in this file; presumably set by
 * the probe provider -- TODO confirm.
 */
volatile u_int lockstat_dtrace_enabled;
/* Compile-time checks on the event and lock type encodings. */
CTASSERT(LB_NEVENT <= 3);
CTASSERT(LB_NLOCK <= (7 << LB_LOCK_SHIFT));
/*
 * Default probe handler: takes the probe arguments and does nothing.
 */
void
lockstat_probe_stub(uint32_t id, uintptr_t lock, uintptr_t callsite,
    uintptr_t flags, uintptr_t count, uintptr_t cycles)
{
}

/*
 * Probe id for each LS_COMPRESS()ed flags value; lockstat_event() only
 * calls the handler for entries that are non-zero.
 */
uint32_t lockstat_probemap[LS_NPROBES];
/* Handler called by lockstat_event(); initially the do-nothing stub. */
void (*lockstat_probe_func)(uint32_t, uintptr_t, uintptr_t,
    uintptr_t, uintptr_t, uintptr_t) = &lockstat_probe_stub;
#endif
130 | |
/*
 * Character device switch for the lockstat device.  Only open, close, read
 * and ioctl are implemented by this driver; every other entry point is
 * filled with the corresponding no* placeholder.
 */
const struct cdevsw lockstat_cdevsw = {
	.d_open = lockstat_open,
	.d_close = lockstat_close,
	.d_read = lockstat_read,
	.d_write = nowrite,
	.d_ioctl = lockstat_ioctl,
	.d_stop = nostop,
	.d_tty = notty,
	.d_poll = nopoll,
	.d_mmap = nommap,
	.d_kqfilter = nokqfilter,
	.d_discard = nodiscard,
	.d_flag = D_OTHER | D_MPSAFE
};
145 | |
146 | /* |
147 | * Called when the pseudo-driver is attached. |
148 | */ |
149 | void |
150 | lockstatattach(int nunits) |
151 | { |
152 | |
153 | (void)nunits; |
154 | |
155 | __cpu_simple_lock_init(&lockstat_lock); |
156 | } |
157 | |
158 | /* |
159 | * Prepare the per-CPU tables for use, or clear down tables when tracing is |
160 | * stopped. |
161 | */ |
162 | void |
163 | lockstat_init_tables(lsenable_t *le) |
164 | { |
165 | int i, per, slop, cpuno; |
166 | CPU_INFO_ITERATOR cii; |
167 | struct cpu_info *ci; |
168 | lscpu_t *lc; |
169 | lsbuf_t *lb; |
170 | |
171 | /* coverity[assert_side_effect] */ |
172 | KASSERT(!lockstat_dev_enabled); |
173 | |
174 | for (CPU_INFO_FOREACH(cii, ci)) { |
175 | if (ci->ci_lockstat != NULL) { |
176 | kmem_free(ci->ci_lockstat, sizeof(lscpu_t)); |
177 | ci->ci_lockstat = NULL; |
178 | } |
179 | } |
180 | |
181 | if (le == NULL) |
182 | return; |
183 | |
184 | lb = lockstat_baseb; |
185 | per = le->le_nbufs / ncpu; |
186 | slop = le->le_nbufs - (per * ncpu); |
187 | cpuno = 0; |
188 | for (CPU_INFO_FOREACH(cii, ci)) { |
189 | lc = kmem_alloc(sizeof(*lc), KM_SLEEP); |
190 | lc->lc_overflow = 0; |
191 | ci->ci_lockstat = lc; |
192 | |
193 | SLIST_INIT(&lc->lc_free); |
194 | for (i = 0; i < LOCKSTAT_HASH_SIZE; i++) |
195 | LIST_INIT(&lc->lc_hash[i]); |
196 | |
197 | for (i = per; i != 0; i--, lb++) { |
198 | lb->lb_cpu = (uint16_t)cpuno; |
199 | SLIST_INSERT_HEAD(&lc->lc_free, lb, lb_chain.slist); |
200 | } |
201 | if (--slop > 0) { |
202 | lb->lb_cpu = (uint16_t)cpuno; |
203 | SLIST_INSERT_HEAD(&lc->lc_free, lb, lb_chain.slist); |
204 | lb++; |
205 | } |
206 | cpuno++; |
207 | } |
208 | } |
209 | |
210 | /* |
211 | * Start collecting lock statistics. |
212 | */ |
213 | void |
214 | lockstat_start(lsenable_t *le) |
215 | { |
216 | |
217 | /* coverity[assert_side_effect] */ |
218 | KASSERT(!lockstat_dev_enabled); |
219 | |
220 | lockstat_init_tables(le); |
221 | |
222 | if ((le->le_flags & LE_CALLSITE) != 0) |
223 | lockstat_csmask = (uintptr_t)-1LL; |
224 | else |
225 | lockstat_csmask = 0; |
226 | |
227 | if ((le->le_flags & LE_LOCK) != 0) |
228 | lockstat_lamask = (uintptr_t)-1LL; |
229 | else |
230 | lockstat_lamask = 0; |
231 | |
232 | lockstat_csstart = le->le_csstart; |
233 | lockstat_csend = le->le_csend; |
234 | lockstat_lockstart = le->le_lockstart; |
235 | lockstat_lockstart = le->le_lockstart; |
236 | lockstat_lockend = le->le_lockend; |
237 | membar_sync(); |
238 | getnanotime(&lockstat_stime); |
239 | lockstat_dev_enabled = le->le_mask; |
240 | LOCKSTAT_ENABLED_UPDATE(); |
241 | } |
242 | |
243 | /* |
244 | * Stop collecting lock statistics. |
245 | */ |
246 | int |
247 | lockstat_stop(lsdisable_t *ld) |
248 | { |
249 | CPU_INFO_ITERATOR cii; |
250 | struct cpu_info *ci; |
251 | u_int cpuno, overflow; |
252 | struct timespec ts; |
253 | int error; |
254 | lwp_t *l; |
255 | |
256 | /* coverity[assert_side_effect] */ |
257 | KASSERT(lockstat_dev_enabled); |
258 | |
259 | /* |
260 | * Set enabled false, force a write barrier, and wait for other CPUs |
261 | * to exit lockstat_event(). |
262 | */ |
263 | lockstat_dev_enabled = 0; |
264 | LOCKSTAT_ENABLED_UPDATE(); |
265 | getnanotime(&ts); |
266 | tsleep(&lockstat_stop, PPAUSE, "lockstat" , mstohz(10)); |
267 | |
268 | /* |
269 | * Did we run out of buffers while tracing? |
270 | */ |
271 | overflow = 0; |
272 | for (CPU_INFO_FOREACH(cii, ci)) |
273 | overflow += ((lscpu_t *)ci->ci_lockstat)->lc_overflow; |
274 | |
275 | if (overflow != 0) { |
276 | error = EOVERFLOW; |
277 | log(LOG_NOTICE, "lockstat: %d buffer allocations failed\n" , |
278 | overflow); |
279 | } else |
280 | error = 0; |
281 | |
282 | lockstat_init_tables(NULL); |
283 | |
284 | /* Run through all LWPs and clear the slate for the next run. */ |
285 | mutex_enter(proc_lock); |
286 | LIST_FOREACH(l, &alllwp, l_list) { |
287 | l->l_pfailaddr = 0; |
288 | l->l_pfailtime = 0; |
289 | l->l_pfaillock = 0; |
290 | } |
291 | mutex_exit(proc_lock); |
292 | |
293 | if (ld == NULL) |
294 | return error; |
295 | |
296 | /* |
297 | * Fill out the disable struct for the caller. |
298 | */ |
299 | timespecsub(&ts, &lockstat_stime, &ld->ld_time); |
300 | ld->ld_size = lockstat_sizeb; |
301 | |
302 | cpuno = 0; |
303 | for (CPU_INFO_FOREACH(cii, ci)) { |
304 | if (cpuno >= sizeof(ld->ld_freq) / sizeof(ld->ld_freq[0])) { |
305 | log(LOG_WARNING, "lockstat: too many CPUs\n" ); |
306 | break; |
307 | } |
308 | ld->ld_freq[cpuno++] = cpu_frequency(ci); |
309 | } |
310 | |
311 | return error; |
312 | } |
313 | |
314 | /* |
315 | * Allocate buffers for lockstat_start(). |
316 | */ |
317 | int |
318 | lockstat_alloc(lsenable_t *le) |
319 | { |
320 | lsbuf_t *lb; |
321 | size_t sz; |
322 | |
323 | /* coverity[assert_side_effect] */ |
324 | KASSERT(!lockstat_dev_enabled); |
325 | lockstat_free(); |
326 | |
327 | sz = sizeof(*lb) * le->le_nbufs; |
328 | |
329 | lb = kmem_zalloc(sz, KM_SLEEP); |
330 | if (lb == NULL) |
331 | return (ENOMEM); |
332 | |
333 | /* coverity[assert_side_effect] */ |
334 | KASSERT(!lockstat_dev_enabled); |
335 | KASSERT(lockstat_baseb == NULL); |
336 | lockstat_sizeb = sz; |
337 | lockstat_baseb = lb; |
338 | |
339 | return (0); |
340 | } |
341 | |
342 | /* |
343 | * Free allocated buffers after tracing has stopped. |
344 | */ |
345 | void |
346 | lockstat_free(void) |
347 | { |
348 | |
349 | /* coverity[assert_side_effect] */ |
350 | KASSERT(!lockstat_dev_enabled); |
351 | |
352 | if (lockstat_baseb != NULL) { |
353 | kmem_free(lockstat_baseb, lockstat_sizeb); |
354 | lockstat_baseb = NULL; |
355 | } |
356 | } |
357 | |
358 | /* |
359 | * Main entry point from lock primatives. |
360 | */ |
361 | void |
362 | lockstat_event(uintptr_t lock, uintptr_t callsite, u_int flags, u_int count, |
363 | uint64_t cycles) |
364 | { |
365 | lslist_t *ll; |
366 | lscpu_t *lc; |
367 | lsbuf_t *lb; |
368 | u_int event; |
369 | int s; |
370 | |
371 | #ifdef KDTRACE_HOOKS |
372 | uint32_t id; |
373 | CTASSERT((LS_NPROBES & (LS_NPROBES - 1)) == 0); |
374 | if ((id = lockstat_probemap[LS_COMPRESS(flags)]) != 0) |
375 | (*lockstat_probe_func)(id, lock, callsite, flags, count, |
376 | cycles); |
377 | #endif |
378 | |
379 | if ((flags & lockstat_dev_enabled) != flags || count == 0) |
380 | return; |
381 | if (lock < lockstat_lockstart || lock > lockstat_lockend) |
382 | return; |
383 | if (callsite < lockstat_csstart || callsite > lockstat_csend) |
384 | return; |
385 | |
386 | callsite &= lockstat_csmask; |
387 | lock &= lockstat_lamask; |
388 | |
389 | /* |
390 | * Find the table for this lock+callsite pair, and try to locate a |
391 | * buffer with the same key. |
392 | */ |
393 | s = splhigh(); |
394 | lc = curcpu()->ci_lockstat; |
395 | ll = &lc->lc_hash[LOCKSTAT_HASH(lock ^ callsite)]; |
396 | event = (flags & LB_EVENT_MASK) - 1; |
397 | |
398 | LIST_FOREACH(lb, ll, lb_chain.list) { |
399 | if (lb->lb_lock == lock && lb->lb_callsite == callsite) |
400 | break; |
401 | } |
402 | |
403 | if (lb != NULL) { |
404 | /* |
405 | * We found a record. Move it to the front of the list, as |
406 | * we're likely to hit it again soon. |
407 | */ |
408 | if (lb != LIST_FIRST(ll)) { |
409 | LIST_REMOVE(lb, lb_chain.list); |
410 | LIST_INSERT_HEAD(ll, lb, lb_chain.list); |
411 | } |
412 | lb->lb_counts[event] += count; |
413 | lb->lb_times[event] += cycles; |
414 | } else if ((lb = SLIST_FIRST(&lc->lc_free)) != NULL) { |
415 | /* |
416 | * Pinch a new buffer and fill it out. |
417 | */ |
418 | SLIST_REMOVE_HEAD(&lc->lc_free, lb_chain.slist); |
419 | LIST_INSERT_HEAD(ll, lb, lb_chain.list); |
420 | lb->lb_flags = (uint16_t)flags; |
421 | lb->lb_lock = lock; |
422 | lb->lb_callsite = callsite; |
423 | lb->lb_counts[event] = count; |
424 | lb->lb_times[event] = cycles; |
425 | } else { |
426 | /* |
427 | * We didn't find a buffer and there were none free. |
428 | * lockstat_stop() will notice later on and report the |
429 | * error. |
430 | */ |
431 | lc->lc_overflow++; |
432 | } |
433 | |
434 | splx(s); |
435 | } |
436 | |
437 | /* |
438 | * Accept an open() on /dev/lockstat. |
439 | */ |
440 | int |
441 | lockstat_open(dev_t dev, int flag, int mode, lwp_t *l) |
442 | { |
443 | |
444 | if (!__cpu_simple_lock_try(&lockstat_lock)) |
445 | return EBUSY; |
446 | lockstat_lwp = curlwp; |
447 | return 0; |
448 | } |
449 | |
450 | /* |
451 | * Accept the last close() on /dev/lockstat. |
452 | */ |
453 | int |
454 | lockstat_close(dev_t dev, int flag, int mode, lwp_t *l) |
455 | { |
456 | |
457 | lockstat_lwp = NULL; |
458 | __cpu_simple_unlock(&lockstat_lock); |
459 | return 0; |
460 | } |
461 | |
462 | /* |
463 | * Handle control operations. |
464 | */ |
465 | int |
466 | lockstat_ioctl(dev_t dev, u_long cmd, void *data, int flag, lwp_t *l) |
467 | { |
468 | lsenable_t *le; |
469 | int error; |
470 | |
471 | if (lockstat_lwp != curlwp) |
472 | return EBUSY; |
473 | |
474 | switch (cmd) { |
475 | case IOC_LOCKSTAT_GVERSION: |
476 | *(int *)data = LS_VERSION; |
477 | error = 0; |
478 | break; |
479 | |
480 | case IOC_LOCKSTAT_ENABLE: |
481 | le = (lsenable_t *)data; |
482 | |
483 | if (!cpu_hascounter()) { |
484 | error = ENODEV; |
485 | break; |
486 | } |
487 | if (lockstat_dev_enabled) { |
488 | error = EBUSY; |
489 | break; |
490 | } |
491 | |
492 | /* |
493 | * Sanitize the arguments passed in and set up filtering. |
494 | */ |
495 | if (le->le_nbufs == 0) |
496 | le->le_nbufs = LOCKSTAT_DEFBUFS; |
497 | else if (le->le_nbufs > LOCKSTAT_MAXBUFS || |
498 | le->le_nbufs < LOCKSTAT_MINBUFS) { |
499 | error = EINVAL; |
500 | break; |
501 | } |
502 | if ((le->le_flags & LE_ONE_CALLSITE) == 0) { |
503 | le->le_csstart = 0; |
504 | le->le_csend = le->le_csstart - 1; |
505 | } |
506 | if ((le->le_flags & LE_ONE_LOCK) == 0) { |
507 | le->le_lockstart = 0; |
508 | le->le_lockend = le->le_lockstart - 1; |
509 | } |
510 | if ((le->le_mask & LB_EVENT_MASK) == 0) |
511 | return EINVAL; |
512 | if ((le->le_mask & LB_LOCK_MASK) == 0) |
513 | return EINVAL; |
514 | |
515 | /* |
516 | * Start tracing. |
517 | */ |
518 | if ((error = lockstat_alloc(le)) == 0) |
519 | lockstat_start(le); |
520 | break; |
521 | |
522 | case IOC_LOCKSTAT_DISABLE: |
523 | if (!lockstat_dev_enabled) |
524 | error = EINVAL; |
525 | else |
526 | error = lockstat_stop((lsdisable_t *)data); |
527 | break; |
528 | |
529 | default: |
530 | error = ENOTTY; |
531 | break; |
532 | } |
533 | |
534 | return error; |
535 | } |
536 | |
537 | /* |
538 | * Copy buffers out to user-space. |
539 | */ |
540 | int |
541 | lockstat_read(dev_t dev, struct uio *uio, int flag) |
542 | { |
543 | |
544 | if (curlwp != lockstat_lwp || lockstat_dev_enabled) |
545 | return EBUSY; |
546 | return uiomove(lockstat_baseb, lockstat_sizeb, uio); |
547 | } |
548 | |