Index: share/man/man9/sleepqueue.9 =================================================================== --- share/man/man9/sleepqueue.9 (revision 202146) +++ share/man/man9/sleepqueue.9 (working copy) @@ -23,7 +23,7 @@ .\" .\" $FreeBSD$ .\" -.Dd December 12, 2009 +.Dd January 11, 2010 .Dt SLEEPQUEUE 9 .Os .Sh NAME @@ -41,6 +41,7 @@ .Nm sleepq_remove , .Nm sleepq_signal , .Nm sleepq_set_timeout , +.Nm sleepq_sleepcnt , .Nm sleepq_timedwait , .Nm sleepq_timedwait_sig , .Nm sleepq_wait , @@ -77,6 +78,8 @@ .Fn sleepq_signal "void *wchan" "int flags" "int pri" "int queue" .Ft void .Fn sleepq_set_timeout "void *wchan" "int timo" +.Ft u_int +.Fn sleepq_sleepcnt "void *wchan" "int queue" .Ft int .Fn sleepq_timedwait "void *wchan" .Ft int @@ -355,6 +358,14 @@ channel. .Pp The +.Fn sleepq_sleepcnt +function offer a simple way to retrieve the number of threads sleeping for +the specified +.Fa queue , +given a +.Fa wchan . +.Pp +The .Fn sleepq_abort , .Fn sleepq_broadcast , and Index: sys/kern/subr_sleepqueue.c =================================================================== --- sys/kern/subr_sleepqueue.c (revision 202146) +++ sys/kern/subr_sleepqueue.c (working copy) @@ -118,6 +118,7 @@ */ struct sleepqueue { TAILQ_HEAD(, thread) sq_blocked[NR_SLEEPQS]; /* (c) Blocked threads. */ + u_int sq_blockedcnt[NR_SLEEPQS]; /* (c) N. of blocked threads. */ LIST_ENTRY(sleepqueue) sq_hash; /* (c) Chain and free list. */ LIST_HEAD(, sleepqueue) sq_free; /* (c) Free queues. */ void *sq_wchan; /* (c) Wait channel. */ @@ -306,9 +307,12 @@ int i; sq = td->td_sleepqueue; - for (i = 0; i < NR_SLEEPQS; i++) + for (i = 0; i < NR_SLEEPQS; i++) { KASSERT(TAILQ_EMPTY(&sq->sq_blocked[i]), - ("thread's sleep queue %d is not empty", i)); + ("thread's sleep queue %d is not empty", i)); + KASSERT(sq->sq_blockedcnt[i] == 0, + ("thread's sleep queue %d count mismatches", i)); + } KASSERT(LIST_EMPTY(&sq->sq_free), ("thread's sleep queue has a non-empty free list")); KASSERT(sq->sq_wchan == NULL, ("stale sq_wchan pointer")); @@ -334,6 +338,7 @@ } thread_lock(td); TAILQ_INSERT_TAIL(&sq->sq_blocked[queue], td, td_slpq); + sq->sq_blockedcnt[queue]++; td->td_sleepqueue = NULL; td->td_sqqueue = queue; td->td_wchan = wchan; @@ -367,6 +372,22 @@ } /* + * Return the number of actual sleepers for the specified queue. + */ +u_int +sleepq_sleepcnt(void *wchan, int queue) +{ + struct sleepqueue *sq; + + KASSERT(wchan != NULL, ("%s: invalid NULL wait channel", __func__)); + MPASS((queue >= 0) && (queue < NR_SLEEPQS)); + sq = sleepq_lookup(wchan); + if (sq == NULL) + return (0); + return (sq->sq_blockedcnt[queue]); +} + +/* * Marks the pending sleep of the current thread as interruptible and * makes an initial check for pending signals before putting a thread * to sleep. Enters and exits with the thread lock held. Thread lock @@ -665,6 +686,7 @@ mtx_assert(&sc->sc_lock, MA_OWNED); /* Remove the thread from the queue. */ + sq->sq_blockedcnt[td->td_sqqueue]--; TAILQ_REMOVE(&sq->sq_blocked[td->td_sqqueue], td, td_slpq); /* @@ -720,8 +742,10 @@ int i; sq = mem; - for (i = 0; i < NR_SLEEPQS; i++) + for (i = 0; i < NR_SLEEPQS; i++) { MPASS(TAILQ_EMPTY(&sq->sq_blocked[i])); + MPASS(sq->sq_blockedcnt[i] == 0); + } } #endif @@ -736,8 +760,10 @@ bzero(mem, size); sq = mem; - for (i = 0; i < NR_SLEEPQS; i++) + for (i = 0; i < NR_SLEEPQS; i++) { TAILQ_INIT(&sq->sq_blocked[i]); + sq->sq_blockedcnt[i] = 0; + } LIST_INIT(&sq->sq_free); return (0); } @@ -1170,6 +1196,7 @@ td->td_tid, td->td_proc->p_pid, td->td_name); } + db_printf("(expected: %u)\n", sq->sq_blockedcnt[i]); } } Index: sys/kern/kern_sx.c =================================================================== --- sys/kern/kern_sx.c (revision 202146) +++ sys/kern/kern_sx.c (working copy) @@ -702,8 +702,12 @@ * ideal. It gives precedence to shared waiters if they are * present. For this condition, we have to preserve the * state of the exclusive waiters flag. + * If interruptible sleeps left the shared queue empty avoid a + * starvation for the threads sleeping on the exclusive queue by giving + * them precedence and cleaning up the shared waiters bit anyway. */ - if (sx->sx_lock & SX_LOCK_SHARED_WAITERS) { + if ((sx->sx_lock & SX_LOCK_SHARED_WAITERS) != 0 && + sleepq_sleepcnt(&sx->lock_object, SQ_SHARED_QUEUE) != 0) { queue = SQ_SHARED_QUEUE; x |= (sx->sx_lock & SX_LOCK_EXCLUSIVE_WAITERS); } else Index: sys/kern/kern_lock.c =================================================================== --- sys/kern/kern_lock.c (revision 202146) +++ sys/kern/kern_lock.c (working copy) @@ -54,8 +54,8 @@ #include #endif -CTASSERT(((LK_ADAPTIVE | LK_NOSHARE) & LO_CLASSFLAGS) == - (LK_ADAPTIVE | LK_NOSHARE)); +CTASSERT(((LK_ADAPTIVE | LK_EXSLPFAIL | LK_NOSHARE) & LO_CLASSFLAGS) == + (LK_ADAPTIVE | LK_EXSLPFAIL | LK_NOSHARE)); CTASSERT(LK_UNLOCKED == (LK_UNLOCKED & ~(LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS))); @@ -194,6 +194,14 @@ if (flags & LK_INTERLOCK) class->lc_unlock(ilk); + + /* + * LK_EXSLPFAIL is not invariant during the lock pattern but it is + * always protected by the sleepqueue spinlock, thus it is safe to + * handle within the lo_flags. + */ + if (queue == SQ_EXCLUSIVE_QUEUE && (flags & LK_SLEEPFAIL) != 0) + lk->lock_object.lo_flags |= LK_EXSLPFAIL; GIANT_SAVE(); sleepq_add(&lk->lock_object, NULL, wmesg, SLEEPQ_LK | (catch ? SLEEPQ_INTERRUPTIBLE : 0), queue); @@ -222,6 +230,7 @@ wakeupshlk(struct lock *lk, const char *file, int line) { uintptr_t v, x; + u_int realexslp; int queue, wakeup_swapper; TD_LOCKS_DEC(curthread); @@ -267,13 +276,45 @@ /* * If the lock has exclusive waiters, give them preference in * order to avoid deadlock with shared runners up. + * If interruptible sleeps left the exclusive queue empty + * avoid a starvation for the threads sleeping on the shared + * queue by giving them precedence and cleaning up the + * exclusive waiters bit anyway. + * Please note that the LK_EXSLPFAIL flag may be lying about + * the real presence of waiters with the LK_SLEEPFAIL flag on + * because they may be used in conjuction with interruptible + * sleeps. */ - if (x & LK_EXCLUSIVE_WAITERS) { - queue = SQ_EXCLUSIVE_QUEUE; - v |= (x & LK_SHARED_WAITERS); + realexslp = sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { + if ((lk->lock_object.lo_flags & LK_EXSLPFAIL) == 0) { + lk->lock_object.lo_flags &= ~LK_EXSLPFAIL; + queue = SQ_EXCLUSIVE_QUEUE; + v |= (x & LK_SHARED_WAITERS); + } else { + lk->lock_object.lo_flags &= ~LK_EXSLPFAIL; + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast(&lk->lock_object, + SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); + queue = SQ_SHARED_QUEUE; + } + } else { - MPASS((x & ~LK_EXCLUSIVE_SPINNERS) == - LK_SHARED_WAITERS); + + /* + * Exclusive waiters sleeping with LK_SLEEPFAIL on + * and using interruptible sleeps/timeout may have + * left spourious LK_EXSLPFAIL flag on, so clean + * it up anyway. + */ + lk->lock_object.lo_flags &= ~LK_EXSLPFAIL; queue = SQ_SHARED_QUEUE; } @@ -285,7 +326,7 @@ LOCK_LOG3(lk, "%s: %p waking up threads on the %s queue", __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); - wakeup_swapper = sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, + wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; @@ -362,6 +403,8 @@ KASSERT(lk->lk_lock == LK_UNLOCKED, ("lockmgr still held")); KASSERT(lk->lk_recurse == 0, ("lockmgr still recursed")); + KASSERT((lk->lock_object.lo_flags & LK_EXSLPFAIL) == 0, + ("lockmgr still exclusive waiters")); lock_destroy(&lk->lock_object); } @@ -373,7 +416,7 @@ struct lock_class *class; const char *iwmesg; uintptr_t tid, v, x; - u_int op; + u_int op, realexslp; int error, ipri, itimo, queue, wakeup_swapper; #ifdef LOCK_PROFILING uint64_t waittime = 0; @@ -903,14 +946,48 @@ * If the lock has exclusive waiters, give them * preference in order to avoid deadlock with * shared runners up. + * If interruptible sleeps left the exclusive queue + * empty avoid a starvation for the threads sleeping + * on the shared queue by giving them precedence + * and cleaning up the exclusive waiters bit anyway. + * Please note that the LK_EXSLPFAIL flag may be lying + * about the real presence of waiters with the + * LK_SLEEPFAIL flag on because they may be used in + * conjuction with interruptible sleeps. */ MPASS((x & LK_EXCLUSIVE_SPINNERS) == 0); - if (x & LK_EXCLUSIVE_WAITERS) { - queue = SQ_EXCLUSIVE_QUEUE; - v |= (x & LK_SHARED_WAITERS); + realexslp = sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if ((x & LK_EXCLUSIVE_WAITERS) != 0 && realexslp != 0) { + if ((lk->lock_object.lo_flags & + LK_EXSLPFAIL) == 0) { + lk->lock_object.lo_flags &= + ~LK_EXSLPFAIL; + queue = SQ_EXCLUSIVE_QUEUE; + v |= (x & LK_SHARED_WAITERS); + } else { + lk->lock_object.lo_flags &= + ~LK_EXSLPFAIL; + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast(&lk->lock_object, + SLEEPQ_LK, 0, SQ_EXCLUSIVE_QUEUE); + queue = SQ_SHARED_QUEUE; + } } else { - MPASS((x & LK_ALL_WAITERS) == - LK_SHARED_WAITERS); + + /* + * Exclusive waiters sleeping with LK_SLEEPFAIL + * on and using interruptible sleeps/timeout + * may have left spourious LK_EXSLPFAIL flag + * on, so clean it up anyway. + */ + lk->lock_object.lo_flags &= ~LK_EXSLPFAIL; queue = SQ_SHARED_QUEUE; } @@ -919,7 +996,7 @@ __func__, lk, queue == SQ_SHARED_QUEUE ? "shared" : "exclusive"); atomic_store_rel_ptr(&lk->lk_lock, v); - wakeup_swapper = sleepq_broadcast(&lk->lock_object, + wakeup_swapper |= sleepq_broadcast(&lk->lock_object, SLEEPQ_LK, 0, queue); sleepq_release(&lk->lock_object); break; @@ -976,14 +1053,64 @@ v = x & (LK_ALL_WAITERS | LK_EXCLUSIVE_SPINNERS); if ((x & ~v) == LK_UNLOCKED) { v = (x & ~LK_EXCLUSIVE_SPINNERS); + + /* + * If interruptible sleeps left the exclusive + * queue empty avoid a starvation for the + * threads sleeping on the shared queue by + * giving them precedence and cleaning up the + * exclusive waiters bit anyway. + * Please note that the LK_EXSLPFAIL flag may + * be lying about the real presence of waiters + * with the LK_SLEEPFAIL flag on because they + * may be used in conjuction with interruptible + * sleeps. + */ if (v & LK_EXCLUSIVE_WAITERS) { queue = SQ_EXCLUSIVE_QUEUE; v &= ~LK_EXCLUSIVE_WAITERS; } else { + + /* + * Exclusive waiters sleeping with + * LK_SLEEPFAIL on and using + * interruptible sleeps/timeout may + * have left spourious LK_EXSLPFAIL + * flag on, so clean it up anyway. + */ MPASS(v & LK_SHARED_WAITERS); + lk->lock_object.lo_flags &= + ~LK_EXSLPFAIL; queue = SQ_SHARED_QUEUE; v &= ~LK_SHARED_WAITERS; } + if (queue == SQ_EXCLUSIVE_QUEUE) { + realexslp = + sleepq_sleepcnt(&lk->lock_object, + SQ_EXCLUSIVE_QUEUE); + if ((lk->lock_object.lo_flags & + LK_EXSLPFAIL) == 0) { + lk->lock_object.lo_flags &= + ~LK_EXSLPFAIL; + queue = SQ_SHARED_QUEUE; + v &= ~LK_SHARED_WAITERS; + if (realexslp != 0) { + LOCK_LOG2(lk, + "%s: %p has only LK_SLEEPFAIL sleepers", + __func__, lk); + LOCK_LOG2(lk, + "%s: %p waking up threads on the exclusive queue", + __func__, lk); + wakeup_swapper = + sleepq_broadcast( + &lk->lock_object, + SLEEPQ_LK, 0, + SQ_EXCLUSIVE_QUEUE); + } + } else + lk->lock_object.lo_flags &= + ~LK_EXSLPFAIL; + } if (!atomic_cmpset_ptr(&lk->lk_lock, x, v)) { sleepq_release(&lk->lock_object); continue; Index: sys/sys/lockmgr.h =================================================================== --- sys/sys/lockmgr.h (revision 202146) +++ sys/sys/lockmgr.h (working copy) @@ -144,6 +144,9 @@ #define LK_QUIET 0x000020 #define LK_ADAPTIVE 0x000040 +/* LK_EXSLPFAIL to follow, even if not used in lockinit() */ +#define LK_EXSLPFAIL 0x000080 + /* * Additional attributes to be used in lockmgr(). */ Index: sys/sys/sleepqueue.h =================================================================== --- sys/sys/sleepqueue.h (revision 202146) +++ sys/sys/sleepqueue.h (working copy) @@ -109,6 +109,7 @@ void sleepq_remove(struct thread *td, void *wchan); int sleepq_signal(void *wchan, int flags, int pri, int queue); void sleepq_set_timeout(void *wchan, int timo); +u_int sleepq_sleepcnt(void *wchan, int queue); int sleepq_timedwait(void *wchan, int pri); int sleepq_timedwait_sig(void *wchan, int pri); void sleepq_wait(void *wchan, int pri);