--- //depot/projects/smpng/sys/dev/e1000/if_igb.c 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/dev/e1000/if_igb.c 2010-07-08 19:19:45.000000000 0000 @@ -46,6 +46,9 @@ #endif #include #include +#if __FreeBSD_version >= 900000 +#include +#endif #include #include #include @@ -155,13 +158,15 @@ static int igb_shutdown(device_t); static int igb_suspend(device_t); static int igb_resume(device_t); -static void igb_start(struct ifnet *); -static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #if __FreeBSD_version >= 800000 static int igb_mq_start(struct ifnet *, struct mbuf *); static int igb_mq_start_locked(struct ifnet *, struct tx_ring *, struct mbuf *); static void igb_qflush(struct ifnet *); +static void igb_deferred_mq_start(void *, int); +#else +static void igb_start(struct ifnet *); +static void igb_start_locked(struct tx_ring *, struct ifnet *ifp); #endif static int igb_ioctl(struct ifnet *, u_long, caddr_t); static void igb_init(void *); @@ -238,8 +243,13 @@ static int igb_irq_fast(void *); static void igb_add_rx_process_limit(struct adapter *, const char *, const char *, int *, int); +#if __FreeBSD_version < 900000 static void igb_handle_que(void *context, int pending); static void igb_handle_link(void *context, int pending); +#else +static void igb_handle_que(void *); +static void igb_handle_link(void *); +#endif /* These are MSIX only irq handlers */ static void igb_msix_que(void *); @@ -623,6 +633,8 @@ return (EBUSY); } + ether_ifdetach(adapter->ifp); + if (adapter->led_dev != NULL) led_destroy(adapter->led_dev); @@ -654,8 +666,6 @@ if (adapter->vlan_detach != NULL) EVENTHANDLER_DEREGISTER(vlan_unconfig, adapter->vlan_detach); - ether_ifdetach(adapter->ifp); - callout_drain(&adapter->timer); igb_free_pci_resources(adapter); @@ -713,14 +723,27 @@ { struct adapter *adapter = device_get_softc(dev); struct ifnet *ifp = adapter->ifp; +#if __FreeBSD_version >= 800000 + struct tx_ring *txr = adapter->tx_rings; +#endif IGB_CORE_LOCK(adapter); igb_init_locked(adapter); igb_init_manageability(adapter); if ((ifp->if_flags & IFF_UP) && - (ifp->if_drv_flags & IFF_DRV_RUNNING)) + (ifp->if_drv_flags & IFF_DRV_RUNNING)) { +#if __FreeBSD_version < 800000 igb_start(ifp); +#else + for (int i = 0; i < adapter->num_queues; i++, txr++) { + IGB_TX_LOCK(txr); + if (!drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr, NULL); + IGB_TX_UNLOCK(txr); + } +#endif + } IGB_CORE_UNLOCK(adapter); @@ -728,6 +751,7 @@ } +#if __FreeBSD_version < 800000 /********************************************************************* * Transmit entry point * @@ -804,7 +828,7 @@ return; } -#if __FreeBSD_version >= 800000 +#else /* __FreeBSD_version >= 800000 */ /* ** Multiqueue Transmit driver ** @@ -829,7 +853,7 @@ IGB_TX_UNLOCK(txr); } else { err = drbr_enqueue(ifp, txr->br, m); - taskqueue_enqueue(que->tq, &que->que_task); + taskqueue_enqueue(que->tq, &txr->txq_task); } return (err); @@ -892,6 +916,22 @@ } /* + * Called from a taskqueue to drain queued transmit packets. 
+ */ +static void +igb_deferred_mq_start(void *arg, int pending) +{ + struct tx_ring *txr = arg; + struct adapter *adapter = txr->adapter; + struct ifnet *ifp = adapter->ifp; + + IGB_TX_LOCK(txr); + if (!drbr_empty(ifp, txr->br)) + igb_mq_start_locked(ifp, txr, NULL); + IGB_TX_UNLOCK(txr); +} + +/* ** Flush all ring buffers */ static void @@ -909,7 +949,7 @@ } if_qflush(ifp); } -#endif /* __FreeBSD_version >= 800000 */ +#endif /* __FreeBSD_version < 800000 */ /********************************************************************* * Ioctl entry point @@ -1221,9 +1261,13 @@ IGB_CORE_UNLOCK(adapter); } - +#if __FreeBSD_version < 900000 static void igb_handle_que(void *context, int pending) +#else +static void +igb_handle_que(void *context) +#endif { struct igb_queue *que = context; struct adapter *adapter = que->adapter; @@ -1247,7 +1291,11 @@ #endif IGB_TX_UNLOCK(txr); if (more) { +#if __FreeBSD_version < 900000 taskqueue_enqueue(que->tq, &que->que_task); +#else + hwi_sched(que->tag); +#endif return; } } @@ -1264,8 +1312,13 @@ } /* Deal with link in a sleepable context */ +#if __FreeBSD_version < 900000 static void igb_handle_link(void *context, int pending) +#else +static void +igb_handle_link(void *context) +#endif { struct adapter *adapter = context; @@ -1283,7 +1336,9 @@ igb_irq_fast(void *arg) { struct adapter *adapter = arg; +#if __FreeBSD_version < 900000 struct igb_queue *que = adapter->queues; +#endif u32 reg_icr; @@ -1306,15 +1361,25 @@ * MSI message reordering errata on certain systems. */ igb_disable_intr(adapter); +#if __FreeBSD_version < 900000 taskqueue_enqueue(que->tq, &que->que_task); +#endif /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) +#if __FreeBSD_version < 900000 taskqueue_enqueue(que->tq, &adapter->link_task); +#else + hwi_sched(adapter->link_tag); +#endif if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; +#if __FreeBSD_version < 900000 return FILTER_HANDLED; +#else + return FILTER_HANDLED | FILTER_SCHEDULE_THREAD; +#endif } #ifdef DEVICE_POLLING @@ -1350,7 +1415,11 @@ reg_icr = E1000_READ_REG(&adapter->hw, E1000_ICR); /* Link status change */ if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC)) +#if __FreeBSD_version < 900000 igb_handle_link(adapter, 0); +#else + igb_handle_link(adapter); +#endif if (reg_icr & E1000_ICR_RXO) adapter->rx_overruns++; @@ -1452,8 +1521,12 @@ no_calc: /* Schedule a clean task if needed*/ - if (more_tx || more_rx) + if (more_tx || more_rx) +#if __FreeBSD_version < 900000 taskqueue_enqueue(que->tq, &que->que_task); +#else + hwi_sched(que->tag); +#endif else /* Reenable this interrupt */ E1000_WRITE_REG(&adapter->hw, E1000_EIMS, que->eims); @@ -1477,7 +1550,11 @@ icr = E1000_READ_REG(&adapter->hw, E1000_ICR); if (!(icr & E1000_ICR_LSC)) goto spurious; +#if __FreeBSD_version < 900000 igb_handle_link(adapter, 0); +#else + igb_handle_link(adapter); +#endif spurious: /* Rearm */ @@ -2087,6 +2164,7 @@ { device_t dev = adapter->dev; struct igb_queue *que = adapter->queues; + struct tx_ring *txr = adapter->tx_rings; int error, rid = 0; /* Turn off all interrupts */ @@ -2105,6 +2183,9 @@ return (ENXIO); } + TASK_INIT(&txr->txq_task, 0, igb_deferred_mq_start, txr); + +#if __FreeBSD_version < 900000 /* * Try allocating a fast interrupt and the associated deferred * processing contexts. @@ -2125,6 +2206,28 @@ que->tq = NULL; return (error); } +#else + /* Create a taskqueue for deferred transmit queue starts. 
*/ + que->tq = taskqueue_create("igb_taskq", M_NOWAIT, + taskqueue_thread_enqueue, &que->tq); + taskqueue_start_threads(&que->tq, 1, PI_NET, "%s taskq", + device_get_nameunit(adapter->dev)); + + error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE, + igb_irq_fast, igb_handle_que, adapter, &adapter->tag); + if (error) { + device_printf(dev, "Failed to register que interrupt " + "handler: %d\n", error); + return (error); + } + error = bus_setup_intr(dev, adapter->res, INTR_TYPE_NET | INTR_MPSAFE | + INTR_MANUAL, NULL, igb_handle_link, adapter, &adapter->link_tag); + if (error) { + device_printf(dev, "Failed to register link interrupt " + "handler: %d\n", error); + return (error); + } +#endif return (0); } @@ -2175,9 +2278,13 @@ */ if (adapter->num_queues > 1) bus_bind_intr(dev, que->res, i); + TASK_INIT(&que->txr->txq_task, 0, igb_deferred_mq_start, + que->txr); +#if __FreeBSD_version < 900000 /* Make tasklet for deferred handling */ TASK_INIT(&que->que_task, 0, igb_handle_que, que); - que->tq = taskqueue_create_fast("igb_que", M_NOWAIT, +#endif + que->tq = taskqueue_create("igb_que", M_NOWAIT, taskqueue_thread_enqueue, &que->tq); taskqueue_start_threads(&que->tq, 1, PI_NET, "%s que", device_get_nameunit(adapter->dev)); @@ -2382,13 +2489,34 @@ else (adapter->msix != 0) ? (rid = 1):(rid = 0); + que = adapter->queues; if (adapter->tag != NULL) { +#if __FreeBSD_version < 900000 + taskqueue_drain(que->tq, &adapter->link_task); +#endif bus_teardown_intr(dev, adapter->res, adapter->tag); adapter->tag = NULL; } +#if __FreeBSD_version >= 900000 + if (adapter->link_tag != NULL) { + bus_teardown_intr(dev, adapter->res, adapter->link_tag); + adapter->link_tag = NULL; + } +#endif if (adapter->res != NULL) bus_release_resource(dev, SYS_RES_IRQ, rid, adapter->res); + for (int i = 0; i < adapter->num_queues; i++, que++) { + if (que->tq != NULL) { +#if __FreeBSD_version >= 800000 + taskqueue_drain(que->tq, &que->txr->txq_task); +#endif +#if __FreeBSD_version < 900000 + taskqueue_drain(que->tq, &que->que_task); +#endif + taskqueue_free(que->tq); + } + } mem: if (adapter->msix) pci_release_msi(dev); @@ -2637,10 +2765,11 @@ ifp->if_softc = adapter; ifp->if_flags = IFF_BROADCAST | IFF_SIMPLEX | IFF_MULTICAST; ifp->if_ioctl = igb_ioctl; - ifp->if_start = igb_start; #if __FreeBSD_version >= 800000 ifp->if_transmit = igb_mq_start; ifp->if_qflush = igb_qflush; +#else + ifp->if_start = igb_start; #endif IFQ_SET_MAXLEN(&ifp->if_snd, adapter->num_tx_desc - 1); ifp->if_snd.ifq_drv_maxlen = adapter->num_tx_desc - 1; --- //depot/projects/smpng/sys/dev/e1000/if_igb.h 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/dev/e1000/if_igb.h 2010-07-08 19:19:45.000000000 0000 @@ -286,7 +286,9 @@ void *tag; struct tx_ring *txr; struct rx_ring *rxr; +#if __FreeBSD_version < 900000 struct task que_task; +#endif struct taskqueue *tq; u64 irqs; }; @@ -309,6 +311,7 @@ struct buf_ring *br; #endif bus_dma_tag_t txtag; + struct task txq_task; u32 bytes; u32 packets; @@ -377,7 +380,11 @@ int linkvec; int link_mask; +#if __FreeBSD_version < 900000 struct task link_task; +#else + void *link_tag; +#endif int link_irq; struct ifmedia media; --- //depot/projects/smpng/sys/kern/kern_intr.c 2010-06-10 20:54:07.000000000 0000 +++ //depot/user/jhb/intr/kern/kern_intr.c 2010-06-11 13:47:32.000000000 0000 @@ -63,62 +63,92 @@ #endif /* - * Describe an interrupt thread. There is one of these per interrupt event. + * Describe an interrupt thread. Software interrupt events have + * dedicated threads. 
Hardware interrupt events share a pool of + * threads. */ struct intr_thread { - struct intr_event *it_event; struct thread *it_thread; /* Kernel thread. */ - int it_flags; /* (j) IT_* flags. */ - int it_need; /* Needs service. */ + int it_flags; /* IT_* flags. */ + TAILQ_ENTRY(intr_thread) it_list; /* List of free hwi threads. */ +#ifdef INVARIANTS + struct intr_handler *it_current; /* Current handler for hwi. */ +#endif }; +TAILQ_HEAD(ithread_queue, intr_thread); /* Interrupt thread flags kept in it_flags */ #define IT_DEAD 0x000001 /* Thread is waiting to exit. */ +#define IT_SOFT 0x000002 /* Thread is for a software interrupt. */ struct intr_entropy { struct thread *td; uintptr_t event; }; +/* Return values for intr_handler_execute(). */ +enum { + FINISHED, + DYING, + REQUEUE, +}; + struct intr_event *clk_intr_event; struct intr_event *tty_intr_event; void *vm_ih; -struct proc *intrproc; +struct proc *hwintr; /* Pool of hardware interrupt threads. */ +struct proc *swintr; /* Container for software interrupt threads. */ static MALLOC_DEFINE(M_ITHREAD, "ithread", "Interrupt Threads"); +SYSCTL_NODE(_kern, OID_AUTO, intr, CTLFLAG_RD, 0, "Interrupt parameters"); + +#ifdef notyet static int intr_storm_threshold = 1000; -TUNABLE_INT("hw.intr_storm_threshold", &intr_storm_threshold); -SYSCTL_INT(_hw, OID_AUTO, intr_storm_threshold, CTLFLAG_RW, +TUNABLE_INT("kern.intr.storm_threshold", &intr_storm_threshold); +SYSCTL_INT(_kern_intr, OID_AUTO, storm_threshold, CTLFLAG_RW, &intr_storm_threshold, 0, "Number of consecutive interrupts before storm protection is enabled"); -static TAILQ_HEAD(, intr_event) event_list = - TAILQ_HEAD_INITIALIZER(event_list); +#endif + +static TAILQ_HEAD(, intr_hardware) hwi_event_list = + TAILQ_HEAD_INITIALIZER(hwi_event_list); +static TAILQ_HEAD(, intr_software) swi_event_list = + TAILQ_HEAD_INITIALIZER(swi_event_list); static struct mtx event_lock; -MTX_SYSINIT(intr_event_list, &event_lock, "intr event list", MTX_DEF); +MTX_SYSINIT(intr_event_list, &event_lock, "intr event lists", MTX_DEF); + +static struct ithread_queue hwi_threads = + TAILQ_HEAD_INITIALIZER(hwi_threads); +static struct mtx hwi_thread_lock; +MTX_SYSINIT(hwi_thread_pool, &hwi_thread_lock, "hwi threads", MTX_SPIN | + MTX_RECURSE); +static int hwi_check_cpus, hwi_nhandlers, hwi_thread_count, hwi_thread_warn; +static void hwi_alloc_thread(void); +static void hwi_create_thread(void); +static int hwi_destroy_thread(void); +static void hwi_loop(void *); +static int hwi_max_threads(void); +static int hwi_min_threads(void); +static int hwi_pending_cpu(struct intr_thread *it); +static int hwi_thread_sysctl(SYSCTL_HANDLER_ARGS); +static void intr_event_init(struct intr_event *ie, int flags, + const char *fmt, va_list ap); +static void intr_event_run_lock(struct intr_event *ie); +static void intr_event_run_unlock(struct intr_event *ie); static void intr_event_update(struct intr_event *ie); -#ifdef INTR_FILTER -static int intr_event_schedule_thread(struct intr_event *ie, - struct intr_thread *ithd); -static int intr_filter_loop(struct intr_event *ie, - struct trapframe *frame, struct intr_thread **ithd); -static struct intr_thread *ithread_create(const char *name, - struct intr_handler *ih); -#else -static int intr_event_schedule_thread(struct intr_event *ie); -static struct intr_thread *ithread_create(const char *name); -#endif +static void intr_handler_ack_dying(struct intr_event *ie, + struct intr_handler *ih); +static int intr_handler_execute(struct intr_handler *ih, + struct intr_event *ie, struct thread 
*td); +static struct intr_hardware *intr_lookup(int irq); +static struct intr_thread *ithread_create(struct proc **pp, void *func, + void *arg, const char *name, int pri); static void ithread_destroy(struct intr_thread *ithread); -static void ithread_execute_handlers(struct proc *p, - struct intr_event *ie); -#ifdef INTR_FILTER -static void priv_ithread_execute_handler(struct proc *p, - struct intr_handler *ih); -#endif -static void ithread_loop(void *); -static void ithread_update(struct intr_thread *ithd); +static void ithread_update(struct intr_event *ie); static void start_softintr(void *); +static void swi_loop(void *); /* Map an interrupt type to an ithread priority. */ u_char @@ -163,17 +193,18 @@ } /* - * Update an ithread based on the associated intr_event. + * Update the ithread for a software interrupt based on the associated + * intr_event. */ static void -ithread_update(struct intr_thread *ithd) +ithread_update(struct intr_event *ie) { - struct intr_event *ie; + struct intr_software *isw; struct thread *td; u_char pri; - ie = ithd->it_event; - td = ithd->it_thread; + isw = (struct intr_software *)ie; + td = isw->isw_thread->it_thread; /* Determine the overall priority of this event. */ if (TAILQ_EMPTY(&ie->ie_handlers)) @@ -239,62 +270,67 @@ } /* - * If this event has an ithread, update it's priority and - * name. + * If this is a software interrupt event, update the priority + * and name of the associated thread. */ - if (ie->ie_thread != NULL) - ithread_update(ie->ie_thread); + if (ie->ie_flags & IE_SOFT) + ithread_update(ie); CTR2(KTR_INTR, "%s: updated %s", __func__, ie->ie_fullname); } -int -intr_event_create(struct intr_event **event, void *source, int flags, int irq, - void (*pre_ithread)(void *), void (*post_ithread)(void *), - void (*post_filter)(void *), int (*assign_cpu)(void *, u_char), - const char *fmt, ...) +static void +intr_event_init(struct intr_event *ie, int flags, const char *fmt, va_list ap) { - struct intr_event *ie; - va_list ap; - /* The only valid flag during creation is IE_SOFT. */ - if ((flags & ~IE_SOFT) != 0) - return (EINVAL); - ie = malloc(sizeof(struct intr_event), M_ITHREAD, M_WAITOK | M_ZERO); - ie->ie_source = source; - ie->ie_pre_ithread = pre_ithread; - ie->ie_post_ithread = post_ithread; - ie->ie_post_filter = post_filter; - ie->ie_assign_cpu = assign_cpu; ie->ie_flags = flags; - ie->ie_irq = irq; ie->ie_cpu = NOCPU; TAILQ_INIT(&ie->ie_handlers); mtx_init(&ie->ie_lock, "intr event", NULL, MTX_DEF); + vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap); + strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname)); + CTR1(KTR_INTR, "intr_event_init: created %s", ie->ie_name); +} + +void +hwi_create(struct intr_event **event, void *source, int irq, + void (*pre_ithread)(void *), void (*post_ithread)(void *), + void (*post_filter)(void *), int (*assign_cpu)(void *, u_char), + const char *fmt, ...) 
+{
+        struct intr_hardware *ihw;
+        va_list ap;
+
+        ihw = malloc(sizeof(struct intr_hardware), M_ITHREAD,
+            M_WAITOK | M_ZERO);
         va_start(ap, fmt);
-        vsnprintf(ie->ie_name, sizeof(ie->ie_name), fmt, ap);
+        intr_event_init(&ihw->ihw_event, 0, fmt, ap);
         va_end(ap);
-        strlcpy(ie->ie_fullname, ie->ie_name, sizeof(ie->ie_fullname));
+        ihw->ihw_source = source;
+        ihw->ihw_pre_ithread = pre_ithread;
+        ihw->ihw_post_ithread = post_ithread;
+        ihw->ihw_post_filter = post_filter;
+        ihw->ihw_assign_cpu = assign_cpu;
+        ihw->ihw_irq = irq;
+        TAILQ_INIT(&ihw->ihw_manual);
         mtx_lock(&event_lock);
-        TAILQ_INSERT_TAIL(&event_list, ie, ie_list);
+        TAILQ_INSERT_TAIL(&hwi_event_list, ihw, ihw_list);
         mtx_unlock(&event_lock);
         if (event != NULL)
-                *event = ie;
-        CTR2(KTR_INTR, "%s: created %s", __func__, ie->ie_name);
-        return (0);
+                *event = &ihw->ihw_event;
 }
 
 /*
  * Bind an interrupt event to the specified CPU.  Note that not all
  * platforms support binding an interrupt to a CPU.  For those
- * platforms this request will fail.  For supported platforms, any
- * associated ithreads as well as the primary interrupt context will
- * be bound to the specificed CPU.  Using a cpu id of NOCPU unbinds
+ * platforms this request will fail.  Using a cpu id of NOCPU unbinds
  * the interrupt event.
  */
 int
 intr_event_bind(struct intr_event *ie, u_char cpu)
 {
+        struct intr_hardware *ihw;
+        struct intr_software *isw;
         cpuset_t mask;
         lwpid_t id;
         int error;
@@ -303,46 +339,33 @@
 
         if (cpu != NOCPU && CPU_ABSENT(cpu))
                 return (EINVAL);
 
-        if (ie->ie_assign_cpu == NULL)
-                return (EOPNOTSUPP);
-
         error = priv_check(curthread, PRIV_SCHED_CPUSET_INTR);
         if (error)
                 return (error);
 
-        /*
-         * If we have any ithreads try to set their mask first to verify
-         * permissions, etc.
-         */
-        mtx_lock(&ie->ie_lock);
-        if (ie->ie_thread != NULL) {
+        if (ie->ie_flags & IE_SOFT) {
+                /* For software interrupts, apply the cpuset to the associated swi thread.
*/ + isw = (struct intr_software *)ie; + mtx_lock(&ie->ie_lock); CPU_ZERO(&mask); if (cpu == NOCPU) CPU_COPY(cpuset_root, &mask); else CPU_SET(cpu, &mask); - id = ie->ie_thread->it_thread->td_tid; + id = isw->isw_thread->it_thread->td_tid; mtx_unlock(&ie->ie_lock); error = cpuset_setthread(id, &mask); if (error) return (error); - } else - mtx_unlock(&ie->ie_lock); - error = ie->ie_assign_cpu(ie->ie_source, cpu); - if (error) { - mtx_lock(&ie->ie_lock); - if (ie->ie_thread != NULL) { - CPU_ZERO(&mask); - if (ie->ie_cpu == NOCPU) - CPU_COPY(cpuset_root, &mask); - else - CPU_SET(cpu, &mask); - id = ie->ie_thread->it_thread->td_tid; - mtx_unlock(&ie->ie_lock); - (void)cpuset_setthread(id, &mask); - } else - mtx_unlock(&ie->ie_lock); - return (error); + } else { + ihw = (struct intr_hardware *)ie; + + if (ihw->ihw_assign_cpu == NULL) + return (EOPNOTSUPP); + + error = ihw->ihw_assign_cpu(ihw->ihw_source, cpu); + if (error) + return (error); } mtx_lock(&ie->ie_lock); @@ -352,25 +375,24 @@ return (error); } -static struct intr_event * +static struct intr_hardware * intr_lookup(int irq) { - struct intr_event *ie; + struct intr_hardware *ihw; mtx_lock(&event_lock); - TAILQ_FOREACH(ie, &event_list, ie_list) - if (ie->ie_irq == irq && - (ie->ie_flags & IE_SOFT) == 0 && - TAILQ_FIRST(&ie->ie_handlers) != NULL) + TAILQ_FOREACH(ihw, &hwi_event_list, ihw_list) + if (ihw->ihw_irq == irq && + TAILQ_FIRST(&ihw->ihw_event.ie_handlers) != NULL) break; mtx_unlock(&event_lock); - return (ie); + return (ihw); } int intr_setaffinity(int irq, void *m) { - struct intr_event *ie; + struct intr_hardware *ihw; cpuset_t *mask; u_char cpu; int n; @@ -390,50 +412,49 @@ cpu = (u_char)n; } } - ie = intr_lookup(irq); - if (ie == NULL) + ihw = intr_lookup(irq); + if (ihw == NULL) return (ESRCH); - return (intr_event_bind(ie, cpu)); + return (intr_event_bind(&ihw->ihw_event, cpu)); } int intr_getaffinity(int irq, void *m) { - struct intr_event *ie; + struct intr_hardware *ihw; cpuset_t *mask; mask = m; - ie = intr_lookup(irq); - if (ie == NULL) + ihw = intr_lookup(irq); + if (ihw == NULL) return (ESRCH); CPU_ZERO(mask); - mtx_lock(&ie->ie_lock); - if (ie->ie_cpu == NOCPU) + mtx_lock(&ihw->ihw_event.ie_lock); + if (ihw->ihw_event.ie_cpu == NOCPU) CPU_COPY(cpuset_root, mask); else - CPU_SET(ie->ie_cpu, mask); - mtx_unlock(&ie->ie_lock); + CPU_SET(ihw->ihw_event.ie_cpu, mask); + mtx_unlock(&ihw->ihw_event.ie_lock); return (0); } int -intr_event_destroy(struct intr_event *ie) +hwi_destroy(struct intr_event *ie) { + struct intr_hardware *ihw; + if (ie->ie_flags & IE_SOFT) + return (EINVAL); + ihw = (struct intr_hardware *)ie; + mtx_lock(&event_lock); mtx_lock(&ie->ie_lock); - if (!TAILQ_EMPTY(&ie->ie_handlers)) { + if (!TAILQ_EMPTY(&ie->ie_handlers) || !TAILQ_EMPTY(&ihw->ihw_manual)) { mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); return (EBUSY); } - TAILQ_REMOVE(&event_list, ie, ie_list); -#ifndef notyet - if (ie->ie_thread != NULL) { - ithread_destroy(ie->ie_thread); - ie->ie_thread = NULL; - } -#endif + TAILQ_REMOVE(&hwi_event_list, ihw, ihw_list); mtx_unlock(&ie->ie_lock); mtx_unlock(&event_lock); mtx_destroy(&ie->ie_lock); @@ -441,63 +462,41 @@ return (0); } -#ifndef INTR_FILTER +/* Create an interrupt thread. 
*/ static struct intr_thread * -ithread_create(const char *name) +ithread_create(struct proc **pp, void *func, void *arg, const char *name, + int pri) { struct intr_thread *ithd; struct thread *td; int error; - ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); - - error = kproc_kthread_add(ithread_loop, ithd, &intrproc, - &td, RFSTOPPED | RFHIGHPID, - 0, "intr", "%s", name); + ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | + M_ZERO); + error = kproc_kthread_add(func, arg, pp, &td, RFSTOPPED | RFHIGHPID, 0, + name, name); if (error) - panic("kproc_create() failed with %d", error); + panic("failed to create interrupt thread with %d", error); thread_lock(td); sched_class(td, PRI_ITHD); + sched_prio(td, pri); TD_SET_IWAIT(td); thread_unlock(td); td->td_pflags |= TDP_ITHREAD; + td->td_ithread = ithd; ithd->it_thread = td; - CTR2(KTR_INTR, "%s: created %s", __func__, name); + CTR2(KTR_INTR, "ithread_create: created tid %d(%s)", td->td_tid, name); return (ithd); } -#else -static struct intr_thread * -ithread_create(const char *name, struct intr_handler *ih) -{ - struct intr_thread *ithd; - struct thread *td; - int error; - - ithd = malloc(sizeof(struct intr_thread), M_ITHREAD, M_WAITOK | M_ZERO); - error = kproc_kthread_add(ithread_loop, ih, &intrproc, - &td, RFSTOPPED | RFHIGHPID, - 0, "intr", "%s", name); - if (error) - panic("kproc_create() failed with %d", error); - thread_lock(td); - sched_class(td, PRI_ITHD); - TD_SET_IWAIT(td); - thread_unlock(td); - td->td_pflags |= TDP_ITHREAD; - ithd->it_thread = td; - CTR2(KTR_INTR, "%s: created %s", __func__, name); - return (ithd); -} -#endif - static void ithread_destroy(struct intr_thread *ithread) { struct thread *td; - CTR2(KTR_INTR, "%s: killing %s", __func__, ithread->it_event->ie_name); td = ithread->it_thread; + CTR2(KTR_INTR, "ithread_destroy: killing tid %d(%s)", td->td_tid, + td->td_name); thread_lock(td); ithread->it_flags |= IT_DEAD; if (TD_AWAITING_INTR(td)) { @@ -507,106 +506,236 @@ thread_unlock(td); } -#ifndef INTR_FILTER -int -intr_event_add_handler(struct intr_event *ie, const char *name, - driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, - enum intr_type flags, void **cookiep) +/* + * Look for a CPU that has queued interrupts but no active interrupt + * thread to donate the specified thread to. Returns ID of a CPU that + * has queued handlers or NOCPU if no such CPU was found. If no CPU + * was found, hwi_check_cpus is reset. + * + * This is the recovery mechanism used for the case where an interrupt + * handler was scheduled on a CPU's active queue but an interrupt + * thread was not available. + */ +static int +hwi_pending_cpu(struct intr_thread *it) +{ + struct pcpu *pc; + + mtx_assert(&hwi_thread_lock, MA_OWNED); + SLIST_FOREACH(pc, &cpuhead, pc_allcpu) { + if (!STAILQ_EMPTY(&pc->pc_hwi_active) && + pc->pc_hwi_thread == NULL) { + CTR3(KTR_INTR, + "hwi_pending_cpu: tid %d (%s) assigned to CPU %d", + it->it_thread->td_tid, it->it_thread->td_name, + pc->pc_cpuid); + pc->pc_hwi_thread = it->it_thread; + return (pc->pc_cpuid); + } + } + hwi_check_cpus = 0; + return (NOCPU); +} + +/* Create a hardware interrupt thread. 
*/ +static void +hwi_create_thread(void) { - struct intr_handler *ih, *temp_ih; struct intr_thread *it; + struct thread *td; + int cpuid; - if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) - return (EINVAL); + mtx_assert(&hwi_thread_lock, MA_OWNED); + hwi_thread_count++; + mtx_unlock_spin(&hwi_thread_lock); + it = ithread_create(&hwintr, hwi_loop, NULL, "intr", PRI_MIN_ITHD); + td = it->it_thread; + thread_lock(td); + mtx_lock_spin(&hwi_thread_lock); + hwi_thread_warn = 0; + thread_lock_set(td, &hwi_thread_lock); - /* Allocate and populate an interrupt handler structure. */ - ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); - ih->ih_filter = filter; - ih->ih_handler = handler; - ih->ih_argument = arg; - strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); - ih->ih_event = ie; - ih->ih_pri = pri; - if (flags & INTR_EXCL) - ih->ih_flags = IH_EXCLUSIVE; - if (flags & INTR_MPSAFE) - ih->ih_flags |= IH_MPSAFE; - if (flags & INTR_ENTROPY) - ih->ih_flags |= IH_ENTROPY; - - /* We can only have one exclusive handler in a event. */ - mtx_lock(&ie->ie_lock); - if (!TAILQ_EMPTY(&ie->ie_handlers)) { - if ((flags & INTR_EXCL) || - (TAILQ_FIRST(&ie->ie_handlers)->ih_flags & IH_EXCLUSIVE)) { - mtx_unlock(&ie->ie_lock); - free(ih, M_ITHREAD); - return (EINVAL); + /* Schedule this thread immediately if a CPU needs a thread. */ + if (hwi_check_cpus) { + cpuid = hwi_pending_cpu(it); + if (cpuid != NOCPU) { + CTR3(KTR_INTR, + "hwi_create_thread: schedule tid %d (%s) for CPU %d", + td->td_tid, td->td_name, cpuid); + sched_bind_ithd(td, cpuid); + TD_CLR_IWAIT(td); + sched_add(td, SRQ_INTR); + thread_unlock(td); + mtx_lock_spin(&hwi_thread_lock); + return; } } - /* Add the new handler to the event in priority order. */ - TAILQ_FOREACH(temp_ih, &ie->ie_handlers, ih_next) { - if (temp_ih->ih_pri > ih->ih_pri) + TAILQ_INSERT_TAIL(&hwi_threads, it, it_list); +} + +/* + * Destroy a hardware interrupt thread if a free one is available. If + * all threads are busy, this will return false instead. + */ +static int +hwi_destroy_thread(void) +{ + struct intr_thread *it; + + mtx_assert(&hwi_thread_lock, MA_OWNED); + KASSERT(hwi_thread_count > 0, ("no hwi threads to destroy")); + if (TAILQ_EMPTY(&hwi_threads)) + return (0); + hwi_thread_count--; + it = TAILQ_LAST(&hwi_threads, ithread_queue); + TAILQ_REMOVE(&hwi_threads, it, it_list); + mtx_unlock_spin(&hwi_thread_lock); + ithread_destroy(it); + mtx_lock_spin(&hwi_thread_lock); + return (1); +} + +/* Minimum number of hardware interrupt threads. */ +static __inline int +hwi_min_threads(void) +{ + + /* + * XXX: Capping the minimum at 2 threads per CPU is completely + * arbitrary. + */ + if (hwi_nhandlers < mp_ncpus * 2) + return (hwi_nhandlers); + else + return (mp_ncpus * 2); +} + +/* Maximum number of hardware interrupt threads. */ +static __inline int +hwi_max_threads(void) +{ + + /* No reason to have more threads than handlers. 
*/ + return (hwi_nhandlers); +} + +static int +hwi_thread_sysctl(SYSCTL_HANDLER_ARGS) +{ + int error, value; + + value = hwi_thread_count; + error = sysctl_handle_int(oidp, &value, 0, req); + if (error || req->newptr == NULL) + return (error); + + if (value < hwi_min_threads() || value > hwi_max_threads()) + return (EINVAL); + + mtx_lock_spin(&hwi_thread_lock); + while (hwi_thread_count < value && + hwi_thread_count < hwi_max_threads()) + hwi_create_thread(); + while (hwi_thread_count > value && + hwi_thread_count > hwi_min_threads()) + if (!hwi_destroy_thread()) break; + mtx_unlock_spin(&hwi_thread_lock); + return (0); +} +SYSCTL_PROC(_kern_intr, OID_AUTO, thread_count, CTLFLAG_RW | CTLTYPE_INT, + NULL, 0, hwi_thread_sysctl, "I", "Number of hardware interrupt threads"); + +/* + * Acquire the interrupt event "run" lock in a non-interrupt context. + */ +static __inline void +intr_event_run_lock(struct intr_event *ie) +{ + + /* + * We disable interrupts while we hold the interrupt event run + * lock to avoid a priority inversion deadlock if this + * interrupt fires. + */ + spinlock_enter(); + while (!atomic_cmpset_acq_int(&ie->ie_running, 0, 1)) { + spinlock_exit(); + + while (ie->ie_running) + cpu_spinwait(); + + spinlock_enter(); } - if (temp_ih == NULL) - TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); - else - TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); - intr_event_update(ie); +} - /* Create a thread if we need one. */ - while (ie->ie_thread == NULL && handler != NULL) { - if (ie->ie_flags & IE_ADDING_THREAD) - msleep(ie, &ie->ie_lock, 0, "ithread", 0); - else { - ie->ie_flags |= IE_ADDING_THREAD; - mtx_unlock(&ie->ie_lock); - it = ithread_create("intr: newborn"); - mtx_lock(&ie->ie_lock); - ie->ie_flags &= ~IE_ADDING_THREAD; - ie->ie_thread = it; - it->it_event = ie; - ithread_update(it); - wakeup(ie); - } - } - CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, - ie->ie_name); - mtx_unlock(&ie->ie_lock); +/* + * Release the interrupt event "run" lock in a non-interrupt context. + */ +static __inline void +intr_event_run_unlock(struct intr_event *ie) +{ - if (cookiep != NULL) - *cookiep = ih; - return (0); + atomic_store_rel_int(&ie->ie_running, 0); + spinlock_exit(); } -#else + int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep) { struct intr_handler *ih, *temp_ih; - struct intr_thread *it; + struct intr_hardware *ihw; - if (ie == NULL || name == NULL || (handler == NULL && filter == NULL)) + if (ie == NULL || name == NULL || + (handler == NULL && filter == NULL) || + ((flags & INTR_MANUAL) && filter != NULL) || + ((flags & INTR_MANUAL) && (ie->ie_flags & IE_SOFT)) || + ((ie->ie_flags & IE_SOFT) && filter != NULL)) return (EINVAL); /* Allocate and populate an interrupt handler structure. */ - ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | M_ZERO); + ih = malloc(sizeof(struct intr_handler), M_ITHREAD, M_WAITOK | + M_ZERO); ih->ih_filter = filter; ih->ih_handler = handler; ih->ih_argument = arg; strlcpy(ih->ih_name, name, sizeof(ih->ih_name)); ih->ih_event = ie; ih->ih_pri = pri; + if (flags & INTR_MANUAL) + ih->ih_flags |= IH_MANUAL; if (flags & INTR_EXCL) - ih->ih_flags = IH_EXCLUSIVE; + ih->ih_flags |= IH_EXCLUSIVE; if (flags & INTR_MPSAFE) ih->ih_flags |= IH_MPSAFE; if (flags & INTR_ENTROPY) ih->ih_flags |= IH_ENTROPY; + /* + * Manually scheduled interrupt handlers are not part of the + * normal list of handlers. 
However, they require that at + * least one non-manual handler is already active for this + * event. In general they should be handled by an existing + * filter or handler. + * + * XXX: They should perhaps just be swi handlers instead. + */ + if (flags & INTR_MANUAL) { + mtx_lock(&ie->ie_lock); + if (TAILQ_EMPTY(&ie->ie_handlers)) { + mtx_unlock(&ie->ie_lock); + free(ih, M_ITHREAD); + return (EINVAL); + } + ihw = (struct intr_hardware *)ie; + TAILQ_INSERT_TAIL(&ihw->ihw_manual, ih, ih_next); + mtx_unlock(&ie->ie_lock); + goto finish; + } + /* We can only have one exclusive handler in a event. */ mtx_lock(&ie->ie_lock); if (!TAILQ_EMPTY(&ie->ie_handlers)) { @@ -623,46 +752,32 @@ if (temp_ih->ih_pri > ih->ih_pri) break; } + intr_event_run_lock(ie); if (temp_ih == NULL) TAILQ_INSERT_TAIL(&ie->ie_handlers, ih, ih_next); else TAILQ_INSERT_BEFORE(temp_ih, ih, ih_next); + intr_event_run_unlock(ie); intr_event_update(ie); - /* For filtered handlers, create a private ithread to run on. */ - if (filter != NULL && handler != NULL) { - mtx_unlock(&ie->ie_lock); - it = ithread_create("intr: newborn", ih); - mtx_lock(&ie->ie_lock); - it->it_event = ie; - ih->ih_thread = it; - ithread_update(it); // XXX - do we really need this?!?!? - } else { /* Create the global per-event thread if we need one. */ - while (ie->ie_thread == NULL && handler != NULL) { - if (ie->ie_flags & IE_ADDING_THREAD) - msleep(ie, &ie->ie_lock, 0, "ithread", 0); - else { - ie->ie_flags |= IE_ADDING_THREAD; - mtx_unlock(&ie->ie_lock); - it = ithread_create("intr: newborn", ih); - mtx_lock(&ie->ie_lock); - ie->ie_flags &= ~IE_ADDING_THREAD; - ie->ie_thread = it; - it->it_event = ie; - ithread_update(it); - wakeup(ie); - } - } - } CTR3(KTR_INTR, "%s: added %s to %s", __func__, ih->ih_name, ie->ie_name); mtx_unlock(&ie->ie_lock); + /* Expand the hardware interrupt thread pool if needed. */ + if (!(ie->ie_flags & IE_SOFT) && ih->ih_handler != NULL) { + mtx_lock_spin(&hwi_thread_lock); + hwi_nhandlers++; + while (hwi_thread_count < hwi_min_threads()) + hwi_create_thread(); + mtx_unlock_spin(&hwi_thread_lock); + } + +finish: if (cookiep != NULL) *cookiep = ih; return (0); } -#endif /* * Append a description preceded by a ':' to the name of the specified @@ -672,6 +787,9 @@ intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr) { +#ifdef INVARIANTS + struct intr_hardware *ihw; +#endif struct intr_handler *ih; size_t space; char *start; @@ -682,6 +800,13 @@ if (ih == cookie) break; } + if (ih == NULL && !(ie->ie_flags & IE_SOFT)) { + ihw = (struct intr_hardware *)ie; + TAILQ_FOREACH(ih, &ihw->ihw_manual, ih_next) { + if (ih == cookie) + break; + } + } if (ih == NULL) { mtx_unlock(&ie->ie_lock); panic("handler %p not found in interrupt event %p", cookie, ie); @@ -720,12 +845,13 @@ } /* - * Return the ie_source field from the intr_event an intr_handler is - * associated with. + * Return the source cookie for a hardware interrupt that a hardware + * interrupt handler is associated with. 
*/ void * -intr_handler_source(void *cookie) +hwi_handler_source(void *cookie) { + struct intr_hardware *ihw; struct intr_handler *ih; struct intr_event *ie; @@ -736,168 +862,37 @@ KASSERT(ie != NULL, ("interrupt handler \"%s\" has a NULL interrupt event", ih->ih_name)); - return (ie->ie_source); + KASSERT(!(ie->ie_flags & IE_SOFT), + ("intr_handler_source: swi handler")); + ihw = (struct intr_hardware *)ie; + return (ihw->ihw_source); } -#ifndef INTR_FILTER -int -intr_event_remove_handler(void *cookie) +/* + * Called from an interrupt thread loop when it encounters a dying + * interrupt handler. This marks the handler as dead and awakens the + * sleeping thread that is removing the handler. + */ +static void +intr_handler_ack_dying(struct intr_event *ie, struct intr_handler *ih) { - struct intr_handler *handler = (struct intr_handler *)cookie; - struct intr_event *ie; -#ifdef INVARIANTS - struct intr_handler *ih; -#endif -#ifdef notyet - int dead; -#endif - if (handler == NULL) - return (EINVAL); - ie = handler->ih_event; - KASSERT(ie != NULL, - ("interrupt handler \"%s\" has a NULL interrupt event", - handler->ih_name)); mtx_lock(&ie->ie_lock); - CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, - ie->ie_name); -#ifdef INVARIANTS - TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) - if (ih == handler) - goto ok; + ih->ih_state = IS_DEAD; + wakeup(ih); mtx_unlock(&ie->ie_lock); - panic("interrupt handler \"%s\" not found in interrupt event \"%s\"", - ih->ih_name, ie->ie_name); -ok: -#endif - /* - * If there is no ithread, then just remove the handler and return. - * XXX: Note that an INTR_FAST handler might be running on another - * CPU! - */ - if (ie->ie_thread == NULL) { - TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock(&ie->ie_lock); - free(handler, M_ITHREAD); - return (0); - } - - /* - * If the interrupt thread is already running, then just mark this - * handler as being dead and let the ithread do the actual removal. - * - * During a cold boot while cold is set, msleep() does not sleep, - * so we have to remove the handler here rather than letting the - * thread do it. - */ - thread_lock(ie->ie_thread->it_thread); - if (!TD_AWAITING_INTR(ie->ie_thread->it_thread) && !cold) { - handler->ih_flags |= IH_DEAD; - - /* - * Ensure that the thread will process the handler list - * again and remove this handler if it has already passed - * it on the list. - */ - ie->ie_thread->it_need = 1; - } else - TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - thread_unlock(ie->ie_thread->it_thread); - while (handler->ih_flags & IH_DEAD) - msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); - intr_event_update(ie); -#ifdef notyet - /* - * XXX: This could be bad in the case of ppbus(8). Also, I think - * this could lead to races of stale data when servicing an - * interrupt. - */ - dead = 1; - TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { - if (!(ih->ih_flags & IH_FAST)) { - dead = 0; - break; - } - } - if (dead) { - ithread_destroy(ie->ie_thread); - ie->ie_thread = NULL; - } -#endif - mtx_unlock(&ie->ie_lock); - free(handler, M_ITHREAD); - return (0); } -static int -intr_event_schedule_thread(struct intr_event *ie) -{ - struct intr_entropy entropy; - struct intr_thread *it; - struct thread *td; - struct thread *ctd; - struct proc *p; - - /* - * If no ithread or no handlers, then we have a stray interrupt. 
- */ - if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || - ie->ie_thread == NULL) - return (EINVAL); - - ctd = curthread; - it = ie->ie_thread; - td = it->it_thread; - p = td->td_proc; - - /* - * If any of the handlers for this ithread claim to be good - * sources of entropy, then gather some. - */ - if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) { - CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__, - p->p_pid, td->td_name); - entropy.event = (uintptr_t)ie; - entropy.td = ctd; - random_harvest(&entropy, sizeof(entropy), 2, 0, - RANDOM_INTERRUPT); - } - - KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); - - /* - * Set it_need to tell the thread to keep running if it is already - * running. Then, lock the thread and see if we actually need to - * put it on the runqueue. - */ - it->it_need = 1; - thread_lock(td); - if (TD_AWAITING_INTR(td)) { - CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, - td->td_name); - TD_CLR_IWAIT(td); - sched_add(td, SRQ_INTR); - } else { - CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", - __func__, p->p_pid, td->td_name, it->it_need, td->td_state); - } - thread_unlock(td); - - return (0); -} -#else int intr_event_remove_handler(void *cookie) { struct intr_handler *handler = (struct intr_handler *)cookie; struct intr_event *ie; - struct intr_thread *it; #ifdef INVARIANTS + struct intr_hardware *ihw; struct intr_handler *ih; #endif -#ifdef notyet - int dead; -#endif + int state; if (handler == NULL) return (EINVAL); @@ -909,6 +904,13 @@ CTR3(KTR_INTR, "%s: removing %s from %s", __func__, handler->ih_name, ie->ie_name); #ifdef INVARIANTS + if (!(ie->ie_flags & IE_SOFT)) { + ihw = (struct intr_hardware *)ie; + TAILQ_FOREACH(ih, &ihw->ihw_manual, ih_next) { + if (ih == handler) + goto ok; + } + } TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) if (ih == handler) goto ok; @@ -917,139 +919,120 @@ ih->ih_name, ie->ie_name); ok: #endif - /* - * If there are no ithreads (per event and per handler), then - * just remove the handler and return. - * XXX: Note that an INTR_FAST handler might be running on another CPU! - */ - if (ie->ie_thread == NULL && handler->ih_thread == NULL) { - TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - mtx_unlock(&ie->ie_lock); - free(handler, M_ITHREAD); - return (0); - } - /* Private or global ithread? */ - it = (handler->ih_thread) ? handler->ih_thread : ie->ie_thread; /* - * If the interrupt thread is already running, then just mark this - * handler as being dead and let the ithread do the actual removal. - * - * During a cold boot while cold is set, msleep() does not sleep, - * so we have to remove the handler here rather than letting the - * thread do it. + * Manual interrupt handlers are on a separate list in the + * interrupt event. */ - thread_lock(it->it_thread); - if (!TD_AWAITING_INTR(it->it_thread) && !cold) { - handler->ih_flags |= IH_DEAD; - + if (handler->ih_flags & IH_MANUAL) { + ihw = (struct intr_hardware *)ie; + TAILQ_REMOVE(&ihw->ihw_manual, handler, ih_next); + } else { /* - * Ensure that the thread will process the handler list - * again and remove this handler if it has already passed - * it on the list. + * First, wait for the interrupt event to go idle so + * we can remove the handler from the event's list. 
*/ - it->it_need = 1; - } else + intr_event_run_lock(ie); TAILQ_REMOVE(&ie->ie_handlers, handler, ih_next); - thread_unlock(it->it_thread); - while (handler->ih_flags & IH_DEAD) - msleep(handler, &ie->ie_lock, 0, "iev_rmh", 0); - /* - * At this point, the handler has been disconnected from the event, - * so we can kill the private ithread if any. - */ - if (handler->ih_thread) { - ithread_destroy(handler->ih_thread); - handler->ih_thread = NULL; + intr_event_run_unlock(ie); + intr_event_update(ie); } - intr_event_update(ie); -#ifdef notyet + mtx_unlock(&ie->ie_lock); + /* - * XXX: This could be bad in the case of ppbus(8). Also, I think - * this could lead to races of stale data when servicing an - * interrupt. + * Next, wait for the interrupt handler to go idle. If it is + * already idle, just mark it as dead. If it has been queued + * or is executing, attempt to mark it as dying and then wait + * for an interrupt thread to drain it from an active list. */ - dead = 1; - TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { - if (handler != NULL) { - dead = 0; + for (state = handler->ih_state; state != IS_DEAD; + state = handler->ih_state) { + switch (state) { + case IS_IDLE: + /* + * If the interrupt handler is idle, try to mark + * it as dead. + */ + atomic_cmpset_int(&handler->ih_state, IS_IDLE, IS_DEAD); + break; + default: + /* + * If the interrupt handler is busy, mark it + * as dying and wait. + */ + if (atomic_cmpset_int(&handler->ih_state, state, + IS_DYING)) { + mtx_lock(&ie->ie_lock); + while (handler->ih_state != IS_DEAD) + mtx_sleep(handler, &ie->ie_lock, 0, + "iev_rmh", 0); + mtx_unlock(&ie->ie_lock); + } break; } } - if (dead) { - ithread_destroy(ie->ie_thread); - ie->ie_thread = NULL; + + /* Shrink the hardware interrupt thread pool if needed. */ + if (!(ie->ie_flags & IE_SOFT) && handler->ih_handler != NULL && + !(handler->ih_flags & IH_MANUAL)) { + mtx_lock_spin(&hwi_thread_lock); + hwi_nhandlers--; + while (hwi_thread_count > hwi_max_threads()) + if (!hwi_destroy_thread()) + break; + mtx_unlock_spin(&hwi_thread_lock); } -#endif - mtx_unlock(&ie->ie_lock); + + /* The handler is now unreferenced, so can finally free it. */ free(handler, M_ITHREAD); return (0); } -static int -intr_event_schedule_thread(struct intr_event *ie, struct intr_thread *it) +/* Create a software interrupt event and thread. */ +int +swi_create(struct intr_event **event, const char *fmt, ...) { - struct intr_entropy entropy; - struct thread *td; - struct thread *ctd; - struct proc *p; + struct intr_software *isw; + va_list ap; - /* - * If no ithread or no handlers, then we have a stray interrupt. - */ - if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers) || it == NULL) - return (EINVAL); - - ctd = curthread; - td = it->it_thread; - p = td->td_proc; - - /* - * If any of the handlers for this ithread claim to be good - * sources of entropy, then gather some. - */ - if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) { - CTR3(KTR_INTR, "%s: pid %d (%s) gathering entropy", __func__, - p->p_pid, td->td_name); - entropy.event = (uintptr_t)ie; - entropy.td = ctd; - random_harvest(&entropy, sizeof(entropy), 2, 0, - RANDOM_INTERRUPT); - } - - KASSERT(p != NULL, ("ithread %s has no process", ie->ie_name)); - - /* - * Set it_need to tell the thread to keep running if it is already - * running. Then, lock the thread and see if we actually need to - * put it on the runqueue. 
- */ - it->it_need = 1; - thread_lock(td); - if (TD_AWAITING_INTR(td)) { - CTR3(KTR_INTR, "%s: schedule pid %d (%s)", __func__, p->p_pid, - td->td_name); - TD_CLR_IWAIT(td); - sched_add(td, SRQ_INTR); - } else { - CTR5(KTR_INTR, "%s: pid %d (%s): it_need %d, state %d", - __func__, p->p_pid, td->td_name, it->it_need, td->td_state); - } - thread_unlock(td); - + isw = malloc(sizeof(struct intr_software), M_ITHREAD, + M_WAITOK | M_ZERO); + va_start(ap, fmt); + intr_event_init(&isw->isw_event, IE_SOFT, fmt, ap); + va_end(ap); + isw->isw_thread = ithread_create(&swintr, swi_loop, isw, "swi", + PRI_MAX_ITHD); + isw->isw_thread->it_flags |= IT_SOFT; + STAILQ_INIT(&isw->isw_active); + mtx_lock(&event_lock); + TAILQ_INSERT_TAIL(&swi_event_list, isw, isw_list); + mtx_unlock(&event_lock); + if (event != NULL) + *event = &isw->isw_event; return (0); } -#endif -/* - * Allow interrupt event binding for software interrupt handlers -- a no-op, - * since interrupts are generated in software rather than being directed by - * a PIC. - */ -static int -swi_assign_cpu(void *arg, u_char cpu) +/* Tear down a software interrupt event and thread. */ +int +swi_destroy(struct intr_event *ie) { + struct intr_software *isw; + if (!(ie->ie_flags & IE_SOFT)) + return (EINVAL); + isw = (struct intr_software *)ie; + + mtx_lock(&event_lock); + mtx_lock(&ie->ie_lock); + if (!TAILQ_EMPTY(&ie->ie_handlers)) { + mtx_unlock(&ie->ie_lock); + mtx_unlock(&event_lock); + return (EBUSY); + } + TAILQ_REMOVE(&swi_event_list, isw, isw_list); + mtx_unlock(&ie->ie_lock); + mtx_unlock(&event_lock); + ithread_destroy(isw->isw_thread); return (0); } @@ -1063,6 +1046,7 @@ { struct thread *td; struct intr_event *ie; + struct intr_software *isw; int error; if (flags & INTR_ENTROPY) @@ -1074,8 +1058,7 @@ if (!(ie->ie_flags & IE_SOFT)) return (EINVAL); } else { - error = intr_event_create(&ie, NULL, IE_SOFT, 0, - NULL, NULL, NULL, swi_assign_cpu, "swi%d:", pri); + error = swi_create(&ie, "swi%d:", pri); if (error) return (error); if (eventp != NULL) @@ -1086,7 +1069,8 @@ if (error) return (error); if (pri == SWI_CLOCK) { - td = ie->ie_thread->it_thread; + isw = (struct intr_software *)ie; + td = isw->isw_thread->it_thread; thread_lock(td); td->td_flags |= TDF_NOLOAD; thread_unlock(td); @@ -1095,547 +1079,799 @@ } /* - * Schedule a software interrupt thread. + * Schedule a software interrupt handler. */ void swi_sched(void *cookie, int flags) { - struct intr_handler *ih = (struct intr_handler *)cookie; - struct intr_event *ie = ih->ih_event; - int error; + struct intr_software *isw; + struct intr_handler *ih; + struct intr_event *ie; + struct thread *td; + int state; + + ih = cookie; + ie = ih->ih_event; + KASSERT(ie->ie_flags & IE_SOFT, + ("swi_sched: hardware interrupt event")); + isw = (struct intr_software *)ie; + td = isw->isw_thread->it_thread; + CTR3(KTR_INTR, "swi_sched: %s %s state=%d", ie->ie_name, ih->ih_name, + ih->ih_state); - CTR3(KTR_INTR, "swi_sched: %s %s need=%d", ie->ie_name, ih->ih_name, - ih->ih_need); + for (;;) { + state = ih->ih_state; + switch (state) { + case IS_IDLE: + /* + * Try to change state to queued. If that fails, + * try the loop again. + */ + if (!atomic_cmpset_int(&ih->ih_state, IS_IDLE, + IS_QUEUED)) + break; - /* - * Set ih_need for this handler so that if the ithread is already - * running it will execute this handler on the next pass. Otherwise, - * it will execute it the next time it runs. - */ - atomic_store_rel_int(&ih->ih_need, 1); + /* Queue the handler. 
*/ + thread_lock(td); + STAILQ_INSERT_TAIL(&isw->isw_active, ih, ih_queued); + PCPU_INC(cnt.v_soft); - if (!(flags & SWI_DELAY)) { - PCPU_INC(cnt.v_soft); -#ifdef INTR_FILTER - error = intr_event_schedule_thread(ie, ie->ie_thread); -#else - error = intr_event_schedule_thread(ie); -#endif - KASSERT(error == 0, ("stray software interrupt")); + /* Schedule the thread if needed. */ + if (!(flags & SWI_DELAY)) { + if (TD_AWAITING_INTR(td)) { + CTR2(KTR_INTR, + "swi_sched: schedule tid %d (%s)", + td->td_tid, td->td_name); + TD_CLR_IWAIT(td); + sched_add(td, SRQ_INTR); + } else { + CTR3(KTR_INTR, + "swi_sched: tid %d (%s): state %d", + td->td_tid, td->td_name, + td->td_state); + } + } + thread_unlock(td); + return; + case IS_QUEUED: + case IS_REQUEUE: + /* + * Do an atomic op to ensure it is in one of the + * queued states. If so, nothing else to do. + */ + if (atomic_cmpset_int(&ih->ih_state, state, + state)) { + PCPU_INC(cnt.v_soft); + return; + } + break; + case IS_RUNNING: + /* + * Try to change the state to requeue so that + * the interrupt thread will requeue the + * handler when it is finished executing. + */ + if (atomic_cmpset_int(&ih->ih_state, IS_RUNNING, + IS_REQUEUE)) + return; + break; + case IS_DEAD: + case IS_DYING: + /* + * If this happens, it is probably a bug in + * the calling code, but just ignore it. + */ + return; + } } } /* * Remove a software interrupt handler. Currently this code does not - * remove the associated interrupt event if it becomes empty. Calling code - * may do so manually via intr_event_destroy(), but that's not really - * an optimal interface. + * remove the associated interrupt event if it becomes empty. */ int swi_remove(void *cookie) { +#ifdef INVARIANTS + struct intr_handler *ih; + ih = cookie; + KASSERT(ih->ih_event->ie_flags & IE_SOFT, + ("swi_remove: hardware interrupt event")); +#endif return (intr_event_remove_handler(cookie)); } -#ifdef INTR_FILTER -static void -priv_ithread_execute_handler(struct proc *p, struct intr_handler *ih) +/* + * Executes a threaded interrupt handler. Returns true if the handler + * has been marked as dying. + */ +static __inline int +intr_handler_execute(struct intr_handler *ih, struct intr_event *ie, + struct thread *td) { - struct intr_event *ie; + int state; - ie = ih->ih_event; - /* - * If this handler is marked for death, remove it from - * the list of handlers and wake up the sleeper. - */ - if (ih->ih_flags & IH_DEAD) { - mtx_lock(&ie->ie_lock); - TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); - ih->ih_flags &= ~IH_DEAD; - wakeup(ih); - mtx_unlock(&ie->ie_lock); - return; + /* Transition state from queued to running. */ + for (;;) { + state = ih->ih_state; + switch (state) { + case IS_DYING: + return (DYING); + case IS_QUEUED: + /* Mark the handler as running. */ + if (atomic_cmpset_int(&ih->ih_state, IS_QUEUED, + IS_RUNNING)) { + /* XXXTEST */ + CTR1(KTR_INTR, "%s: IS_QUEUED -> IS_RUNNING", + ih->ih_name); + goto run; + } + break; +#ifdef INVARIANTS + default: + panic("bad pre-exec intr handler state %d", state); +#endif + } } - - /* Execute this handler. 
*/ - CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", - __func__, p->p_pid, (void *)ih->ih_handler, ih->ih_argument, - ih->ih_name, ih->ih_flags); - + +run: + CTR5(KTR_INTR, "intr_exec: tid %d exec %p(%p) for %s flg=%x", + td->td_tid, (void *)ih->ih_handler, ih->ih_argument, ih->ih_name, + ih->ih_flags); if (!(ih->ih_flags & IH_MPSAFE)) mtx_lock(&Giant); ih->ih_handler(ih->ih_argument); if (!(ih->ih_flags & IH_MPSAFE)) mtx_unlock(&Giant); -} + + /* Transition state from running back to idle. */ + for (;;) { + state = ih->ih_state; + switch (state) { + case IS_DYING: + return (1); + case IS_REQUEUE: + /* + * Try to set the state to queued. If that + * succeeds, requeue the handler. The return + * value tells the interrupt thread to requeue + * the handler. For hardware interrupts, + * there is no need to schedule a thread as + * this thread will reclaim the current CPU if + * there is not another associated thread + * already. + */ + if (atomic_cmpset_int(&ih->ih_state, IS_REQUEUE, + IS_QUEUED)) { + /* XXXTEST */ + CTR1(KTR_INTR, "%s: IS_REQUEUE -> IS_QUEUED", + ih->ih_name); + return (REQUEUE); + } + break; + case IS_RUNNING: + if (atomic_cmpset_int(&ih->ih_state, IS_RUNNING, + IS_IDLE)) { + /* XXXTEST */ + CTR1(KTR_INTR, "%s: IS_RUNNING -> IS_IDLE", + ih->ih_name); + return (FINISHED); + } + break; +#ifdef INVARIANTS + default: + panic("bad post-exec intr handler state %d", state); #endif - -/* - * This is a public function for use by drivers that mux interrupt - * handlers for child devices from their interrupt handler. - */ -void -intr_event_execute_handlers(struct proc *p, struct intr_event *ie) -{ - struct intr_handler *ih, *ihn; - - TAILQ_FOREACH_SAFE(ih, &ie->ie_handlers, ih_next, ihn) { - /* - * If this handler is marked for death, remove it from - * the list of handlers and wake up the sleeper. - */ - if (ih->ih_flags & IH_DEAD) { - mtx_lock(&ie->ie_lock); - TAILQ_REMOVE(&ie->ie_handlers, ih, ih_next); - ih->ih_flags &= ~IH_DEAD; - wakeup(ih); - mtx_unlock(&ie->ie_lock); - continue; } - - /* Skip filter only handlers */ - if (ih->ih_handler == NULL) - continue; - - /* - * For software interrupt threads, we only execute - * handlers that have their need flag set. Hardware - * interrupt threads always invoke all of their handlers. - */ - if (ie->ie_flags & IE_SOFT) { - if (!ih->ih_need) - continue; - else - atomic_store_rel_int(&ih->ih_need, 0); - } - - /* Execute this handler. */ - CTR6(KTR_INTR, "%s: pid %d exec %p(%p) for %s flg=%x", - __func__, p->p_pid, (void *)ih->ih_handler, - ih->ih_argument, ih->ih_name, ih->ih_flags); - - if (!(ih->ih_flags & IH_MPSAFE)) - mtx_lock(&Giant); - ih->ih_handler(ih->ih_argument); - if (!(ih->ih_flags & IH_MPSAFE)) - mtx_unlock(&Giant); } } +/* + * Main loop for software interrupt threads. Each software interrupt + * thread is bound to a specific software interrupt event and only + * executes handlers for that event. + */ static void -ithread_execute_handlers(struct proc *p, struct intr_event *ie) +swi_loop(void *arg) { + struct intr_software *isw; + struct intr_handler *ih; + struct intr_thread *it; + struct thread *td; + int state; - /* Interrupt handlers should not sleep. 
*/ - if (!(ie->ie_flags & IE_SOFT)) - THREAD_NO_SLEEPING(); - intr_event_execute_handlers(p, ie); - if (!(ie->ie_flags & IE_SOFT)) - THREAD_SLEEPING_OK(); + td = curthread; + isw = arg; + it = td->td_ithread; + KASSERT(it->it_thread == td, ("swi_loop: ithread linkage out of sync")); + KASSERT(it == isw->isw_thread, + ("swi_loop: intr_sofware linkage out of sync")); /* - * Interrupt storm handling: - * - * If this interrupt source is currently storming, then throttle - * it to only fire the handler once per clock tick. - * - * If this interrupt source is not currently storming, but the - * number of back to back interrupts exceeds the storm threshold, - * then enter storming mode. + * Execute handlers queued on the active list. If there are + * no handlers, block waiting for more handlers. */ - if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && - !(ie->ie_flags & IE_SOFT)) { - /* Report the message only once every second. */ - if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { - printf( - "interrupt storm detected on \"%s\"; throttling interrupt source\n", - ie->ie_name); + thread_lock(td); + while (!(it->it_flags & IT_DEAD)) { + /* Remove active handlers and execute them. */ + while ((ih = STAILQ_FIRST(&isw->isw_active)) != NULL) { + STAILQ_REMOVE_HEAD(&isw->isw_active, ih_queued); + thread_unlock(td); + state = intr_handler_execute(ih, &isw->isw_event, td); + if (state == DYING) + intr_handler_ack_dying(&isw->isw_event, ih); + + WITNESS_WARN(WARN_PANIC, NULL, "finished swi"); + mtx_assert(&Giant, MA_NOTOWNED); + thread_lock(td); + if (state == REQUEUE) + STAILQ_INSERT_TAIL(&isw->isw_active, ih, + ih_queued); + } + + /* Block waiting for more work. */ + if (!(it->it_flags & IT_DEAD)) { + TD_SET_IWAIT(td); + mi_switch(SW_VOL | SWT_IWAIT, NULL); } - pause("istorm", 1); - } else - ie->ie_count++; + } + thread_unlock(td); - /* - * Now that all the handlers have had a chance to run, reenable - * the interrupt source. - */ - if (ie->ie_post_ithread != NULL) - ie->ie_post_ithread(ie->ie_source); + CTR2(KTR_INTR, "swi_loop: tid %d (%s) exiting", td->td_tid, + td->td_name); + free(it, M_ITHREAD); + mtx_destroy(&isw->isw_event.ie_lock); + free(isw, M_ITHREAD); + kthread_exit(); } -#ifndef INTR_FILTER /* - * This is the main code for interrupt threads. + * Main loop for hardware interrupt threads. Each thread is pinned to + * a specific CPU when it executes and drains handlers for that CPU + * until there are no active handlers left. */ static void -ithread_loop(void *arg) +hwi_loop(void *arg) { - struct intr_thread *ithd; + struct intr_hardware *ihw; + struct intr_handler *ih; + struct intr_thread *it; struct intr_event *ie; struct thread *td; - struct proc *p; + int cpuid, state; + + td = curthread; + it = td->td_ithread; + KASSERT(it->it_thread == td, ("hwi_loop: ithread linkage out of sync")); - td = curthread; - p = td->td_proc; - ithd = (struct intr_thread *)arg; - KASSERT(ithd->it_thread == td, - ("%s: ithread and proc linkage out of sync", __func__)); - ie = ithd->it_event; - ie->ie_count = 0; + /* Hardware interrupt handlers should not sleep. */ + THREAD_NO_SLEEPING(); /* - * As long as we have interrupts outstanding, go through the - * list of handlers, giving each one a go at it. + * Execute handlers queued on this CPU's active list. If there are + * no handlers, block waiting for more handlers. */ - for (;;) { - /* - * If we are an orphaned thread, then just die. 
- */ - if (ithd->it_flags & IT_DEAD) { - CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, - p->p_pid, td->td_name); - free(ithd, M_ITHREAD); - kthread_exit(); - } + thread_lock(td); + while (!(it->it_flags & IT_DEAD)) { + spinlock_enter(); + thread_unlock(td); + + /* Remove active handlers and execute them. */ + while ((ih = STAILQ_FIRST(PCPU_PTR(hwi_active))) != NULL) { + STAILQ_REMOVE_HEAD(PCPU_PTR(hwi_active), ih_queued); + spinlock_exit(); + ie = ih->ih_event; + ihw = (struct intr_hardware *)ie; +#ifdef INVARIANTS + it->it_current = ih; +#endif + state = intr_handler_execute(ih, ie, td); +#ifdef INVARIANTS + it->it_current = NULL; +#endif + + /* + * See if we need to invoke the 'post_ithread' + * hook for this event. Skip this if requeueing + * the handler. + * + * XXX: Need to detect and handle interrupt + * storms here somehow. + * + * XXX: If a filter returns just + * FILTER_SCHEDULE_THREAD, then this will + * break. + */ + if (state != REQUEUE && ih->ih_filter == NULL && + ihw->ihw_post_ithread != NULL && + atomic_fetchadd_int(&ihw->ihw_queued, -1) == 1) { + CTR1(KTR_INTR, "hwi_loop: post_ithread for %s", + ie->ie_name); + ihw->ihw_post_ithread(ihw->ihw_source); + } + + if (state == DYING) + intr_handler_ack_dying(ie, ih); + + WITNESS_WARN(WARN_PANIC, NULL, "finished hwi"); + mtx_assert(&Giant, MA_NOTOWNED); + spinlock_enter(); + + if (state == REQUEUE) + STAILQ_INSERT_TAIL(PCPU_PTR(hwi_active), ih, + ih_queued); - /* - * Service interrupts. If another interrupt arrives while - * we are running, it will set it_need to note that we - * should make another pass. - */ - while (ithd->it_need) { /* - * This might need a full read and write barrier - * to make sure that this write posts before any - * of the memory or device accesses in the - * handlers. + * If the handler blocked on a lock, then this + * thread is no longer tied to this CPU. If + * this CPU does not have an active interrupt, + * then reclaim this CPU. Otherwise, fall out + * of the loop and let the other active thread + * for this CPU process any queued handlers. */ - atomic_store_rel_int(&ithd->it_need, 0); - ithread_execute_handlers(p, ie); + if (PCPU_GET(hwi_thread) != td) { + if (PCPU_GET(hwi_thread) == NULL) { + CTR3(KTR_INTR, + "hwi_loop: tid %d (%s) reclaiming CPU %d", + td->td_tid, td->td_name, + PCPU_GET(cpuid)); + PCPU_SET(hwi_thread, td); + } else + break; + } } - WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); - mtx_assert(&Giant, MA_NOTOWNED); + spinlock_exit(); - /* - * Processed all our interrupts. Now get the sched - * lock. This may take a while and it_need may get - * set again, so we have to check it again. - */ + /* Block waiting for more work. */ thread_lock(td); - if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { + if (!(it->it_flags & IT_DEAD) && (PCPU_GET(hwi_thread) != td || + STAILQ_EMPTY(PCPU_PTR(hwi_active)))) { + + /* Disassociate from the current CPU. */ + sched_unbind(td); + mtx_lock_spin(&hwi_thread_lock); + if (PCPU_GET(hwi_thread) == td) { + CTR3(KTR_INTR, + "hwi_loop: tid %d (%s) freeing CPU %d", + td->td_tid, td->td_name, PCPU_GET(cpuid)); + PCPU_SET(hwi_thread, NULL); + } + + /* Handle queued handlers for another CPU if needed. */ + if (hwi_check_cpus) { + cpuid = hwi_pending_cpu(it); + if (cpuid != NOCPU) { + mtx_unlock_spin(&hwi_thread_lock); + sched_bind(td, cpuid); + continue; + } + } + + /* Put this thread on the idle list and block. 
*/ + thread_lock_set(td, &hwi_thread_lock); + TAILQ_INSERT_HEAD(&hwi_threads, it, it_list); TD_SET_IWAIT(td); - ie->ie_count = 0; mi_switch(SW_VOL | SWT_IWAIT, NULL); } + } + thread_unlock(td); + + CTR2(KTR_INTR, "hwi_loop: tid %d (%s) exiting", td->td_tid, + td->td_name); + THREAD_SLEEPING_OK(); + free(it, M_ITHREAD); + kthread_exit(); +} + +/* + * Allocate a new hwi thread for the current CPU. Must be called with + * the hwi_thread_lock held and will return with it dropped. + */ +static void +hwi_alloc_thread(void) +{ + struct intr_thread *it; + struct thread *td; + + mtx_assert(&hwi_thread_lock, MA_OWNED); + + /* Try to grab a free thread. */ + it = TAILQ_FIRST(&hwi_threads); + if (it != NULL) { + /* + * Claim this thread. Bind it to this CPU while it + * drains interrupt handlers. Even though this + * thread's per-thread lock should be hwi_thread_lock + * and thus already held, grab it again via + * thread_lock() to force this code to wait if another + * CPU is switching away from this thread and thus + * td_lock is actually the blocked lock. + */ + TAILQ_REMOVE(&hwi_threads, it, it_list); + td = it->it_thread; + PCPU_SET(hwi_thread, td); + thread_lock(td); + THREAD_LOCKPTR_ASSERT(td, &hwi_thread_lock); + mtx_unlock_spin(&hwi_thread_lock); + sched_bind_ithd(td, PCPU_GET(cpuid)); + KASSERT(TD_AWAITING_INTR(td), ("free hwi thread not idle")); + CTR3(KTR_INTR, + "hwi_alloc_thread: schedule tid %d (%s) for CPU %d", + td->td_tid, td->td_name, PCPU_GET(cpuid)); + TD_CLR_IWAIT(td); + sched_add(td, SRQ_INTR); thread_unlock(td); + } else { + hwi_check_cpus = 1; + if (!hwi_thread_warn) { + hwi_thread_warn = 1; + mtx_unlock(&hwi_thread_lock); + printf("Exhausted hardware interrupt thread pool, " + "increase kern.intr.thread_count\n"); + } else + mtx_unlock(&hwi_thread_lock); } } /* - * Main interrupt handling body. - * - * Input: - * o ie: the event connected to this interrupt. - * o frame: some archs (i.e. i386) pass a frame to some. - * handlers as their main argument. - * Return value: - * o 0: everything ok. - * o EINVAL: stray interrupt. + * Entry point for MD code to call to handle a hardware interrupt. + * The trapframe is passed as the argument to any filter handlers that + * specify NULL as their argument. */ int -intr_event_handle(struct intr_event *ie, struct trapframe *frame) +hwi_handle(struct intr_event *ie, struct trapframe *frame) { + struct intr_entropy entropy; + struct intr_handler_list *active; + struct intr_hardware *ihw; struct intr_handler *ih; struct trapframe *oldframe; - struct thread *td; - int error, ret, thread; + struct thread *td, *ctd; + void *arg; + int nonfilter, ret, state, thread; - td = curthread; - /* An interrupt with no event or handlers is a stray interrupt. */ if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) return (EINVAL); + td = curthread; + td->td_intr_nesting_level++; + oldframe = td->td_intr_frame; + td->td_intr_frame = frame; + critical_enter(); + + /* + * Lock the 'run' lock on the interrupt event while we run filters + * and queue threaded handlers. Unlike top-half code, we do not + * need to disable interrupts, so just spin on the lock token. + */ + while (!atomic_cmpset_acq_int(&ie->ie_running, 0, 1)) { + while (ie->ie_running) + cpu_spinwait(); + } + /* - * Execute fast interrupt handlers directly. + * Execute filter interrupt handlers directly. * To support clock handlers, if a handler registers * with a NULL argument, then we pass it a pointer to * a trapframe as its argument. 
*/ - td->td_intr_nesting_level++; thread = 0; - ret = 0; - critical_enter(); - oldframe = td->td_intr_frame; - td->td_intr_frame = frame; + nonfilter = 0; + active = PCPU_PTR(hwi_active); + ihw = (struct intr_hardware *)ie; TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { - if (ih->ih_filter == NULL) { - thread = 1; + /* If we have a filter, run it first. */ + if (ih->ih_filter != NULL) { + arg = ih->ih_argument; + if (arg == NULL) + arg = frame; + CTR3(KTR_INTR, "hwi_handle: exec %p(%p) for %s", + ih->ih_filter, arg, ih->ih_name); + ret = ih->ih_filter(arg); + CTR1(KTR_INTR, "hwi_handle: filter returned %#x", ret); + KASSERT(ret == FILTER_STRAY || ret == FILTER_HANDLED || + ret == (FILTER_HANDLED | FILTER_SCHEDULE_THREAD) || + ret == FILTER_SCHEDULE_THREAD, + ("incorrect filter return value %#x from %s", ret, + ih->ih_name)); + } else + ret = FILTER_SCHEDULE_THREAD; + + /* + * If no need to schedule threaded handler, nothing + * left to do for this handler. + */ + if (!(ret & FILTER_SCHEDULE_THREAD)) continue; - } - CTR4(KTR_INTR, "%s: exec %p(%p) for %s", __func__, - ih->ih_filter, ih->ih_argument == NULL ? frame : - ih->ih_argument, ih->ih_name); - if (ih->ih_argument == NULL) - ret = ih->ih_filter(frame); - else - ret = ih->ih_filter(ih->ih_argument); - KASSERT(ret == FILTER_STRAY || - ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && - (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), - ("%s: incorrect return value %#x from %s", __func__, ret, - ih->ih_name)); + + /* Place this interrupt handler on this CPU's queue. */ + for (;;) { + state = ih->ih_state; + switch (state) { + case IS_IDLE: + /* Try to change the state to queued. */ + if (!atomic_cmpset_int(&ih->ih_state, IS_IDLE, + IS_QUEUED)) + break; + + /* XXXTEST */ + CTR1(KTR_INTR, "%s: IS_IDLE -> IS_QUEUED", + ih->ih_name); - /* - * Wrapper handler special handling: - * - * in some particular cases (like pccard and pccbb), - * the _real_ device handler is wrapped in a couple of - * functions - a filter wrapper and an ithread wrapper. - * In this case (and just in this case), the filter wrapper - * could ask the system to schedule the ithread and mask - * the interrupt source if the wrapped handler is composed - * of just an ithread handler. - * - * TODO: write a generic wrapper to avoid people rolling - * their own - */ - if (!thread) { - if (ret == FILTER_SCHEDULE_THREAD) + /* + * Queue the handler. If this + * handler's filter did not handle the + * interrupt, note that we need to use + * the 'pre_ithread' and + * 'post_ithread' hooks. + */ + if (!(ret & FILTER_HANDLED)) { + nonfilter = 1; + atomic_add_int(&ihw->ihw_queued, 1); + } thread = 1; + CTR2(KTR_INTR, + "hwi_handle: scheduled %s for %s", + ih->ih_name, ie->ie_name); + STAILQ_INSERT_TAIL(active, ih, ih_queued); + goto next; + case IS_QUEUED: + case IS_REQUEUE: + /* Ensure it is truly still queued. */ + if (atomic_cmpset_int(&ih->ih_state, state, + state)) { + /* XXXTEST */ + CTR3(KTR_INTR, "%s: %s -> %s", + ih->ih_name, state == IS_QUEUED ? + "IS_QUEUED" : "IS_REQUEUE", + state == IS_QUEUED ? "IS_QUEUED" : + "IS_REQUEUE"); + goto next; + } + break; + case IS_RUNNING: + /* Try to change the state to requeue. */ + if (atomic_cmpset_int(&ih->ih_state, IS_RUNNING, + IS_REQUEUE)) { + /* XXXTEST */ + CTR1(KTR_INTR, + "%s: IS_RUNNING -> IS_REQUEUE", + ih->ih_name); + goto next; + } + break; +#ifdef INVARIANTS + default: + /* + * The dying/dead states should not + * happen. 
They are only set while + * holding the run lock and once they + * are set the event is removed from + * the interrupt event's handler + * list. + */ + panic("hwi_handle: bad state %d", state); +#endif + } } + next:; } - td->td_intr_frame = oldframe; + + /* Drop the 'run' lock. */ + atomic_store_rel_int(&ie->ie_running, 0); - if (thread) { - if (ie->ie_pre_ithread != NULL) - ie->ie_pre_ithread(ie->ie_source); + /* + * If handlers without filters were queued, invoke the + * 'pre_ithread' hook, otherwise invoke the 'post_filter' + * hook. + */ + if (nonfilter) { + if (ihw->ihw_pre_ithread != NULL) { + CTR1(KTR_INTR, "hwi_handle: pre_ithread for %s", + ie->ie_name); + ihw->ihw_pre_ithread(ihw->ihw_source); + } } else { - if (ie->ie_post_filter != NULL) - ie->ie_post_filter(ie->ie_source); + if (ihw->ihw_post_filter != NULL) { + CTR1(KTR_INTR, "hwi_handle: post_filter for %s", + ie->ie_name); + ihw->ihw_post_filter(ihw->ihw_source); + } + } + + /* + * If any of the handlers for this event claim to be good + * sources of entropy, then gather some. + */ + if (harvest.interrupt && ie->ie_flags & IE_ENTROPY) { + ctd = curthread; + CTR2(KTR_INTR, "hwi_handle: tid %d (%s) gathering entropy", + ctd->td_tid, ctd->td_name); + entropy.event = (uintptr_t)ie; + entropy.td = ctd; + random_harvest(&entropy, sizeof(entropy), 2, 0, + RANDOM_INTERRUPT); } - - /* Schedule the ithread if needed. */ - if (thread) { - error = intr_event_schedule_thread(ie); -#ifndef XEN - KASSERT(error == 0, ("bad stray interrupt")); -#else - if (error != 0) - log(LOG_WARNING, "bad stray interrupt"); -#endif + + /* + * If this CPU doesn't have an active interrupt thread, + * schedule a new one. + */ + if (thread && PCPU_GET(hwi_thread) == NULL) { + mtx_lock_spin(&hwi_thread_lock); + hwi_alloc_thread(); } critical_exit(); + td->td_intr_frame = oldframe; td->td_intr_nesting_level--; return (0); } -#else + /* - * This is the main code for interrupt threads. + * Allow a hardware interrupt handler to be manually scheduled on the + * current CPU's queue. This can be used either to schedule manual + * interrupt handlers from a filter or handler or to reschedule the + * currently executing handler. Much of the logic is copied from + * hwi_handle(). */ -static void -ithread_loop(void *arg) +void +hwi_sched(void *cookie) { - struct intr_thread *ithd; + struct intr_hardware *ihw; struct intr_handler *ih; struct intr_event *ie; - struct thread *td; - struct proc *p; - int priv; + int state; - td = curthread; - p = td->td_proc; - ih = (struct intr_handler *)arg; - priv = (ih->ih_thread != NULL) ? 1 : 0; - ithd = (priv) ? ih->ih_thread : ih->ih_event->ie_thread; - KASSERT(ithd->it_thread == td, - ("%s: ithread and proc linkage out of sync", __func__)); - ie = ithd->it_event; - ie->ie_count = 0; + ih = cookie; + ie = ih->ih_event; + ihw = (struct intr_hardware *)ie; + KASSERT((curthread->td_pflags & TDP_ITHREAD) || + curthread->td_intr_nesting_level > 0, + ("hwi_sched: invalid calling thread context")); + KASSERT((ih->ih_flags & IH_MANUAL) || + ((curthread->td_pflags & TDP_ITHREAD) && + curthread->td_ithread->it_current == ih), + ("hwi_sched: attempt to schedule invalid handler")); + KASSERT(!(ie->ie_flags & IE_SOFT), ("hwi_sched: swi event")); - /* - * As long as we have interrupts outstanding, go through the - * list of handlers, giving each one a go at it. - */ + /* Place this interrupt handler on this CPU's queue. */ + spinlock_enter(); for (;;) { - /* - * If we are an orphaned thread, then just die. 
- */ - if (ithd->it_flags & IT_DEAD) { - CTR3(KTR_INTR, "%s: pid %d (%s) exiting", __func__, - p->p_pid, td->td_name); - free(ithd, M_ITHREAD); - kthread_exit(); - } + state = ih->ih_state; + switch (state) { + case IS_IDLE: + /* Try to change the state to queued. */ + if (!atomic_cmpset_int(&ih->ih_state, IS_IDLE, + IS_QUEUED)) + break; + + /* + * If requeueing the currently executing + * handler and it does not have a filter, bump + * the queued count to defer the + * 'post_ithread' hook. + */ + if (!(ih->ih_flags & IH_MANUAL) && + ih->ih_filter != NULL) + atomic_add_int(&ihw->ihw_queued, 1); - /* - * Service interrupts. If another interrupt arrives while - * we are running, it will set it_need to note that we - * should make another pass. - */ - while (ithd->it_need) { + /* Queue the handler. */ + CTR2(KTR_INTR, "hwi_sched: scheduled %s for %s", + ih->ih_name, ie->ie_name); + STAILQ_INSERT_TAIL(PCPU_PTR(hwi_active), ih, ih_queued); + goto queued; + case IS_QUEUED: + case IS_REQUEUE: + /* Ensure it is truly still queued. */ + if (atomic_cmpset_int(&ih->ih_state, state, state)) + goto queued; + break; + case IS_RUNNING: + /* Try to change the state to requeue. */ + if (atomic_cmpset_int(&ih->ih_state, IS_RUNNING, + IS_REQUEUE)) { + spinlock_exit(); + return; + } + break; + case IS_DYING: + /* + * This can happen if the currently executing + * handler is being removed by another thread. + * In that case, just ignore the reschedule + * attempt. The main loop of the hwi thread + * will ack the dying request once this + * handler finishes. + */ + KASSERT(curthread->td_ithread != NULL && + curthread->td_ithread->it_current == ih, + ("hwi_sched: dying handler is not current")); + return; +#ifdef INVARIANTS + default: /* - * This might need a full read and write barrier - * to make sure that this write posts before any - * of the memory or device accesses in the - * handlers. + * The dead state should not happen. The + * currently executing handler cannot be dead, + * only dying, and the owner of a manual + * handler is responsible for destroying any + * filters or handlers that can schedule that + * event before destroying the manual handler. */ - atomic_store_rel_int(&ithd->it_need, 0); - if (priv) - priv_ithread_execute_handler(p, ih); - else - ithread_execute_handlers(p, ie); + panic("hwi_sched: bad state %d", state); +#endif } - WITNESS_WARN(WARN_PANIC, NULL, "suspending ithread"); - mtx_assert(&Giant, MA_NOTOWNED); + } - /* - * Processed all our interrupts. Now get the sched - * lock. This may take a while and it_need may get - * set again, so we have to check it again. - */ - thread_lock(td); - if (!ithd->it_need && !(ithd->it_flags & IT_DEAD)) { - TD_SET_IWAIT(td); - ie->ie_count = 0; - mi_switch(SW_VOL | SWT_IWAIT, NULL); - } - thread_unlock(td); +queued: + /* + * If this CPU doesn't have an active interrupt thread, + * schedule a new one. + */ + if (PCPU_GET(hwi_thread) == NULL) { + mtx_lock_spin(&hwi_thread_lock); + hwi_alloc_thread(); } + spinlock_exit(); } -/* - * Main loop for interrupt filter. - * - * Some architectures (i386, amd64 and arm) require the optional frame - * parameter, and use it as the main argument for fast handler execution - * when ih_argument == NULL. - * - * Return value: - * o FILTER_STRAY: No filter recognized the event, and no - * filter-less handler is registered on this - * line. - * o FILTER_HANDLED: A filter claimed the event and served it. 
- * o FILTER_SCHEDULE_THREAD: No filter claimed the event, but there's at - * least one filter-less handler on this line. - * o FILTER_HANDLED | - * FILTER_SCHEDULE_THREAD: A filter claimed the event, and asked for - * scheduling the per-handler ithread. - * - * In case an ithread has to be scheduled, in *ithd there will be a - * pointer to a struct intr_thread containing the thread to be - * scheduled. - */ - -static int -intr_filter_loop(struct intr_event *ie, struct trapframe *frame, - struct intr_thread **ithd) +/* Called when an interrupt thread blocks on a turnstile waiting for a lock. */ +void +intr_thread_block(struct thread *td) { - struct intr_handler *ih; - void *arg; - int ret, thread_only; - ret = 0; - thread_only = 0; - TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { - /* - * Execute fast interrupt handlers directly. - * To support clock handlers, if a handler registers - * with a NULL argument, then we pass it a pointer to - * a trapframe as its argument. - */ - arg = ((ih->ih_argument == NULL) ? frame : ih->ih_argument); - - CTR5(KTR_INTR, "%s: exec %p/%p(%p) for %s", __func__, - ih->ih_filter, ih->ih_handler, arg, ih->ih_name); + /* + * Only allocate a new thread for this CPU if the currently + * active hwi thread blocks. + */ + if (PCPU_GET(hwi_thread) != td) + return; - if (ih->ih_filter != NULL) - ret = ih->ih_filter(arg); - else { - thread_only = 1; - continue; - } - KASSERT(ret == FILTER_STRAY || - ((ret & (FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) != 0 && - (ret & ~(FILTER_SCHEDULE_THREAD | FILTER_HANDLED)) == 0), - ("%s: incorrect return value %#x from %s", __func__, ret, - ih->ih_name)); - if (ret & FILTER_STRAY) - continue; - else { - *ithd = ih->ih_thread; - return (ret); - } - } + /* Mark this CPU as not having an active hwi thread. */ + mtx_lock_spin(&hwi_thread_lock); + PCPU_SET(hwi_thread, NULL); + CTR3(KTR_INTR, "intr_thread_block: tid %d (%s) freeing CPU %d", + td->td_tid, td->td_name, PCPU_GET(cpuid)); /* - * No filters handled the interrupt and we have at least - * one handler without a filter. In this case, we schedule - * all of the filter-less handlers to run in the ithread. - */ - if (thread_only) { - *ithd = ie->ie_thread; - return (FILTER_SCHEDULE_THREAD); - } - return (FILTER_STRAY); + * If there are any queued handlers, allocate a new hwi thread + * for this CPU. + */ + if (!STAILQ_EMPTY(PCPU_PTR(hwi_active))) + hwi_alloc_thread(); + else + mtx_unlock_spin(&hwi_thread_lock); } -/* - * Main interrupt handling body. - * - * Input: - * o ie: the event connected to this interrupt. - * o frame: some archs (i.e. i386) pass a frame to some. - * handlers as their main argument. - * Return value: - * o 0: everything ok. - * o EINVAL: stray interrupt. 
- */ -int -intr_event_handle(struct intr_event *ie, struct trapframe *frame) +#ifdef old +static void +ithread_execute_handlers(struct proc *p, struct intr_event *ie) { - struct intr_thread *ithd; - struct trapframe *oldframe; - struct thread *td; - int thread; - ithd = NULL; - td = curthread; - - if (ie == NULL || TAILQ_EMPTY(&ie->ie_handlers)) - return (EINVAL); - - td->td_intr_nesting_level++; - thread = 0; - critical_enter(); - oldframe = td->td_intr_frame; - td->td_intr_frame = frame; - thread = intr_filter_loop(ie, frame, &ithd); - if (thread & FILTER_HANDLED) { - if (ie->ie_post_filter != NULL) - ie->ie_post_filter(ie->ie_source); - } else { - if (ie->ie_pre_ithread != NULL) - ie->ie_pre_ithread(ie->ie_source); - } - td->td_intr_frame = oldframe; - critical_exit(); - - /* Interrupt storm logic */ - if (thread & FILTER_STRAY) { + /* + * Interrupt storm handling: + * + * If this interrupt source is currently storming, then throttle + * it to only fire the handler once per clock tick. + * + * If this interrupt source is not currently storming, but the + * number of back to back interrupts exceeds the storm threshold, + * then enter storming mode. + */ + if (intr_storm_threshold != 0 && ie->ie_count >= intr_storm_threshold && + !(ie->ie_flags & IE_SOFT)) { + /* Report the message only once every second. */ + if (ppsratecheck(&ie->ie_warntm, &ie->ie_warncnt, 1)) { + printf( + "interrupt storm detected on \"%s\"; throttling interrupt source\n", + ie->ie_name); + } + pause("istorm", 1); + } else ie->ie_count++; - if (ie->ie_count < intr_storm_threshold) - printf("Interrupt stray detection not present\n"); - } - /* Schedule an ithread if needed. */ - if (thread & FILTER_SCHEDULE_THREAD) { - if (intr_event_schedule_thread(ie, ithd) != 0) - panic("%s: impossible stray interrupt", __func__); - } - td->td_intr_nesting_level--; - return (0); } #endif @@ -1643,8 +1879,8 @@ /* * Dump details about an interrupt handler */ -static void -db_dump_intrhand(struct intr_handler *ih) +void +db_dump_intrhand(struct intr_handler *ih, int display_event) { int comma; @@ -1681,11 +1917,15 @@ break; } db_printf(" "); + if (display_event) + db_printf("(%s) ", ih->ih_event->ie_name); + if (ih->ih_filter != NULL) { + db_printsym((uintptr_t)ih->ih_filter, DB_STGY_PROC); + db_printf(","); + } db_printsym((uintptr_t)ih->ih_handler, DB_STGY_PROC); db_printf("(%p)", ih->ih_argument); - if (ih->ih_need || - (ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_DEAD | - IH_MPSAFE)) != 0) { + if ((ih->ih_flags & (IH_EXCLUSIVE | IH_ENTROPY | IH_MPSAFE)) != 0) { db_printf(" {"); comma = 0; if (ih->ih_flags & IH_EXCLUSIVE) { @@ -1700,25 +1940,37 @@ db_printf("ENTROPY"); comma = 1; } - if (ih->ih_flags & IH_DEAD) { - if (comma) - db_printf(", "); - db_printf("DEAD"); - comma = 1; - } if (ih->ih_flags & IH_MPSAFE) { if (comma) db_printf(", "); db_printf("MPSAFE"); - comma = 1; } - if (ih->ih_need) { - if (comma) - db_printf(", "); - db_printf("NEED"); - } db_printf("}"); } + db_printf(" "); + switch (ih->ih_state) { + case IS_IDLE: + db_printf("IDLE"); + break; + case IS_QUEUED: + db_printf("QUEUED"); + break; + case IS_RUNNING: + db_printf("RUNNING"); + break; + case IS_REQUEUE: + db_printf("REQUEUE"); + break; + case IS_DYING: + db_printf("DYING"); + break; + case IS_DEAD: + db_printf("DEAD"); + break; + default: + db_printf("0x%x", ih->ih_state); + break; + } db_printf("\n"); } @@ -1728,18 +1980,20 @@ void db_dump_intr_event(struct intr_event *ie, int handlers) { + struct intr_software *isw; + struct intr_hardware *ihw; struct 
intr_handler *ih; - struct intr_thread *it; int comma; db_printf("%s ", ie->ie_fullname); - it = ie->ie_thread; - if (it != NULL) - db_printf("(pid %d)", it->it_thread->td_proc->p_pid); - else - db_printf("(no thread)"); - if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY | IE_ADDING_THREAD)) != 0 || - (it != NULL && it->it_need)) { + if (ie->ie_flags & IE_SOFT) { + isw = (struct intr_software *)ie; + db_printf("(tid %d)", isw->isw_thread->it_thread->td_tid); + } else { + ihw = (struct intr_hardware *)ie; + db_printf("IRQ %d queued %d", ihw->ihw_irq, ihw->ihw_queued); + } + if ((ie->ie_flags & (IE_SOFT | IE_ENTROPY)) != 0) { db_printf(" {"); comma = 0; if (ie->ie_flags & IE_SOFT) { @@ -1750,26 +2004,15 @@ if (comma) db_printf(", "); db_printf("ENTROPY"); - comma = 1; } - if (ie->ie_flags & IE_ADDING_THREAD) { - if (comma) - db_printf(", "); - db_printf("ADDING_THREAD"); - comma = 1; - } - if (it != NULL && it->it_need) { - if (comma) - db_printf(", "); - db_printf("NEED"); - } db_printf("}"); } db_printf("\n"); if (handlers) - TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) - db_dump_intrhand(ih); + TAILQ_FOREACH(ih, &ie->ie_handlers, ih_next) { + db_dump_intrhand(ih, 0); + } } /* @@ -1777,15 +2020,23 @@ */ DB_SHOW_COMMAND(intr, db_show_intr) { - struct intr_event *ie; + struct intr_software *isw; + struct intr_hardware *ihw; int all, verbose; verbose = index(modif, 'v') != NULL; all = index(modif, 'a') != NULL; - TAILQ_FOREACH(ie, &event_list, ie_list) { - if (!all && TAILQ_EMPTY(&ie->ie_handlers)) + TAILQ_FOREACH(ihw, &hwi_event_list, ihw_list) { + if (!all && TAILQ_EMPTY(&ihw->ihw_event.ie_handlers)) + continue; + db_dump_intr_event(&ihw->ihw_event, verbose); + if (db_pager_quit) + break; + } + TAILQ_FOREACH(isw, &swi_event_list, isw_list) { + if (!all && TAILQ_EMPTY(&isw->isw_event.ie_handlers)) continue; - db_dump_intr_event(ie, verbose); + db_dump_intr_event(&isw->isw_event, verbose); if (db_pager_quit) break; } @@ -1793,7 +2044,7 @@ #endif /* DDB */ /* - * Start standard software interrupt threads + * Start standard software interrupt threads. */ static void start_softintr(void *dummy) --- //depot/projects/smpng/sys/kern/sched_4bsd.c 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/kern/sched_4bsd.c 2010-06-25 21:44:42.000000000 0000 @@ -1476,6 +1476,21 @@ } void +sched_bind_ithd(struct thread *td, int cpu) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); + KASSERT(TD_AWAITING_INTR(td), ("sched_bind_ithd: td is not waiting")); + ts = td->td_sched; + KASSERT(!(td->td_flags & TDF_BOUND), ("sched_bind_ithd: td is bound")); + td->td_flags |= TDF_BOUND; +#ifdef SMP + ts->ts_runq = &runq_pcpu[cpu]; +#endif +} + +void sched_unbind(struct thread* td) { THREAD_LOCK_ASSERT(td, MA_OWNED); --- //depot/projects/smpng/sys/kern/sched_ule.c 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/kern/sched_ule.c 2010-06-25 21:44:42.000000000 0000 @@ -2438,6 +2438,23 @@ } /* + * Bind a waiting interrupt thread to a target cpu. + */ +void +sched_bind_ithd(struct thread *td, int cpu) +{ + struct td_sched *ts; + + THREAD_LOCK_ASSERT(td, MA_OWNED|MA_NOTRECURSED); + KASSERT(TD_AWAITING_INTR(td), ("sched_bind_ithd: td is not waiting")); + ts = td->td_sched; + KASSERT(!(ts->ts_flags & TSF_BOUND), ("sched_bind_ithd: td is bound")); + ts->ts_flags |= TSF_BOUND; + td->td_pinned = 1; + ts->ts_cpu = cpu; +} + +/* * Release a bound thread. 
*/ void --- //depot/projects/smpng/sys/kern/subr_pcpu.c 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/kern/subr_pcpu.c 2010-06-25 21:44:42.000000000 0000 @@ -52,7 +52,8 @@ #include #include -#include +#include +#include #include #include #include @@ -60,6 +61,7 @@ #include #include #include +#include #include MALLOC_DEFINE(M_PCPU, "Per-cpu", "Per-cpu resource accouting."); @@ -92,6 +94,7 @@ cpuid_to_pcpu[cpuid] = pcpu; SLIST_INSERT_HEAD(&cpuhead, pcpu, pc_allcpu); cpu_pcpu_init(pcpu, cpuid, size); + STAILQ_INIT(&pcpu->pc_hwi_active); pcpu->pc_rm_queue.rmq_next = &pcpu->pc_rm_queue; pcpu->pc_rm_queue.rmq_prev = &pcpu->pc_rm_queue; #ifdef KTR @@ -327,6 +330,7 @@ static void show_pcpu(struct pcpu *pc) { + struct intr_handler *ih; struct thread *td; db_printf("cpuid = %d\n", pc->pc_cpuid); @@ -352,12 +356,23 @@ db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name); else db_printf("none\n"); + db_printf("hwi_thread = "); + td = pc->pc_hwi_thread; + if (td != NULL) + db_printf("%p: tid %d \"%s\"\n", td, td->td_tid, td->td_name); + else + db_printf("none\n"); db_show_mdpcpu(pc); #ifdef VIMAGE db_printf("curvnet = %p\n", pc->pc_curthread->td_vnet); #endif + db_printf("queued interrupt handlers:\n"); + STAILQ_FOREACH(ih, &pc->pc_hwi_active, ih_queued) { + db_dump_intrhand(ih, 1); + } + #ifdef WITNESS db_printf("spin locks held:\n"); witness_list_locks(&pc->pc_spinlocks, db_printf); --- //depot/projects/smpng/sys/kern/subr_turnstile.c 2010-03-10 22:33:24.000000000 0000 +++ //depot/user/jhb/intr/kern/subr_turnstile.c 2010-05-20 15:49:03.000000000 0000 @@ -65,6 +65,8 @@ #include #include +#include +#include #include #include #include @@ -679,6 +681,10 @@ MPASS(owner->td_proc->p_magic == P_MAGIC); MPASS(queue == TS_SHARED_QUEUE || queue == TS_EXCLUSIVE_QUEUE); + /* Handle special case when an interrupt thread blocks. */ + if (td->td_pflags & TDP_ITHREAD) + intr_thread_block(td); + /* * If the lock does not already have a turnstile, use this thread's * turnstile. Otherwise insert the current thread into the --- //depot/projects/smpng/sys/sys/bus.h 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/sys/bus.h 2010-06-25 21:44:42.000000000 0000 @@ -194,7 +194,8 @@ INTR_FAST = 128, INTR_EXCL = 256, /* exclusive interrupt */ INTR_MPSAFE = 512, /* this interrupt is SMP safe */ - INTR_ENTROPY = 1024 /* this interrupt provides entropy */ + INTR_ENTROPY = 1024, /* this interrupt provides entropy */ + INTR_MANUAL = 2048 /* only scheduled via hwi_sched() */ }; enum intr_trigger { --- //depot/projects/smpng/sys/sys/interrupt.h 2009-12-16 22:22:28.000000000 0000 +++ //depot/user/jhb/intr/sys/interrupt.h 2010-06-07 21:31:16.000000000 0000 @@ -37,50 +37,123 @@ struct trapframe; /* + * Interrupt handlers are scheduled for execution across a pool of + * interrupt threads. Each CPU maintains a per-CPU list of scheduled + * interrupt handlers. When an interrupt occurs, its handler is added + * to the per-CPU list of handlers. If the list is empty, then an + * interrupt thread is scheduled. This thread is pinned to a specific + * CPU and will continue to execute until it has drained all of the + * handlers for its assigned CPU. If an interrupt thread blocks on a + * lock and other interrupt handlers are queued for the assigned CPU, + * then a new interrupt thread is scheduled to execute those handlers + * if a thread is available. + * + * Pinning all of the work to a per-CPU list with pinned threads + * allows the list of handlers to be maintained with minimal locking + * overhead. 
The simplest implementation is for the code executing in + * an interrupt thread context to disable interrupts when examining + * the per-CPU list. + * + * An additional wrinkle is needed to handle shared interrupt handlers + * that do not use filters. For this case the interrupt event needs + * to not be enabled until all of the handlers for that event have + * executed. To handle this, each hardware interrupt holds a count of + * non-filter events queued for execution. Once these handlers are + * drained, the 'post_ithread' hook is invoked. + * + * XXX: Not sure how to handle interrupt storms in this mode. + * + * Software interrupt events are still assigned a dedicated interrupt + * thread. A list of scheduled handlers is maintained in the software + * event itself. + */ + +/* * Describe a hardware interrupt handler. * * Multiple interrupt handlers for a specific event can be chained * together. */ struct intr_handler { - driver_filter_t *ih_filter; /* Filter handler function. */ + driver_filter_t *ih_filter; /* Filter handler function. */ driver_intr_t *ih_handler; /* Threaded handler function. */ void *ih_argument; /* Argument to pass to handlers. */ int ih_flags; + volatile int ih_state; /* IS_* state. */ char ih_name[MAXCOMLEN + 1]; /* Name of handler. */ struct intr_event *ih_event; /* Event we are connected to. */ - int ih_need; /* Needs service. */ TAILQ_ENTRY(intr_handler) ih_next; /* Next handler for this event. */ u_char ih_pri; /* Priority of this handler. */ - struct intr_thread *ih_thread; /* Ithread for filtered handler. */ + STAILQ_ENTRY(intr_handler) ih_queued; /* Links for active list. */ }; /* Interrupt handle flags kept in ih_flags */ +#define IH_MANUAL 0x00000001 /* Manually scheduled via hwi_sched(). */ #define IH_EXCLUSIVE 0x00000002 /* Exclusive interrupt. */ #define IH_ENTROPY 0x00000004 /* Device is a good entropy source. */ -#define IH_DEAD 0x00000008 /* Handler should be removed. */ #define IH_MPSAFE 0x80000000 /* Handler does not need Giant. */ /* + * Interrupt handle states. + * + * Initially an interrupt handler is idle. An idle handler can move + * either into the dead state (when it is being removed) or queued + * state (when it is queued to an interrup thread). + * + * A queued handler can move either into the dying state (when it is + * being removed), the queued state (an attempt to queue an + * already-queued handler), or the running state (when an interrupt + * thread executes the handler). + * + * An interrupt handler is placed into the running state by an + * interrupt thread while it is being executed. A running handler can + * move either into the dying state (when it is being removed), the + * requeue state (an attempt to queue an executing handler), or the + * idle state. + * + * If an interrupt handler is rescheduled while it is executing, it is + * placed into the requeue state. A requeued handler can move either + * into the dying state (when it is being removed) or the queued state + * (when the interrupt thread requeues it after execution finishes). + * + * When an interrupt handler is removed, it is placed into the dying + * state if it is not currently idle. The removing thread then sleeps + * until an interrupt thread dequeues the handler or finishes + * executing the handler. The interrupt thread then acks the dying + * request by moving the handler into the dead state. + */ +#define IS_IDLE 0 +#define IS_QUEUED 1 +#define IS_RUNNING 2 +#define IS_REQUEUE 3 +#define IS_DYING 4 +#define IS_DEAD 5 + +/* * Describe an interrupt event. 
An event holds a list of handlers. + * Events are split into two classes: hardware interrupt events and + * software interrupt events. + * * The 'pre_ithread', 'post_ithread', 'post_filter', and 'assign_cpu' - * hooks are used to invoke MD code for certain operations. + * hooks are used to invoke MD code for certain operations for + * hardware interrupt events. * - * The 'pre_ithread' hook is called when an interrupt thread for - * handlers without filters is scheduled. It is responsible for - * ensuring that 1) the system won't be swamped with an interrupt - * storm from the associated source while the ithread runs and 2) the - * current CPU is able to receive interrupts from other interrupt - * sources. The first is usually accomplished by disabling - * level-triggered interrupts until the ithread completes. The second - * is accomplished on some platforms by acknowledging the interrupt - * via an EOI. + * The 'pre_ithread' hook is called when interrupt handlers without + * filters are scheduled. It is responsible for ensuring that 1) the + * system won't be swamped with an interrupt storm from the associated + * source while the ithread runs and 2) the current CPU is able to + * receive interrupts from other interrupt sources. The first is + * usually accomplished by disabling level-triggered interrupts until + * all of the handlers for this event have completed. The second is + * accomplished on some platforms by acknowledging the interrupt via + * an EOI. * - * The 'post_ithread' hook is invoked when an ithread finishes. It is - * responsible for ensuring that the associated interrupt source will - * trigger an interrupt when it is asserted in the future. Usually - * this is implemented by enabling a level-triggered interrupt that - * was previously disabled via the 'pre_ithread' hook. + * The 'post_ithread' hook is invoked when all of the interrupt + * handlers without filters for an event finish. It is responsible + * for ensuring that the associated interrupt source will trigger an + * interrupt when it is asserted in the future. Usually this is + * implemented by enabling a level-triggered interrupt that was + * previously disabled via the 'pre_ithread' hook. * * The 'post_filter' hook is invoked when a filter handles an * interrupt. It is responsible for ensuring that the current CPU is @@ -91,40 +164,52 @@ * specific CPU. If the interrupt cannot be bound, this function may * return an error. * - * Note that device drivers may also use interrupt events to manage - * multiplexing interrupt interrupt handler into handlers for child - * devices. In that case, the above hooks are not used. The device - * can create an event for its interrupt resource and register child - * event handlers with that event. It can then use - * intr_event_execute_handlers() to execute non-filter handlers. - * Currently filter handlers are not supported by this, but that can - * be added by splitting out the filter loop from intr_event_handle() - * if desired. + * The list of handlers in an interrupt event are protected by two + * locks. First, there is a regular mutex that can be used alone for + * read-only access in top-half code. Second, there is a very simple + * 0/1 spinlock stored in "ie_running". This lightweight lock is held + * in the low-level interrupt code while walking the list of interrupt + * handlers. It must also be held in top-half code that adds or + * removes handlers to or from the list. 
*/ struct intr_event { - TAILQ_ENTRY(intr_event) ie_list; TAILQ_HEAD(, intr_handler) ie_handlers; /* Interrupt handlers. */ char ie_name[MAXCOMLEN + 1]; /* Individual event name. */ char ie_fullname[MAXCOMLEN + 1]; struct mtx ie_lock; - void *ie_source; /* Cookie used by MD code. */ - struct intr_thread *ie_thread; /* Thread we are connected to. */ - void (*ie_pre_ithread)(void *); - void (*ie_post_ithread)(void *); - void (*ie_post_filter)(void *); - int (*ie_assign_cpu)(void *, u_char); + volatile int ie_running; int ie_flags; - int ie_count; /* Loop counter. */ - int ie_warncnt; /* Rate-check interrupt storm warns. */ - struct timeval ie_warntm; - int ie_irq; /* Physical irq number if !SOFT. */ - u_char ie_cpu; /* CPU this event is bound to. */ + int ie_cpu; /* CPU this event is bound to. */ +}; + +struct intr_hardware { + struct intr_event ihw_event; + TAILQ_ENTRY(intr_hardware) ihw_list; + void *ihw_source; /* Cookie used by MD code. */ + void (*ihw_pre_ithread)(void *); + void (*ihw_post_ithread)(void *); + void (*ihw_post_filter)(void *); + int (*ihw_assign_cpu)(void *, u_char); + int ihw_queued; /* Number of queued non-filter handlers. */ + int ihw_irq; /* Physical irq number. */ +#ifdef notyet + int ihw_count; /* Loop counter. */ + int ihw_warncnt; /* Rate-check interrupt storm warns. */ + struct timeval ihw_warntm; +#endif + TAILQ_HEAD(, intr_handler) ihw_manual; /* Manual interrupt handlers. */ +}; + +struct intr_software { + struct intr_event isw_event; + TAILQ_ENTRY(intr_software) isw_list; + struct intr_thread *isw_thread; /* Dedicated thread. */ + STAILQ_HEAD(, intr_handler) isw_active; }; /* Interrupt event flags kept in ie_flags. */ #define IE_SOFT 0x000001 /* Software interrupt. */ #define IE_ENTROPY 0x000002 /* Interrupt is an entropy source. */ -#define IE_ADDING_THREAD 0x000004 /* Currently building an ithread. */ /* Flags to pass to sched_swi. */ #define SWI_DELAY 0x2 @@ -143,6 +228,7 @@ #define SWI_TQ_GIANT 6 struct proc; +struct thread; extern struct intr_event *tty_intr_event; extern struct intr_event *clk_intr_event; @@ -157,30 +243,45 @@ #ifdef DDB void db_dump_intr_event(struct intr_event *ie, int handlers); +void db_dump_intrhand(struct intr_handler *ih, int display_event); #endif +void hwi_create(struct intr_event **event, void *source, int irq, + void (*pre_ithread)(void *), void (*post_ithread)(void *), + void (*post_filter)(void *), int (*assign_cpu)(void *, u_char), + const char *fmt, ...) __printflike(8, 9); +int hwi_destroy(struct intr_event *ie); +int hwi_handle(struct intr_event *ie, struct trapframe *frame); +void *hwi_handler_source(void *cookie); +void hwi_sched(void *cookie); u_char intr_priority(enum intr_type flags); int intr_event_add_handler(struct intr_event *ie, const char *name, driver_filter_t filter, driver_intr_t handler, void *arg, u_char pri, enum intr_type flags, void **cookiep); int intr_event_bind(struct intr_event *ie, u_char cpu); -int intr_event_create(struct intr_event **event, void *source, - int flags, int irq, void (*pre_ithread)(void *), - void (*post_ithread)(void *), void (*post_filter)(void *), - int (*assign_cpu)(void *, u_char), const char *fmt, ...) 
- __printflike(9, 10); int intr_event_describe_handler(struct intr_event *ie, void *cookie, const char *descr); -int intr_event_destroy(struct intr_event *ie); -void intr_event_execute_handlers(struct proc *p, struct intr_event *ie); -int intr_event_handle(struct intr_event *ie, struct trapframe *frame); int intr_event_remove_handler(void *cookie); int intr_getaffinity(int irq, void *mask); -void *intr_handler_source(void *cookie); int intr_setaffinity(int irq, void *mask); +void intr_thread_block(struct thread *td); int swi_add(struct intr_event **eventp, const char *name, driver_intr_t handler, void *arg, int pri, enum intr_type flags, void **cookiep); +int swi_create(struct intr_event **event, const char *fmt, ...); +int swi_destroy(struct intr_event *ie); +int swi_remove(void *cookie); void swi_sched(void *cookie, int flags); -int swi_remove(void *cookie); + +/* XXX: Compat shims */ +#define intr_event_create(ev, src, f, irq, prei, posti, postf, ac, ...) \ + (hwi_create(ev, src, irq, prei, posti, postf, ac, __VA_ARGS__), 0) +#define intr_event_destroy(ev) \ + hwi_destroy(ev) +#define intr_event_handle(ev, frame) \ + hwi_handle(ev, frame) +#define intr_handler_source(cookie) \ + hwi_handler_source(cookie) + +/* XXX: Should we have hwi_add() and hwi_remove()? */ #endif --- //depot/projects/smpng/sys/sys/pcpu.h 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/sys/pcpu.h 2010-06-25 21:44:42.000000000 0000 @@ -121,6 +121,9 @@ sum; \ }) +struct intr_handler; +STAILQ_HEAD(intr_handler_list, intr_handler); + /* * XXXUPS remove as soon as we have per cpu variable * linker sets and can define rm_queue in _rm_lock.h @@ -159,6 +162,8 @@ struct device *pc_device; void *pc_netisr; /* netisr SWI cookie */ int pc_dnweight; /* vm_page_dontneed() */ + struct intr_handler_list pc_hwi_active; /* Queued HWI handlers */ + struct thread *pc_hwi_thread; /* Active per-CPU HWI thread */ /* * Stuff for read mostly lock --- //depot/projects/smpng/sys/sys/proc.h 2010-06-25 20:21:33.000000000 0000 +++ //depot/user/jhb/intr/sys/proc.h 2010-06-25 21:44:42.000000000 0000 @@ -157,6 +157,7 @@ * either lock is sufficient for read access, but both locks must be held * for write access. */ +struct intr_thread; struct kaudit_record; struct td_sched; struct nlminfo; @@ -301,7 +302,8 @@ int td_errno; /* Error returned by last syscall. */ struct vnet *td_vnet; /* (k) Effective vnet. */ const char *td_vnet_lpush; /* (k) Debugging vnet push / pop. */ - struct trapframe *td_intr_frame;/* (k) Frame of the current irq */ + struct trapframe *td_intr_frame;/* (k) Frame of the current irq. */ + struct intr_thread *td_ithread; /* (b) Interrupt thread state. */ }; struct mtx *thread_lock_block(struct thread *); --- //depot/projects/smpng/sys/sys/sched.h 2009-06-25 15:02:49.000000000 0000 +++ //depot/user/jhb/intr/sys/sched.h 2010-05-20 15:10:26.000000000 0000 @@ -121,6 +121,7 @@ * hold a thread on a particular CPU. */ void sched_bind(struct thread *td, int cpu); +void sched_bind_ithd(struct thread *td, int cpu); static __inline void sched_pin(void); void sched_unbind(struct thread *td); static __inline void sched_unpin(void);
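
An illustrative sketch of how a driver might consume the INTR_MANUAL /
hwi_sched() interface added above.  All foo_* names are hypothetical, and it
assumes that an INTR_MANUAL handler can be registered through the ordinary
bus_setup_intr(9) path and that the returned cookie is the intr_handler
pointer hwi_sched() expects; the patch leaves that registration question open
(see the hwi_add()/hwi_remove() XXX), so treat this only as a picture of the
intended flow: a filter acknowledges and masks the source, then queues the
deferred work on the current CPU's active handler list.

#include <sys/param.h>
#include <sys/systm.h>
#include <sys/bus.h>
#include <sys/interrupt.h>
#include <sys/rman.h>

/*
 * Hypothetical driver fragment.  The foo_* hardware helpers below are
 * stand-ins for device-specific code, not real KPIs.
 */
struct foo_softc {
	device_t	 dev;
	struct resource	*irq_res;
	void		*irq_cookie;	/* cookie for the filter handler */
	void		*task_cookie;	/* cookie for the INTR_MANUAL handler */
};

static int	foo_intr_pending(struct foo_softc *);
static void	foo_disable_intr(struct foo_softc *);
static void	foo_enable_intr(struct foo_softc *);
static void	foo_process_events(struct foo_softc *);

/* Threaded work; only ever runs when queued via hwi_sched(). */
static void
foo_intr_task(void *arg)
{
	struct foo_softc *sc = arg;

	foo_process_events(sc);
	foo_enable_intr(sc);
}

/* Filter: ack/mask the hardware, defer the rest to the manual handler. */
static int
foo_intr_filter(void *arg)
{
	struct foo_softc *sc = arg;

	if (!foo_intr_pending(sc))
		return (FILTER_STRAY);
	foo_disable_intr(sc);
	hwi_sched(sc->task_cookie);	/* queue on this CPU's active list */
	return (FILTER_HANDLED);
}

static int
foo_setup_intr(struct foo_softc *sc)
{
	int error;

	/* Manual handler: assumed never queued by hwi_handle() itself. */
	error = bus_setup_intr(sc->dev, sc->irq_res,
	    INTR_TYPE_NET | INTR_MPSAFE | INTR_MANUAL, NULL, foo_intr_task,
	    sc, &sc->task_cookie);
	if (error != 0)
		return (error);

	/* Real interrupt: filter only; it schedules the manual handler. */
	return (bus_setup_intr(sc->dev, sc->irq_res,
	    INTR_TYPE_NET | INTR_MPSAFE, foo_intr_filter, NULL, sc,
	    &sc->irq_cookie));
}

Compared to the pre-patch model, the filter here does not depend on a
per-event ithread: the deferred handler is placed on the interrupting CPU's
pc_hwi_active list and drained by whichever hwi thread currently owns that
CPU, as described in the sys/interrupt.h comment above.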