mirror of
https://github.com/opnsense/src.git
synced 2026-05-04 17:05:14 -04:00
Significant performance improvements for the if_em driver:
- Only update the rx ring consumer pointer after running through the rx loop,
not with each iteration through the loop.
- If possible, use a fast interrupt handler instead of an ithread handler. Use
the interrupt handler to check and squelch the interrupt, then schedule a
taskqueue to do the actual work. This has three benefits:
- Eliminates the 'interrupt aliasing' problem found in many chipsets by
allowing the driver to mask the interrupt in the NIC instead of the
OS masking the interrupt in the APIC.
- Allows the driver to control the amount of work done in the interrupt
handler. This results in what I call 'adaptive polling', where you get
the latency benefits of a quick response to interrupts with the
interrupt mitigation and work partitioning of polling. Polling is still
an option in the driver, but I consider it orthogonal to this work.
- Don't hold the driver lock in the RX handler. The handler and all data
associated is effectively serialized already. This eliminates the cost of
dropping and reacquiring the lock for every received packet. The result
is much lower contention for the driver lock, resulting in lower CPU usage
and lower latency for interactive workloads.
The amount of work done in the taskqueue is controlled by the sysctl
dev.em.N.rx_processing_limit
and tunable
hw.em.rx_process_limit
Setting these to -1 effectively removes the limit.
The fast interrupt and taskqueue can be disabled by defining NO_EM_FASTINTR.
This work has been shown to increase fast-forwarding from ~570 kpps to
~750 kpps (note that the same NIC hardware seems unable to transmit more than
800 kpps, so this increase appears to be limited almost solely by the
hardware). Gains have been shown in other workloads, ranging from better
performance to elimination of over-saturation livelocks.
Thanks to Andre Oppermann for his time and resources from his network
performance project in performing much of the testing. Thanks to Gleb
Smirnoff and Danny Braniss for their help in testing also.
This commit is contained in:
parent
3b4c974228
commit
2ff7d1b635
2 changed files with 203 additions and 19 deletions
|
|
@ -141,6 +141,9 @@ static int em_shutdown(device_t);
|
|||
static int em_suspend(device_t);
|
||||
static int em_resume(device_t);
|
||||
static void em_intr(void *);
|
||||
#ifndef NO_EM_FASTINTR
|
||||
static void em_intr_fast(void *);
|
||||
#endif
|
||||
static void em_start(struct ifnet *);
|
||||
static void em_start_locked(struct ifnet *ifp);
|
||||
static int em_ioctl(struct ifnet *, u_long, caddr_t);
|
||||
|
|
@ -168,7 +171,7 @@ static void em_update_stats_counters(struct adapter *);
|
|||
static void em_clean_transmit_interrupts(struct adapter *);
|
||||
static int em_allocate_receive_structures(struct adapter *);
|
||||
static int em_allocate_transmit_structures(struct adapter *);
|
||||
static void em_process_receive_interrupts(struct adapter *, int);
|
||||
static int em_process_receive_interrupts(struct adapter *, int);
|
||||
#ifndef __NO_STRICT_ALIGNMENT
|
||||
static int em_fixup_rx(struct adapter *);
|
||||
#endif
|
||||
|
|
@ -209,6 +212,12 @@ static int em_sysctl_int_delay(SYSCTL_HANDLER_ARGS);
|
|||
static void em_add_int_delay_sysctl(struct adapter *, const char *,
|
||||
const char *, struct em_int_delay_info *,
|
||||
int, int);
|
||||
#ifndef NO_EM_FASTINTR
|
||||
static void em_add_int_process_limit(struct adapter *, const char *,
|
||||
const char *, int *, int);
|
||||
static void em_handle_rxtx(void *context, int pending);
|
||||
static void em_handle_link(void *context, int pending);
|
||||
#endif
|
||||
#ifdef DEVICE_POLLING
|
||||
static poll_handler_t em_poll;
|
||||
#endif
|
||||
|
|
@ -257,6 +266,10 @@ TUNABLE_INT("hw.em.tx_abs_int_delay", &em_tx_abs_int_delay_dflt);
|
|||
TUNABLE_INT("hw.em.rx_abs_int_delay", &em_rx_abs_int_delay_dflt);
|
||||
TUNABLE_INT("hw.em.rxd", &em_rxd);
|
||||
TUNABLE_INT("hw.em.txd", &em_txd);
|
||||
#ifndef NO_EM_FASTINTR
|
||||
static int em_rx_process_limit = 100;
|
||||
TUNABLE_INT("hw.em.rx_process_limit", &em_rx_process_limit);
|
||||
#endif
|
||||
|
||||
/*********************************************************************
|
||||
* Device identification routine
|
||||
|
|
@ -379,6 +392,13 @@ em_attach(device_t dev)
|
|||
em_tx_abs_int_delay_dflt);
|
||||
}
|
||||
|
||||
/* Sysctls for limiting the amount of work done in the taskqueue */
|
||||
#ifndef NO_EM_FASTINTR
|
||||
em_add_int_process_limit(adapter, "rx_processing_limit",
|
||||
"max number of rx packets to process", &adapter->rx_process_limit,
|
||||
em_rx_process_limit);
|
||||
#endif
|
||||
|
||||
/*
|
||||
* Validate number of transmit and receive descriptors. It
|
||||
* must not exceed hardware maximum, and must be multiple
|
||||
|
|
@ -433,7 +453,6 @@ em_attach(device_t dev)
|
|||
*/
|
||||
adapter->hw.report_tx_early = 1;
|
||||
|
||||
|
||||
if (em_allocate_pci_resources(adapter)) {
|
||||
printf("em%d: Allocation of PCI resources failed\n",
|
||||
adapter->unit);
|
||||
|
|
@ -563,6 +582,17 @@ em_detach(device_t dev)
|
|||
ether_poll_deregister(ifp);
|
||||
#endif
|
||||
|
||||
if (adapter->res_interrupt != NULL) {
|
||||
bus_teardown_intr(dev, adapter->res_interrupt,
|
||||
adapter->int_handler_tag);
|
||||
bus_release_resource(dev, SYS_RES_IRQ, 0,
|
||||
adapter->res_interrupt);
|
||||
adapter->res_interrupt = NULL;
|
||||
if (adapter->tq != NULL) {
|
||||
taskqueue_drain(adapter->tq, &adapter->rxtx_task);
|
||||
taskqueue_drain(taskqueue_fast, &adapter->link_task);
|
||||
}
|
||||
}
|
||||
EM_LOCK(adapter);
|
||||
adapter->in_detach = 1;
|
||||
em_stop(adapter);
|
||||
|
|
@ -1052,11 +1082,113 @@ em_poll(struct ifnet *ifp, enum poll_cmd cmd, int count)
|
|||
}
|
||||
#endif /* DEVICE_POLLING */
|
||||
|
||||
#ifndef NO_EM_FASTINTR
|
||||
/*
 * Deferred link-status task.
 *
 * Initialised with TASK_INIT() on adapter->link_task and enqueued on
 * taskqueue_fast by em_intr_fast() when the interrupt cause register
 * reports E1000_ICR_RXSEQ or E1000_ICR_LSC.
 *
 * context: the struct adapter the task was initialised with.
 * pending: not used by this function.
 *
 * With the driver lock held, stops the periodic timer callout, sets
 * hw.get_link_status, calls em_check_for_link() and
 * em_print_link_status(), then re-arms the timer to run em_local_timer
 * after hz ticks.
 */
static void
|
||||
em_handle_link(void *context, int pending)
|
||||
{
|
||||
struct adapter *adapter = context;
|
||||
/* NOTE(review): ifp is assigned below but not otherwise used here. */
struct ifnet *ifp;
|
||||

||||
ifp = adapter->ifp;
|
||||

||||
EM_LOCK(adapter);
|
||||

||||
/* Stop the timer while the link state is being re-evaluated. */
callout_stop(&adapter->timer);
|
||||
adapter->hw.get_link_status = 1;
|
||||
em_check_for_link(&adapter->hw);
|
||||
em_print_link_status(adapter);
|
||||
/* Re-arm the timer: em_local_timer(adapter) after hz ticks. */
callout_reset(&adapter->timer, hz, em_local_timer,
|
||||
adapter);
|
||||
EM_UNLOCK(adapter);
|
||||
}
|
||||
|
||||
/*
 * Deferred RX/TX task.
 *
 * Initialised with TASK_INIT() on adapter->rxtx_task and enqueued on
 * adapter->tq by em_intr_fast() after it has masked the NIC's interrupts.
 *
 * context: the struct adapter the task was initialised with.
 * pending: not used by this function.
 *
 * When the interface is marked IFF_DRV_RUNNING:
 *   - calls em_process_receive_interrupts() with
 *     adapter->rx_process_limit, before the driver lock is taken; a
 *     non-zero return re-enqueues this task on adapter->tq;
 *   - takes the driver lock, cleans completed transmits, and restarts
 *     transmission if the send queue is not empty.
 * In all cases it finishes by calling em_enable_intr().
 *
 * NOTE(review): em_enable_intr() is also called when the task has just
 * been re-enqueued; confirm this is intended.
 */
static void
|
||||
em_handle_rxtx(void *context, int pending)
|
||||
{
|
||||
struct adapter *adapter = context;
|
||||
struct ifnet *ifp;
|
||||

||||
ifp = adapter->ifp;
|
||||

||||
/*
|
||||
* TODO:
|
||||
* It should be possible to run the tx clean loop without the lock.
|
||||
*/
|
||||
if (ifp->if_drv_flags & IFF_DRV_RUNNING) {
|
||||
/* RX pass runs without EM_LOCK; re-queue ourselves on non-zero return. */
if (em_process_receive_interrupts(adapter,
|
||||
adapter->rx_process_limit) != 0)
|
||||
taskqueue_enqueue(adapter->tq, &adapter->rxtx_task);
|
||||
/* TX cleanup and restart are done under the driver lock. */
EM_LOCK(adapter);
|
||||
em_clean_transmit_interrupts(adapter);
|
||||

||||
if (!IFQ_DRV_IS_EMPTY(&ifp->if_snd))
|
||||
em_start_locked(ifp);
|
||||
EM_UNLOCK(adapter);
|
||||
}
|
||||

||||
/* Counterpart of the em_disable_intr() call in em_intr_fast(). */
em_enable_intr(adapter);
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
/*********************************************************************
|
||||
*
|
||||
* Interrupt Service routine
|
||||
*
|
||||
**********************************************************************/
|
||||
#ifndef NO_EM_FASTINTR
|
||||
/*
 * Fast interrupt handler (registered with INTR_FAST when
 * NO_EM_FASTINTR is not defined).
 *
 * arg: the struct adapter passed to bus_setup_intr().
 *
 * Does no RX/TX processing itself. It reads the interrupt cause
 * register (ICR), returns early if the interrupt is not ours or if
 * polling is enabled, masks the NIC's interrupts, and enqueues
 * adapter->rxtx_task on adapter->tq. If ICR reports RXSEQ or LSC it
 * also enqueues adapter->link_task on taskqueue_fast. An RXO bit
 * increments adapter->rx_overruns.
 */
static void
|
||||
em_intr_fast(void *arg)
|
||||
{
|
||||
struct adapter *adapter = arg;
|
||||
struct ifnet *ifp;
|
||||
uint32_t reg_icr;
|
||||

||||
ifp = adapter->ifp;
|
||||

||||
/* With polling enabled on the interface, return without reading ICR. */
#ifdef DEVICE_POLLING
|
||||
if (ifp->if_capenable & IFCAP_POLLING) {
|
||||
return;
|
||||
}
|
||||
#endif /* DEVICE_POLLING */
|
||||

||||
reg_icr = E1000_READ_REG(&adapter->hw, ICR);
|
||||

||||
/* Hot eject? */
|
||||
if (reg_icr == 0xffffffff)
|
||||
return;
|
||||

||||
/* Definitely not our interrupt. */
|
||||
if (reg_icr == 0x0)
|
||||
return;
|
||||

||||
/*
|
||||
* Starting with the 82571 chip, bit 31 should be used to
|
||||
* determine whether the interrupt belongs to us.
|
||||
*/
|
||||
if (adapter->hw.mac_type >= em_82571 &&
|
||||
(reg_icr & E1000_ICR_INT_ASSERTED) == 0)
|
||||
return;
|
||||

||||
/*
|
||||
* Mask interrupts until the taskqueue is finished running. This is
|
||||
* cheap, just assume that it is needed. This also works around the
|
||||
* MSI message reordering errata on certain systems.
|
||||
*/
|
||||
em_disable_intr(adapter);
|
||||
/* em_handle_rxtx() calls em_enable_intr() when it finishes. */
taskqueue_enqueue(adapter->tq, &adapter->rxtx_task);
|
||||

||||
/* Link status change */
|
||||
if (reg_icr & (E1000_ICR_RXSEQ | E1000_ICR_LSC))
|
||||
taskqueue_enqueue(taskqueue_fast, &adapter->link_task);
|
||||

||||
/* Count receiver overruns reported by the hardware. */
if (reg_icr & E1000_ICR_RXO) {
|
||||
adapter->rx_overruns++;
|
||||
}
|
||||
return;
|
||||
}
|
||||
#endif
|
||||
|
||||
static void
|
||||
em_intr(void *arg)
|
||||
{
|
||||
|
|
@ -1897,13 +2029,40 @@ em_allocate_pci_resources(struct adapter * adapter)
|
|||
adapter->unit);
|
||||
return(ENXIO);
|
||||
}
|
||||
|
||||
/*
|
||||
* Try allocating a fast interrupt and the associated deferred
|
||||
* processing contexts. If that doesn't work, try just using an
|
||||
* ithread.
|
||||
*/
|
||||
#ifndef NO_EM_FASTINTR
|
||||
if (bus_setup_intr(dev, adapter->res_interrupt,
|
||||
INTR_TYPE_NET | INTR_MPSAFE,
|
||||
(void (*)(void *)) em_intr, adapter,
|
||||
&adapter->int_handler_tag)) {
|
||||
printf("em%d: Error registering interrupt handler!\n",
|
||||
adapter->unit);
|
||||
return(ENXIO);
|
||||
INTR_TYPE_NET | INTR_FAST, em_intr_fast, adapter,
|
||||
&adapter->int_handler_tag) == 0) {
|
||||
|
||||
/* Init the deferred processing contexts. */
|
||||
TASK_INIT(&adapter->rxtx_task, 0, em_handle_rxtx, adapter);
|
||||
TASK_INIT(&adapter->link_task, 0, em_handle_link, adapter);
|
||||
adapter->tq = taskqueue_create_fast("em_taskq", M_NOWAIT,
|
||||
taskqueue_thread_enqueue,
|
||||
&adapter->tq, &adapter->tqproc);
|
||||
kthread_create(taskqueue_thread_loop,
|
||||
&adapter->tq, &adapter->tqproc,
|
||||
0, 0, "%s taskq", device_get_nameunit(adapter->dev));
|
||||
mtx_lock_spin(&sched_lock);
|
||||
sched_prio(FIRST_THREAD_IN_PROC(adapter->tqproc), PI_NET);
|
||||
mtx_unlock_spin(&sched_lock);
|
||||
}
|
||||
#endif
|
||||
if (adapter->int_handler_tag == NULL) {
|
||||
if (bus_setup_intr(dev, adapter->res_interrupt,
|
||||
INTR_TYPE_NET | INTR_MPSAFE,
|
||||
em_intr, adapter,
|
||||
&adapter->int_handler_tag)) {
|
||||
printf("em%d: Error registering interrupt handler!\n",
|
||||
adapter->unit);
|
||||
return(ENXIO);
|
||||
}
|
||||
}
|
||||
|
||||
adapter->hw.back = &adapter->osdep;
|
||||
|
|
@ -1916,6 +2075,9 @@ em_free_pci_resources(struct adapter * adapter)
|
|||
{
|
||||
device_t dev = adapter->dev;
|
||||
|
||||
if (adapter->tq != NULL) {
|
||||
taskqueue_free(adapter->tq);
|
||||
}
|
||||
if (adapter->res_interrupt != NULL) {
|
||||
bus_teardown_intr(dev, adapter->res_interrupt,
|
||||
adapter->int_handler_tag);
|
||||
|
|
@ -2889,7 +3051,7 @@ em_free_receive_structures(struct adapter *adapter)
|
|||
* count < 0.
|
||||
*
|
||||
*********************************************************************/
|
||||
static void
|
||||
static int
|
||||
em_process_receive_interrupts(struct adapter * adapter, int count)
|
||||
{
|
||||
struct ifnet *ifp;
|
||||
|
|
@ -2902,8 +3064,6 @@ em_process_receive_interrupts(struct adapter * adapter, int count)
|
|||
/* Pointer to the receive descriptor being examined. */
|
||||
struct em_rx_desc *current_desc;
|
||||
|
||||
mtx_assert(&adapter->mtx, MA_OWNED);
|
||||
|
||||
ifp = adapter->ifp;
|
||||
i = adapter->next_rx_desc_to_check;
|
||||
current_desc = &adapter->rx_desc_base[i];
|
||||
|
|
@ -2911,7 +3071,7 @@ em_process_receive_interrupts(struct adapter * adapter, int count)
|
|||
BUS_DMASYNC_POSTREAD);
|
||||
|
||||
if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
|
||||
return;
|
||||
return (0);
|
||||
}
|
||||
|
||||
while ((current_desc->status & E1000_RXD_STAT_DD) &&
|
||||
|
|
@ -3037,23 +3197,26 @@ skip:
|
|||
bus_dmamap_sync(adapter->rxdma.dma_tag, adapter->rxdma.dma_map,
|
||||
BUS_DMASYNC_PREREAD | BUS_DMASYNC_PREWRITE);
|
||||
|
||||
/* Advance the E1000's Receive Queue #0 "Tail Pointer". */
|
||||
E1000_WRITE_REG(&adapter->hw, RDT, i);
|
||||
|
||||
/* Advance our pointers to the next descriptor */
|
||||
if (++i == adapter->num_rx_desc)
|
||||
i = 0;
|
||||
if (m != NULL) {
|
||||
adapter->next_rx_desc_to_check = i;
|
||||
EM_UNLOCK(adapter);
|
||||
(*ifp->if_input)(ifp, m);
|
||||
EM_LOCK(adapter);
|
||||
i = adapter->next_rx_desc_to_check;
|
||||
}
|
||||
current_desc = &adapter->rx_desc_base[i];
|
||||
}
|
||||
adapter->next_rx_desc_to_check = i;
|
||||
return;
|
||||
|
||||
/* Advance the E1000's Receive Queue #0 "Tail Pointer". */
|
||||
if (--i < 0) i = adapter->num_rx_desc - 1;
|
||||
E1000_WRITE_REG(&adapter->hw, RDT, i);
|
||||
|
||||
if (!((current_desc->status) & E1000_RXD_STAT_DD)) {
|
||||
return (0);
|
||||
}
|
||||
return (1);
|
||||
}
|
||||
|
||||
#ifndef __NO_STRICT_ALIGNMENT
|
||||
|
|
@ -3610,3 +3773,15 @@ em_add_int_delay_sysctl(struct adapter *adapter, const char *name,
|
|||
OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW,
|
||||
info, 0, em_sysctl_int_delay, "I", description);
|
||||
}
|
||||
|
||||
#ifndef NO_EM_FASTINTR
|
||||
/*
 * Initialise a processing-limit variable and expose it as a sysctl.
 *
 * adapter:     adapter whose device sysctl context/tree the node is
 *              added to.
 * name:        sysctl node name (em_attach() passes
 *              "rx_processing_limit").
 * description: human-readable description of the node.
 * limit:       storage for the limit; set to `value` here and used as
 *              the sysctl's backing variable.
 * value:       initial value for *limit.
 *
 * The node is added read/write (CTLFLAG_RW) under the device's sysctl
 * tree.
 */
static void
|
||||
em_add_int_process_limit(struct adapter *adapter, const char *name,
|
||||
const char *description, int *limit, int value)
|
||||
{
|
||||
/* Seed the variable before publishing it through sysctl. */
*limit = value;
|
||||
SYSCTL_ADD_INT(device_get_sysctl_ctx(adapter->dev),
|
||||
SYSCTL_CHILDREN(device_get_sysctl_tree(adapter->dev)),
|
||||
OID_AUTO, name, CTLTYPE_INT|CTLFLAG_RW, limit, value, description);
|
||||
}
|
||||
#endif
|
||||
|
|
|
|||
|
|
@ -48,6 +48,10 @@ POSSIBILITY OF SUCH DAMAGE.
|
|||
#include <sys/socket.h>
|
||||
#include <sys/sockio.h>
|
||||
#include <sys/sysctl.h>
|
||||
#include <sys/taskqueue.h>
|
||||
#include <sys/kthread.h>
|
||||
#include <sys/proc.h>
|
||||
#include <sys/sched.h>
|
||||
|
||||
#include <machine/bus.h>
|
||||
#include <sys/rman.h>
|
||||
|
|
@ -334,6 +338,10 @@ struct adapter {
|
|||
u_int8_t unit;
|
||||
struct mtx mtx;
|
||||
int em_insert_vlan_header;
|
||||
struct task link_task;
|
||||
struct task rxtx_task;
|
||||
struct taskqueue *tq; /* private task queue */
|
||||
struct proc *tqproc; /* thread handling sc_tq */
|
||||
|
||||
/* Info about the board itself */
|
||||
u_int32_t part_num;
|
||||
|
|
@ -378,8 +386,9 @@ struct adapter {
|
|||
struct em_dma_alloc rxdma; /* bus_dma glue for rx desc */
|
||||
struct em_rx_desc *rx_desc_base;
|
||||
u_int32_t next_rx_desc_to_check;
|
||||
u_int16_t num_rx_desc;
|
||||
u_int32_t rx_buffer_len;
|
||||
u_int16_t num_rx_desc;
|
||||
int rx_process_limit;
|
||||
struct em_buffer *rx_buffer_area;
|
||||
bus_dma_tag_t rxtag;
|
||||
|
||||
|
|
|
|||
Loading…
Reference in a new issue