[dpdk-dev,2/2] i40e rx Bulk Alloc: Larger list size (33 to 128) throughput optimization
Commit Message
Added a check that rx_free_thresh is at least 2 packets, which eliminates the extra
overhead of supporting prefetch for the degenerate case where only one packet at a
time is allocated into the queue.
Used native register-width local variables (e.g. unsigned) to reduce the overhead of narrower, non-register-width types.
Added a second-level prefetch to bring the packet address into the L1 cache earlier, and
hoisted the calculation of the prefetch-loop end point out of the loop body.
Used long-established C optimization techniques — pointers instead of array indexing,
and reduced variable scope — to improve the chances of values being kept in registers
rather than on the stack.
Signed-off-by: Mike A. Polehn <mike.a.polehn@intel.com>
@@ -64,6 +64,7 @@
#define DEFAULT_TX_FREE_THRESH 32
#define I40E_MAX_PKT_TYPE 256
#define I40E_RX_INPUT_BUF_MAX 256
+#define I40E_RX_FREE_THRESH_MIN 2
#define I40E_TX_MAX_BURST 32
@@ -942,6 +943,12 @@ check_rx_burst_bulk_alloc_preconditions(__rte_unused struct i40e_rx_queue *rxq)
"rxq->rx_free_thresh=%d",
rxq->nb_rx_desc, rxq->rx_free_thresh);
ret = -EINVAL;
+ } else if (rxq->rx_free_thresh < I40E_RX_FREE_THRESH_MIN) {
+ PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
+ "rxq->rx_free_thresh=%d, "
+ "I40E_RX_FREE_THRESH_MIN=%d",
+ rxq->rx_free_thresh, I40E_RX_FREE_THRESH_MIN);
+ ret = -EINVAL;
} else if (!(rxq->nb_rx_desc < (I40E_MAX_RING_DESC -
RTE_PMD_I40E_RX_MAX_BURST))) {
PMD_INIT_LOG(DEBUG, "Rx Burst Bulk Alloc Preconditions: "
@@ -1058,9 +1065,8 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
{
volatile union i40e_rx_desc *rxdp;
struct i40e_rx_entry *rxep;
- struct rte_mbuf *mb;
- unsigned alloc_idx, i;
- uint64_t dma_addr;
+ struct rte_mbuf *pk, *npk;
+ unsigned alloc_idx, i, l;
int diag;
/* Allocate buffers in bulk */
@@ -1076,22 +1082,36 @@ i40e_rx_alloc_bufs(struct i40e_rx_queue *rxq)
return -ENOMEM;
}
+ pk = rxep->mbuf;
+ rte_prefetch0(pk);
+ rxep++;
+ npk = rxep->mbuf;
+ rte_prefetch0(npk);
+ rxep++;
+ l = rxq->rx_free_thresh - 2;
+
rxdp = &rxq->rx_ring[alloc_idx];
for (i = 0; i < rxq->rx_free_thresh; i++) {
- if (likely(i < (rxq->rx_free_thresh - 1)))
+ struct rte_mbuf *mb = pk;
+ pk = npk;
+ if (likely(i < l)) {
/* Prefetch next mbuf */
- rte_prefetch0(rxep[i + 1].mbuf);
-
- mb = rxep[i].mbuf;
- rte_mbuf_refcnt_set(mb, 1);
- mb->next = NULL;
+ npk = rxep->mbuf;
+ rte_prefetch0(npk);
+ rxep++;
+ }
mb->data_off = RTE_PKTMBUF_HEADROOM;
+ rte_mbuf_refcnt_set(mb, 1);
mb->nb_segs = 1;
mb->port = rxq->port_id;
- dma_addr = rte_cpu_to_le_64(\
- RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
- rxdp[i].read.hdr_addr = 0;
- rxdp[i].read.pkt_addr = dma_addr;
+ mb->next = NULL;
+ {
+ uint64_t dma_addr = rte_cpu_to_le_64(
+ RTE_MBUF_DATA_DMA_ADDR_DEFAULT(mb));
+ rxdp->read.hdr_addr = dma_addr;
+ rxdp->read.pkt_addr = dma_addr;
+ }
+ rxdp++;
}
rxq->rx_last_pos = alloc_idx + rxq->rx_free_thresh - 1;