[dpdk-dev] mlx4: use dummy rxqs when a non-pow2 number is requested

Message ID 1458576484-28211-1-git-send-email-olivier.matz@6wind.com (mailing list archive)
State Changes Requested, archived
Delegated to: Bruce Richardson
Headers

Commit Message

Olivier Matz March 21, 2016, 4:08 p.m. UTC
  When using RSS, the number of rxqs has to be a power of two.
This is a problem because there is no API is dpdk that makes
the application aware of that.

A good compromise is to allow the application to request a
number of rxqs that is not a power of 2, but having inactive
queues that will never receive packets. In this configuration,
a warning will be issued to users to let them know that
this is not an optimal configuration.

Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 drivers/net/mlx4/mlx4.c | 27 +++++++++++++++++----------
 1 file changed, 17 insertions(+), 10 deletions(-)
  

Comments

Adrien Mazarguil March 21, 2016, 4:45 p.m. UTC | #1
On Mon, Mar 21, 2016 at 05:08:04PM +0100, Olivier Matz wrote:
> When using RSS, the number of rxqs has to be a power of two.
> This is a problem because there is no API is dpdk that makes
> the application aware of that.
> 
> A good compromise is to allow the application to request a
> number of rxqs that is not a power of 2, but having inactive
> queues that will never receive packets. In this configuration,
> a warning will be issued to users to let them know that
> this is not an optimal configuration.
> 
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  drivers/net/mlx4/mlx4.c | 27 +++++++++++++++++----------
>  1 file changed, 17 insertions(+), 10 deletions(-)

Acked-by: Adrien Mazarguil <adrien.mazarguil@6wind.com>
  
Wiles, Keith March 21, 2016, 5:38 p.m. UTC | #2
Sent from my iPhone

> On Mar 21, 2016, at 11:10 AM, Olivier Matz <olivier.matz@6wind.com> wrote:
> 
> When using RSS, the number of rxqs has to be a power of two.
> This is a problem because there is no API is dpdk that makes
> the application aware of that.
> 
> A good compromise is to allow the application to request a
> number of rxqs that is not a power of 2, but having inactive
> queues that will never receive packets. In this configuration,
> a warning will be issued to users to let them know that
> this is not an optimal configuration.

Not sure I like this solution. I think an error should be returned with a log message instead. What if the next driver needs power of three or must be odd or even number. 

The bigger problem is the application is no longer portable for any given nic configuration.

We need a method for the application to query the system for these types of information. But as we do not have that API we need to just error the request off.

> 
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
> drivers/net/mlx4/mlx4.c | 27 +++++++++++++++++----------
> 1 file changed, 17 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index cc4e9aa..eaf06db 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -698,7 +698,7 @@ txq_cleanup(struct txq *txq);
> 
> static int
> rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
> -      unsigned int socket, const struct rte_eth_rxconf *conf,
> +      unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
>      struct rte_mempool *mp);
> 
> static void
> @@ -734,12 +734,12 @@ dev_configure(struct rte_eth_dev *dev)
>    }
>    if (rxqs_n == priv->rxqs_n)
>        return 0;
> -    if ((rxqs_n & (rxqs_n - 1)) != 0) {
> -        ERROR("%p: invalid number of RX queues (%u),"
> -              " must be a power of 2",
> +    if (!rte_is_power_of_2(rxqs_n)) {
> +        WARN("%p: number of RX queues (%u), must be a"
> +              " power of 2: remaining queues will be inactive",
>              (void *)dev, rxqs_n);
> -        return EINVAL;
>    }
> +
>    INFO("%p: RX queues number update: %u -> %u",
>         (void *)dev, priv->rxqs_n, rxqs_n);
>    /* If RSS is enabled, disable it first. */
> @@ -775,7 +775,7 @@ dev_configure(struct rte_eth_dev *dev)
>    priv->rss = 1;
>    tmp = priv->rxqs_n;
>    priv->rxqs_n = rxqs_n;
> -    ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, NULL, NULL);
> +    ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, 0, NULL, NULL);
>    if (!ret)
>        return 0;
>    /* Failure, rollback. */
> @@ -3466,7 +3466,8 @@ rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
>        attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
>        /* TSS isn't necessary. */
>        attr.qpg.parent_attrib.tss_child_count = 0;
> -        attr.qpg.parent_attrib.rss_child_count = priv->rxqs_n;
> +        attr.qpg.parent_attrib.rss_child_count =
> +            rte_align32pow2(priv->rxqs_n + 1) >> 1;
>        DEBUG("initializing parent RSS queue");
>    } else {
>        attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
> @@ -3689,6 +3690,9 @@ skip_rtr:
>  *   Number of descriptors to configure in queue.
>  * @param socket
>  *   NUMA socket on which memory must be allocated.
> + * @param inactive
> + *   If true, the queue is disabled because its index is higher or
> + *   equal to the real number of queues, which must be a power of 2.
>  * @param[in] conf
>  *   Thresholds parameters.
>  * @param mp
> @@ -3699,7 +3703,7 @@ skip_rtr:
>  */
> static int
> rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
> -      unsigned int socket, const struct rte_eth_rxconf *conf,
> +      unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
>      struct rte_mempool *mp)
> {
>    struct priv *priv = dev->data->dev_private;
> @@ -3800,7 +3804,7 @@ skip_mr:
>    DEBUG("priv->device_attr.max_sge is %d",
>          priv->device_attr.max_sge);
> #ifdef RSS_SUPPORT
> -    if (priv->rss)
> +    if (priv->rss && !inactive)
>        tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
>                       tmpl.rd);
>    else
> @@ -3936,6 +3940,7 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
> {
>    struct priv *priv = dev->data->dev_private;
>    struct rxq *rxq = (*priv->rxqs)[idx];
> +    int inactive = 0;
>    int ret;
> 
>    if (mlx4_is_secondary())
> @@ -3967,7 +3972,9 @@ mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
>            return -ENOMEM;
>        }
>    }
> -    ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
> +    if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1)
> +        inactive = 1;
> +    ret = rxq_setup(dev, rxq, desc, socket, inactive, conf, mp);
>    if (ret)
>        rte_free(rxq);
>    else {
> -- 
> 2.1.4
>
  
Olivier Matz March 22, 2016, 9:48 a.m. UTC | #3
Hi Keith,

On 03/21/2016 06:38 PM, Wiles, Keith wrote:
>> On Mar 21, 2016, at 11:10 AM, Olivier Matz <olivier.matz@6wind.com> wrote:
>>
>> When using RSS, the number of rxqs has to be a power of two.
>> This is a problem because there is no API is dpdk that makes
>> the application aware of that.
>>
>> A good compromise is to allow the application to request a
>> number of rxqs that is not a power of 2, but having inactive
>> queues that will never receive packets. In this configuration,
>> a warning will be issued to users to let them know that
>> this is not an optimal configuration.
> 
> Not sure I like this solution. I think an error should be returned with a log message instead. What if the next driver needs power of three or must be odd or even number. 
> 
> The bigger problem is the application is no longer portable for any given nic configuration.
> 
> We need a method for the application to query the system for these types of information. But as we do not have that API we need to just error the request off.


The initial problem is that the driver says "I support a maximum
of X queues" and if the application configures a lower number, it
gets an error.

There is no API in DPDK to tell that only specific number of queues
are supported. Adding an API is a solution, but in this case it's
probably overkill. With this patch, the driver can present the proper
number of queues to the application, knowing that the spreading of
the packets won't be ideal (some queues won't receive packets), but
it will work.

A step further in this direction would be to configure more queues
than asked in hardware to do a better spreading, almost similar to
what is done with RETA tables in mlx5. But this is more complicated
to do, especially if we want it for 16.04.

Hope this is clearer with the explanation.

Regards,
Olivier
  
Wiles, Keith March 22, 2016, 2:27 p.m. UTC | #4
Hi Olivier,

>Hi Keith,

>

>On 03/21/2016 06:38 PM, Wiles, Keith wrote:

>>> On Mar 21, 2016, at 11:10 AM, Olivier Matz <olivier.matz@6wind.com> wrote:

>>>

>>> When using RSS, the number of rxqs has to be a power of two.

>>> This is a problem because there is no API is dpdk that makes

>>> the application aware of that.

>>>

>>> A good compromise is to allow the application to request a

>>> number of rxqs that is not a power of 2, but having inactive

>>> queues that will never receive packets. In this configuration,

>>> a warning will be issued to users to let them know that

>>> this is not an optimal configuration.

>> 

>> Not sure I like this solution. I think an error should be returned with a log message instead. What if the next driver needs power of three or must be odd or even number. 

>> 

>> The bigger problem is the application is no longer portable for any given nic configuration.

>> 

>> We need a method for the application to query the system for these types of information. But as we do not have that API we need to just error the request off.

>

>

>The initial problem is that the driver says "I support a maximum

>of X queues" and if the application configures a lower number, it

>gets an error.

>

>There is no API in DPDK to tell that only specific number of queues

>are supported. Adding an API is a solution, but in this case it's

>probably overkill. With this patch, the driver can present the proper

>number of queues to the application, knowing that the spreading of

>the packets won't be ideal (some queues won't receive packets), but

>it will work.

>

>A step further in this direction would be to configure more queues

>than asked in hardware to do a better spreading, almost similar to

>what is done with RETA tables in mlx5. But this is more complicated

>to do, especially if we want it for 16.04.


Well I guess I must agree with the solution, but I am not real happy. Can we mark this a temp fix until we figure out a cleaner solution as I would not want this type of solution forever or be the standard way to handle these problems.

>

>Hope this is clearer with the explanation.

>

>Regards,

>Olivier

>

>



Regards,
Keith
  
Bruce Richardson March 24, 2016, 12:20 p.m. UTC | #5
On Mon, Mar 21, 2016 at 05:08:04PM +0100, Olivier Matz wrote:
> When using RSS, the number of rxqs has to be a power of two.
> This is a problem because there is no API is dpdk that makes
> the application aware of that.
> 
> A good compromise is to allow the application to request a
> number of rxqs that is not a power of 2, but having inactive
> queues that will never receive packets. In this configuration,
> a warning will be issued to users to let them know that
> this is not an optimal configuration.
> 
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>  drivers/net/mlx4/mlx4.c | 27 +++++++++++++++++----------
>  1 file changed, 17 insertions(+), 10 deletions(-)
> 
> diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
> index cc4e9aa..eaf06db 100644
> --- a/drivers/net/mlx4/mlx4.c
> +++ b/drivers/net/mlx4/mlx4.c
> @@ -698,7 +698,7 @@ txq_cleanup(struct txq *txq);
>  
>  static int
>  rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
> -	  unsigned int socket, const struct rte_eth_rxconf *conf,
> +	  unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
>  	  struct rte_mempool *mp);
>  
>  static void
> @@ -734,12 +734,12 @@ dev_configure(struct rte_eth_dev *dev)
>  	}
>  	if (rxqs_n == priv->rxqs_n)
>  		return 0;
> -	if ((rxqs_n & (rxqs_n - 1)) != 0) {
> -		ERROR("%p: invalid number of RX queues (%u),"
> -		      " must be a power of 2",
> +	if (!rte_is_power_of_2(rxqs_n)) {
> +		WARN("%p: number of RX queues (%u), must be a"
> +		      " power of 2: remaining queues will be inactive",

I'm not sure how clear this warning message is. To the reader there are no
extra "remaining" queues referred to, as it's not stated that the driver is
allocating extra queues. How about e.g.:

WARN("%p: number of RX queues on device must by a power of 2. Allocating %u
	queues, of which %u will be active. Remaining queues will be inactive"...)

/Bruce
  
Olivier Matz March 24, 2016, 12:25 p.m. UTC | #6
Hi Bruce,

On 03/24/2016 01:20 PM, Bruce Richardson wrote:
>> @@ -734,12 +734,12 @@ dev_configure(struct rte_eth_dev *dev)
>>  	}
>>  	if (rxqs_n == priv->rxqs_n)
>>  		return 0;
>> -	if ((rxqs_n & (rxqs_n - 1)) != 0) {
>> -		ERROR("%p: invalid number of RX queues (%u),"
>> -		      " must be a power of 2",
>> +	if (!rte_is_power_of_2(rxqs_n)) {
>> +		WARN("%p: number of RX queues (%u), must be a"
>> +		      " power of 2: remaining queues will be inactive",
> 
> I'm not sure how clear this warning message is. To the reader there are no
> extra "remaining" queues referred to, as it's not stated that the driver is
> allocating extra queues. How about e.g.:
> 
> WARN("%p: number of RX queues on device must by a power of 2. Allocating %u
> 	queues, of which %u will be active. Remaining queues will be inactive"...)
> 

You're right, I'll send a v2 with a clearer message.

Regards,
Olivier
  
Olivier Matz April 15, 2016, 9:11 a.m. UTC | #7
Hi,

On 03/22/2016 03:27 PM, Wiles, Keith wrote:
> Hi Olivier,
>
>> Hi Keith,
>>
>> On 03/21/2016 06:38 PM, Wiles, Keith wrote:
>>>> On Mar 21, 2016, at 11:10 AM, Olivier Matz <olivier.matz@6wind.com> wrote:
>>>>
>>>> When using RSS, the number of rxqs has to be a power of two.
>>>> This is a problem because there is no API is dpdk that makes
>>>> the application aware of that.
>>>>
>>>> A good compromise is to allow the application to request a
>>>> number of rxqs that is not a power of 2, but having inactive
>>>> queues that will never receive packets. In this configuration,
>>>> a warning will be issued to users to let them know that
>>>> this is not an optimal configuration.
>>>
>>> Not sure I like this solution. I think an error should be returned with a log message instead. What if the next driver needs power of three or must be odd or even number.
>>>
>>> The bigger problem is the application is no longer portable for any given nic configuration.
>>>
>>> We need a method for the application to query the system for these types of information. But as we do not have that API we need to just error the request off.
>>
>>
>> The initial problem is that the driver says "I support a maximum
>> of X queues" and if the application configures a lower number, it
>> gets an error.
>>
>> There is no API in DPDK to tell that only specific number of queues
>> are supported. Adding an API is a solution, but in this case it's
>> probably overkill. With this patch, the driver can present the proper
>> number of queues to the application, knowing that the spreading of
>> the packets won't be ideal (some queues won't receive packets), but
>> it will work.
>>
>> A step further in this direction would be to configure more queues
>> than asked in hardware to do a better spreading, almost similar to
>> what is done with RETA tables in mlx5. But this is more complicated
>> to do, especially if we want it for 16.04.
>
> Well I guess I must agree with the solution, but I am not real happy. Can we mark this a temp fix until we figure out a cleaner solution as I would not want this type of solution forever or be the standard way to handle these problems.

Back on this issue, I agree that a cleaner solution may be needed,
probably in the ethdev API. I did a quick look in the drivers directory
and, from what I remember, vmxnet3 also need to have a number of rxq
and txq to be a power of 2.

From what I see in the code, if an application tries to configure a
number of queue which is not a power of 2 on vmxnet3, the driver will
fail to start without any log.

Yong, do you feel a patch similar to what was done on mlx4 is
feasable/suitable in vmxnet3 driver? Shouldn't at least have some
error logs saying that the number of rxq/txq is invalid?

It cleanup or rework is planned in the ethdev API for 16.11, maybe
this is a problem that should be addressed.


Regards,
Olivier
  

Patch

diff --git a/drivers/net/mlx4/mlx4.c b/drivers/net/mlx4/mlx4.c
index cc4e9aa..eaf06db 100644
--- a/drivers/net/mlx4/mlx4.c
+++ b/drivers/net/mlx4/mlx4.c
@@ -698,7 +698,7 @@  txq_cleanup(struct txq *txq);
 
 static int
 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
-	  unsigned int socket, const struct rte_eth_rxconf *conf,
+	  unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
 	  struct rte_mempool *mp);
 
 static void
@@ -734,12 +734,12 @@  dev_configure(struct rte_eth_dev *dev)
 	}
 	if (rxqs_n == priv->rxqs_n)
 		return 0;
-	if ((rxqs_n & (rxqs_n - 1)) != 0) {
-		ERROR("%p: invalid number of RX queues (%u),"
-		      " must be a power of 2",
+	if (!rte_is_power_of_2(rxqs_n)) {
+		WARN("%p: number of RX queues (%u), must be a"
+		      " power of 2: remaining queues will be inactive",
 		      (void *)dev, rxqs_n);
-		return EINVAL;
 	}
+
 	INFO("%p: RX queues number update: %u -> %u",
 	     (void *)dev, priv->rxqs_n, rxqs_n);
 	/* If RSS is enabled, disable it first. */
@@ -775,7 +775,7 @@  dev_configure(struct rte_eth_dev *dev)
 	priv->rss = 1;
 	tmp = priv->rxqs_n;
 	priv->rxqs_n = rxqs_n;
-	ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, NULL, NULL);
+	ret = rxq_setup(dev, &priv->rxq_parent, 0, 0, 0, NULL, NULL);
 	if (!ret)
 		return 0;
 	/* Failure, rollback. */
@@ -3466,7 +3466,8 @@  rxq_setup_qp_rss(struct priv *priv, struct ibv_cq *cq, uint16_t desc,
 		attr.qpg.qpg_type = IBV_EXP_QPG_PARENT;
 		/* TSS isn't necessary. */
 		attr.qpg.parent_attrib.tss_child_count = 0;
-		attr.qpg.parent_attrib.rss_child_count = priv->rxqs_n;
+		attr.qpg.parent_attrib.rss_child_count =
+			rte_align32pow2(priv->rxqs_n + 1) >> 1;
 		DEBUG("initializing parent RSS queue");
 	} else {
 		attr.qpg.qpg_type = IBV_EXP_QPG_CHILD_RX;
@@ -3689,6 +3690,9 @@  skip_rtr:
  *   Number of descriptors to configure in queue.
  * @param socket
  *   NUMA socket on which memory must be allocated.
+ * @param inactive
+ *   If true, the queue is disabled because its index is higher or
+ *   equal to the real number of queues, which must be a power of 2.
  * @param[in] conf
  *   Thresholds parameters.
  * @param mp
@@ -3699,7 +3703,7 @@  skip_rtr:
  */
 static int
 rxq_setup(struct rte_eth_dev *dev, struct rxq *rxq, uint16_t desc,
-	  unsigned int socket, const struct rte_eth_rxconf *conf,
+	  unsigned int socket, int inactive, const struct rte_eth_rxconf *conf,
 	  struct rte_mempool *mp)
 {
 	struct priv *priv = dev->data->dev_private;
@@ -3800,7 +3804,7 @@  skip_mr:
 	DEBUG("priv->device_attr.max_sge is %d",
 	      priv->device_attr.max_sge);
 #ifdef RSS_SUPPORT
-	if (priv->rss)
+	if (priv->rss && !inactive)
 		tmpl.qp = rxq_setup_qp_rss(priv, tmpl.cq, desc, parent,
 					   tmpl.rd);
 	else
@@ -3936,6 +3940,7 @@  mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 {
 	struct priv *priv = dev->data->dev_private;
 	struct rxq *rxq = (*priv->rxqs)[idx];
+	int inactive = 0;
 	int ret;
 
 	if (mlx4_is_secondary())
@@ -3967,7 +3972,9 @@  mlx4_rx_queue_setup(struct rte_eth_dev *dev, uint16_t idx, uint16_t desc,
 			return -ENOMEM;
 		}
 	}
-	ret = rxq_setup(dev, rxq, desc, socket, conf, mp);
+	if (idx >= rte_align32pow2(priv->rxqs_n + 1) >> 1)
+		inactive = 1;
+	ret = rxq_setup(dev, rxq, desc, socket, inactive, conf, mp);
 	if (ret)
 		rte_free(rxq);
 	else {