[dpdk-dev,v3,2/2] virtio/vdev: add a new vdev named eth_cvio

Message ID 1461207396-42742-3-git-send-email-jianfeng.tan@intel.com (mailing list archive)
State Superseded, archived
Delegated to: Yuanhan Liu
Headers

Commit Message

Jianfeng Tan April 21, 2016, 2:56 a.m. UTC
  Add a new virtual device named eth_cvio, it can be used just like
eth_ring, eth_null, etc.

Configured parameters include:
  - rx (optional, 1 by default), number of rx, not used for now.
  - tx (optional, 1 by default), number of tx, not used for now.
  - cq (optional, 0 by default), not supported for now.
  - mac (optional), random value will be given if not specified.
  - queue_size (optional, 256 by default), size of virtqueue.
  - path (madatory), path of vhost, depends on the file type, vhost
    user if the given path points to a unix socket; vhost-net if the
    given path points to a char device.
  - ifname (optional), specify the name of backend tap device; only
    valid when backend is vhost-net.

The major difference with original virtio for vm is that, here we use
virtual addr instead of physical addr for vhost to calculate relative
address.

When enable CONFIG_RTE_VIRTIO_VDEV (enabled by default), the compiled
library can be used in both VM and container environment.

Examples:
path_vhost=/dev/vhost-net # use vhost-net as a backend
path_vhost=<path_to_vhost_user> # use vhost-user as a backend

sudo ./examples/l2fwd/build/l2fwd -c 0x100000 -n 4 \
    --socket-mem 0,1024 --no-pci --file-prefix=l2fwd \
    --vdev=eth_cvio0,mac=00:01:02:03:04:05,path=$path_vhost -- -p 0x1

Known issues:
 - Control queue and multi-queue are not supported yet.
 - Cannot work with --huge-unlink.
 - Cannot work with no-huge.
 - Cannot work when there are more than VHOST_MEMORY_MAX_NREGIONS(8)
   hugepages.
 - Root privilege is a must (mainly becase of sorting hugepages according
   to physical address).
 - Applications should not use file name like HUGEFILE_FMT ("%smap_%d").

Signed-off-by: Huawei Xie <huawei.xie@intel.com>
Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
Acked-By: Neil Horman <nhorman@tuxdrver.com>
---
 doc/guides/nics/overview.rst             |  58 +++++-----
 doc/guides/rel_notes/release_16_07.rst   |   4 +
 drivers/net/virtio/rte_eth_virtio_vdev.c | 188 ++++++++++++++++++++++++++++++-
 drivers/net/virtio/virtio_ethdev.c       | 134 ++++++++++++++--------
 drivers/net/virtio/virtio_ethdev.h       |   3 +
 drivers/net/virtio/virtio_pci.h          |   2 +-
 drivers/net/virtio/virtio_rxtx.c         |   5 +-
 drivers/net/virtio/virtio_rxtx_simple.c  |  13 ++-
 drivers/net/virtio/virtqueue.h           |  10 ++
 9 files changed, 326 insertions(+), 91 deletions(-)
  

Comments

David Marchand April 21, 2016, 8:51 a.m. UTC | #1
Hello,

On Thu, Apr 21, 2016 at 4:56 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
> Add a new virtual device named eth_cvio, it can be used just like
> eth_ring, eth_null, etc.
>
> Configured parameters include:
>   - rx (optional, 1 by default), number of rx, not used for now.
>   - tx (optional, 1 by default), number of tx, not used for now.
>   - cq (optional, 0 by default), not supported for now.
>   - mac (optional), random value will be given if not specified.
>   - queue_size (optional, 256 by default), size of virtqueue.
>   - path (madatory), path of vhost, depends on the file type, vhost
>     user if the given path points to a unix socket; vhost-net if the
>     given path points to a char device.
>   - ifname (optional), specify the name of backend tap device; only
>     valid when backend is vhost-net.
>
> The major difference with original virtio for vm is that, here we use
> virtual addr instead of physical addr for vhost to calculate relative
> address.
>
> When enable CONFIG_RTE_VIRTIO_VDEV (enabled by default), the compiled
> library can be used in both VM and container environment.

This implementation heavily relies on dev_type to keep as much code
shared between pci / vdev path as possible.

virtio code relies on drv_flags (even updating it while this should be
per-device).
So first, virtio should rely on dev_flags.

The rest needs to be astracted in some virtio ops ?


Thanks.
  
Thomas Monjalon April 21, 2016, 10:05 a.m. UTC | #2
2016-04-21 02:56, Jianfeng Tan:
> Add a new virtual device named eth_cvio, it can be used just like
> eth_ring, eth_null, etc.

Why this name eth_cvio?
Why the prefix eth_?
The virtio-net driver uses a kernel device. Here it is a userland device.
Why not virtio-user?

>  .. table:: Features availability in networking drivers
>  
> -   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
> -   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x
> -                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e
> -                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n
> -                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v
> -                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i
> -                        k   v n           . f f       . v v   . v v               t   o o t r
> -                        e   f g           .   .       . f f   . f f               a     . 3 t
> +   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
> +   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x c
> +                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e v
> +                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n i
> +                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v r
> +                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i t
> +                        k   v n           . f f       . v v   . v v               t   o o t r i
> +                        e   f g           .   .       . f f   . f f               a     . 3 t o
>                          t                 v   v       v   v   v   v               2     v
>                                            e   e       e   e   e   e                     e
>                                            c   c       c   c   c   c                     c
> -   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =

Please keep the alphabetical order.
  
Yuanhan Liu April 21, 2016, 10:14 p.m. UTC | #3
On Thu, Apr 21, 2016 at 02:56:36AM +0000, Jianfeng Tan wrote:
> Add a new virtual device named eth_cvio, it can be used just like
> eth_ring, eth_null, etc.
> 
> Configured parameters include:
>   - rx (optional, 1 by default), number of rx, not used for now.
>   - tx (optional, 1 by default), number of tx, not used for now.
>   - cq (optional, 0 by default), not supported for now.
>   - mac (optional), random value will be given if not specified.
>   - queue_size (optional, 256 by default), size of virtqueue.
>   - path (madatory), path of vhost, depends on the file type, vhost
>     user if the given path points to a unix socket; vhost-net if the
>     given path points to a char device.
>   - ifname (optional), specify the name of backend tap device; only
>     valid when backend is vhost-net.
> 
> The major difference with original virtio for vm is that, here we use
> virtual addr instead of physical addr for vhost to calculate relative
> address.
> 
> When enable CONFIG_RTE_VIRTIO_VDEV (enabled by default), the compiled
> library can be used in both VM and container environment.
> 
> Examples:
> path_vhost=/dev/vhost-net # use vhost-net as a backend
> path_vhost=<path_to_vhost_user> # use vhost-user as a backend
> 
> sudo ./examples/l2fwd/build/l2fwd -c 0x100000 -n 4 \
>     --socket-mem 0,1024 --no-pci --file-prefix=l2fwd \
>     --vdev=eth_cvio0,mac=00:01:02:03:04:05,path=$path_vhost -- -p 0x1
> 
> Known issues:
>  - Control queue and multi-queue are not supported yet.
>  - Cannot work with --huge-unlink.
>  - Cannot work with no-huge.
>  - Cannot work when there are more than VHOST_MEMORY_MAX_NREGIONS(8)
>    hugepages.
>  - Root privilege is a must (mainly becase of sorting hugepages according
>    to physical address).
>  - Applications should not use file name like HUGEFILE_FMT ("%smap_%d").
> 
> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
> Acked-By: Neil Horman <nhorman@tuxdrver.com>
> ---
>  doc/guides/nics/overview.rst             |  58 +++++-----
>  doc/guides/rel_notes/release_16_07.rst   |   4 +
>  drivers/net/virtio/rte_eth_virtio_vdev.c | 188 ++++++++++++++++++++++++++++++-

Why prefixing it with "rte_eth_..."?

> -   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
> -   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x
> -                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e
> -                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n
> -                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v
> -                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i
> -                        k   v n           . f f       . v v   . v v               t   o o t r
> -                        e   f g           .   .       . f f   . f f               a     . 3 t
> +   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
> +   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x c
> +                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e v
> +                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n i
> +                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v r
> +                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i t
> +                        k   v n           . f f       . v v   . v v               t   o o t r i
> +                        e   f g           .   .       . f f   . f f               a     . 3 t o
>                          t                 v   v       v   v   v   v               2     v


I would wish we have a diff that could do compare by columns but not by
rows :)


> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
> index 68097e6..3b47332 100644
> --- a/drivers/net/virtio/virtio_pci.h
> +++ b/drivers/net/virtio/virtio_pci.h
> @@ -264,7 +264,7 @@ struct virtio_hw {
>  #define VHOST_KERNEL	0
>  #define VHOST_USER	1
>  	int		type; /* type of backend */
> -	uint32_t	queue_num;
> +	uint32_t	queue_size;

Hmm, this kind of change should not be squeezed here, stealthily. I
would agree that the rename in decreases the stealthily, which is a
good thing. You should submit a standalone patch instead.

	--yliu
  
Jianfeng Tan April 22, 2016, 5:15 a.m. UTC | #4
Hi,

On 4/21/2016 4:51 PM, David Marchand wrote:
> Hello,
>
> On Thu, Apr 21, 2016 at 4:56 AM, Jianfeng Tan <jianfeng.tan@intel.com> wrote:
>> Add a new virtual device named eth_cvio, it can be used just like
>> eth_ring, eth_null, etc.
>>
>> Configured parameters include:
>>    - rx (optional, 1 by default), number of rx, not used for now.
>>    - tx (optional, 1 by default), number of tx, not used for now.
>>    - cq (optional, 0 by default), not supported for now.
>>    - mac (optional), random value will be given if not specified.
>>    - queue_size (optional, 256 by default), size of virtqueue.
>>    - path (madatory), path of vhost, depends on the file type, vhost
>>      user if the given path points to a unix socket; vhost-net if the
>>      given path points to a char device.
>>    - ifname (optional), specify the name of backend tap device; only
>>      valid when backend is vhost-net.
>>
>> The major difference with original virtio for vm is that, here we use
>> virtual addr instead of physical addr for vhost to calculate relative
>> address.
>>
>> When enable CONFIG_RTE_VIRTIO_VDEV (enabled by default), the compiled
>> library can be used in both VM and container environment.
> This implementation heavily relies on dev_type to keep as much code
> shared between pci / vdev path as possible.

Yes, I still have no method to make it more clear.

>
> virtio code relies on drv_flags (even updating it while this should be
> per-device).
> So first, virtio should rely on dev_flags.

Mainly drv_flags's RTE_PCI_DRV_INTR_LSC, and RTE_PCI_DRV_DETACHABLE bit 
is used. I understand the issue, pointed out by you here, that if two 
virtio devices are used in a VM, one with feature VIRTIO_NET_F_STATUS, 
and the other without feature VIRTIO_NET_F_STATUS (under the case that 
two vhost backends are used). Then it leads to uncertainty of the behavior.

Since the flags has been copied into dev_flags after features 
negotiated, I believe we should use dev_flags instead of drv_flags. A 
patch to fix this will be sent.


>
> The rest needs to be astracted in some virtio ops ?

So with that fix goes, we may make it more clear now. Do you mean this?

Thanks,
Jianfeng

>
>
> Thanks.
>
  
Jianfeng Tan April 22, 2016, 7:26 a.m. UTC | #5
Hi Thomas,

On 4/21/2016 6:05 PM, Thomas Monjalon wrote:
> 2016-04-21 02:56, Jianfeng Tan:
>> Add a new virtual device named eth_cvio, it can be used just like
>> eth_ring, eth_null, etc.
> Why this name eth_cvio?
> Why the prefix eth_?
> The virtio-net driver uses a kernel device. Here it is a userland device.
> Why not virtio-user?

I was looking for an appropriate name for this device; have tried a lot, 
but none is good enough. Thank you for your advice virtio-user.
The prefix eth_ is to keep the same style with eth_ring, eth_null.
So how about eth_virtio_user?

>
>>   .. table:: Features availability in networking drivers
>>   
>> -   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
>> -   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x
>> -                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e
>> -                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n
>> -                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v
>> -                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i
>> -                        k   v n           . f f       . v v   . v v               t   o o t r
>> -                        e   f g           .   .       . f f   . f f               a     . 3 t
>> +   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
>> +   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x c
>> +                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e v
>> +                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n i
>> +                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v r
>> +                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i t
>> +                        k   v n           . f f       . v v   . v v               t   o o t r i
>> +                        e   f g           .   .       . f f   . f f               a     . 3 t o
>>                           t                 v   v       v   v   v   v               2     v
>>                                             e   e       e   e   e   e                     e
>>                                             c   c       c   c   c   c                     c
>> -   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
> Please keep the alphabetical order.

OK, I'll do it in next version.

Thanks,
Jianfeng
  
David Marchand April 22, 2016, 7:36 a.m. UTC | #6
Hello,

On Fri, Apr 22, 2016 at 7:15 AM, Tan, Jianfeng <jianfeng.tan@intel.com> wrote:
> On 4/21/2016 4:51 PM, David Marchand wrote:
>> virtio code relies on drv_flags (even updating it while this should be
>> per-device).
>> So first, virtio should rely on dev_flags.
>
>
> Mainly drv_flags's RTE_PCI_DRV_INTR_LSC, and RTE_PCI_DRV_DETACHABLE bit is
> used. I understand the issue, pointed out by you here, that if two virtio
> devices are used in a VM, one with feature VIRTIO_NET_F_STATUS, and the
> other without feature VIRTIO_NET_F_STATUS (under the case that two vhost
> backends are used). Then it leads to uncertainty of the behavior.
>
> Since the flags has been copied into dev_flags after features negotiated, I
> believe we should use dev_flags instead of drv_flags. A patch to fix this
> will be sent.

Ok.

>> The rest needs to be astracted in some virtio ops ?
> So with that fix goes, we may make it more clear now. Do you mean this?

Well, here, we have what looks like to be two drivers (one pci and one vdev).
You tried to keep all this code together, to avoid duplicating it,
which sounds sane.
But in the end, you are trying to make this work by adding checks
where this can't.
I am not saying we should duplicate the code, but maybe having some
internal virtio ops / abstraction would do the trick and avoid those
checks.


The reason of those comments is that dev_type in ethdev is going to
disappear, see [1] and [2].
Drivers are called through their own specific ethdev/crypto ops and
so, those drivers know implicitely that their are either pci or vdev
(or whatever in the future) drivers.


[1]: http://dpdk.org/ml/archives/dev/2016-April/037686.html
[2]: http://dpdk.org/ml/archives/dev/2016-January/031390.html
  
Thomas Monjalon April 22, 2016, 8:30 a.m. UTC | #7
2016-04-22 15:26, Tan, Jianfeng:
> Hi Thomas,
> 
> On 4/21/2016 6:05 PM, Thomas Monjalon wrote:
> > 2016-04-21 02:56, Jianfeng Tan:
> >> Add a new virtual device named eth_cvio, it can be used just like
> >> eth_ring, eth_null, etc.
> > Why this name eth_cvio?
> > Why the prefix eth_?
> > The virtio-net driver uses a kernel device. Here it is a userland device.
> > Why not virtio-user?
> 
> I was looking for an appropriate name for this device; have tried a lot, 
> but none is good enough. Thank you for your advice virtio-user.
> The prefix eth_ is to keep the same style with eth_ring, eth_null.
> So how about eth_virtio_user?

I prefer not adding eth_. Just my opinion.
  
Jianfeng Tan April 22, 2016, 10:12 a.m. UTC | #8
Hi Yuanhan,

On 4/22/2016 6:14 AM, Yuanhan Liu wrote:
> On Thu, Apr 21, 2016 at 02:56:36AM +0000, Jianfeng Tan wrote:
>> Add a new virtual device named eth_cvio, it can be used just like
>> eth_ring, eth_null, etc.
>>
>> Configured parameters include:
>>    - rx (optional, 1 by default), number of rx, not used for now.
>>    - tx (optional, 1 by default), number of tx, not used for now.
>>    - cq (optional, 0 by default), not supported for now.
>>    - mac (optional), random value will be given if not specified.
>>    - queue_size (optional, 256 by default), size of virtqueue.
>>    - path (madatory), path of vhost, depends on the file type, vhost
>>      user if the given path points to a unix socket; vhost-net if the
>>      given path points to a char device.
>>    - ifname (optional), specify the name of backend tap device; only
>>      valid when backend is vhost-net.
>>
>> The major difference with original virtio for vm is that, here we use
>> virtual addr instead of physical addr for vhost to calculate relative
>> address.
>>
>> When enable CONFIG_RTE_VIRTIO_VDEV (enabled by default), the compiled
>> library can be used in both VM and container environment.
>>
>> Examples:
>> path_vhost=/dev/vhost-net # use vhost-net as a backend
>> path_vhost=<path_to_vhost_user> # use vhost-user as a backend
>>
>> sudo ./examples/l2fwd/build/l2fwd -c 0x100000 -n 4 \
>>      --socket-mem 0,1024 --no-pci --file-prefix=l2fwd \
>>      --vdev=eth_cvio0,mac=00:01:02:03:04:05,path=$path_vhost -- -p 0x1
>>
>> Known issues:
>>   - Control queue and multi-queue are not supported yet.
>>   - Cannot work with --huge-unlink.
>>   - Cannot work with no-huge.
>>   - Cannot work when there are more than VHOST_MEMORY_MAX_NREGIONS(8)
>>     hugepages.
>>   - Root privilege is a must (mainly becase of sorting hugepages according
>>     to physical address).
>>   - Applications should not use file name like HUGEFILE_FMT ("%smap_%d").
>>
>> Signed-off-by: Huawei Xie <huawei.xie@intel.com>
>> Signed-off-by: Jianfeng Tan <jianfeng.tan@intel.com>
>> Acked-By: Neil Horman <nhorman@tuxdrver.com>
>> ---
>>   doc/guides/nics/overview.rst             |  58 +++++-----
>>   doc/guides/rel_notes/release_16_07.rst   |   4 +
>>   drivers/net/virtio/rte_eth_virtio_vdev.c | 188 ++++++++++++++++++++++++++++++-
> Why prefixing it with "rte_eth_..."?

I was to make align with other virtual devices, like ring, null, etc. 
But like suggested by Thomas and you, I'll rename it.

>
>> -   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
>> -   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x
>> -                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e
>> -                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n
>> -                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v
>> -                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i
>> -                        k   v n           . f f       . v v   . v v               t   o o t r
>> -                        e   f g           .   .       . f f   . f f               a     . 3 t
>> +   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
>> +   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x c
>> +                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e v
>> +                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n i
>> +                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v r
>> +                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i t
>> +                        k   v n           . f f       . v v   . v v               t   o o t r i
>> +                        e   f g           .   .       . f f   . f f               a     . 3 t o
>>                           t                 v   v       v   v   v   v               2     v
>
> I would wish we have a diff that could do compare by columns but not by
> rows :)
>
>
>> diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
>> index 68097e6..3b47332 100644
>> --- a/drivers/net/virtio/virtio_pci.h
>> +++ b/drivers/net/virtio/virtio_pci.h
>> @@ -264,7 +264,7 @@ struct virtio_hw {
>>   #define VHOST_KERNEL	0
>>   #define VHOST_USER	1
>>   	int		type; /* type of backend */
>> -	uint32_t	queue_num;
>> +	uint32_t	queue_size;
> Hmm, this kind of change should not be squeezed here, stealthily. I
> would agree that the rename in decreases the stealthily, which is a
> good thing. You should submit a standalone patch instead.

Nice catch. I should do it in the first commit when it's defined.

Thank,
Jianfeng



>
> 	--yliu
  
Jianfeng Tan April 22, 2016, 10:25 a.m. UTC | #9
Hi,

On 4/22/2016 3:36 PM, David Marchand wrote:
> Hello,
>
> On Fri, Apr 22, 2016 at 7:15 AM, Tan, Jianfeng <jianfeng.tan@intel.com> wrote:
>> On 4/21/2016 4:51 PM, David Marchand wrote:
>>> virtio code relies on drv_flags (even updating it while this should be
>>> per-device).
>>> So first, virtio should rely on dev_flags.
>>
>> Mainly drv_flags's RTE_PCI_DRV_INTR_LSC, and RTE_PCI_DRV_DETACHABLE bit is
>> used. I understand the issue, pointed out by you here, that if two virtio
>> devices are used in a VM, one with feature VIRTIO_NET_F_STATUS, and the
>> other without feature VIRTIO_NET_F_STATUS (under the case that two vhost
>> backends are used). Then it leads to uncertainty of the behavior.
>>
>> Since the flags has been copied into dev_flags after features negotiated, I
>> believe we should use dev_flags instead of drv_flags. A patch to fix this
>> will be sent.
> Ok.
>
>>> The rest needs to be astracted in some virtio ops ?
>> So with that fix goes, we may make it more clear now. Do you mean this?
> Well, here, we have what looks like to be two drivers (one pci and one vdev).
> You tried to keep all this code together, to avoid duplicating it,
> which sounds sane.
> But in the end, you are trying to make this work by adding checks
> where this can't.
> I am not saying we should duplicate the code, but maybe having some
> internal virtio ops / abstraction would do the trick and avoid those
> checks.
>
>
> The reason of those comments is that dev_type in ethdev is going to
> disappear, see [1] and [2].
> Drivers are called through their own specific ethdev/crypto ops and
> so, those drivers know implicitely that their are either pci or vdev
> (or whatever in the future) drivers.
>
>
> [1]: http://dpdk.org/ml/archives/dev/2016-April/037686.html
> [2]: http://dpdk.org/ml/archives/dev/2016-January/031390.html

Thank you for this import information. A quick check, we can remove 
those checks, as you said, to virtio ops. Great!

Thanks,
Jianfeng
  

Patch

diff --git a/doc/guides/nics/overview.rst b/doc/guides/nics/overview.rst
index ed116e3..1ff72fb 100644
--- a/doc/guides/nics/overview.rst
+++ b/doc/guides/nics/overview.rst
@@ -74,40 +74,40 @@  Most of these differences are summarized below.
 
 .. table:: Features availability in networking drivers
 
-   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
-   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x
-                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e
-                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n
-                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v
-                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i
-                        k   v n           . f f       . v v   . v v               t   o o t r
-                        e   f g           .   .       . f f   . f f               a     . 3 t
+   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+   Feature              a b b b c e e e i i i i i i i i i i f f f f m m m n n p r s v v v v x c
+                        f n n o x 1 n n 4 4 4 4 g g x x x x m m m m l l p f u c i z h i i m e v
+                        p x x n g 0 a i 0 0 0 0 b b g g g g 1 1 1 1 x x i p l a n e o r r x n i
+                        a 2 2 d b 0   c e e e e   v b b b b 0 0 0 0 4 5 p   l p g d s t t n v r
+                        c x x i e 0       . v v   f e e e e k k k k     e         a t i i e i t
+                        k   v n           . f f       . v v   . v v               t   o o t r i
+                        e   f g           .   .       . f f   . f f               a     . 3 t o
                         t                 v   v       v   v   v   v               2     v
                                           e   e       e   e   e   e                     e
                                           c   c       c   c   c   c                     c
-   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
    speed capabilities
-   link status            X X   X X   X X X     X   X X X X         X X           X X X X
+   link status            X X   X X   X X X     X   X X X X         X X           X X X X     X
    link status event      X X     X     X X     X   X X             X X             X
    queue status event                                                               X
    Rx interrupt                   X     X X X X X X X X X X X X X X
-   queue start/stop             X   X X X X X X     X X     X X X X X X           X   X X
+   queue start/stop             X   X X X X X X     X X     X X X X X X           X   X X     X
    MTU update                   X X X           X   X X X X         X X
    jumbo frame                  X X X X X X X X X   X X X X X X X X X X       X
-   scattered Rx                 X X X   X X X X X X X X X X X X X X X X           X   X
+   scattered Rx                 X X X   X X X X X X X X X X X X X X X X           X   X       X
    LRO                                              X X X X
    TSO                          X   X   X X X X X X X X X X X X X X
-   promiscuous mode       X X   X X   X X X X X X X X X     X X     X X           X   X X
-   allmulticast mode            X X     X X X X X X X X X X X X     X X           X   X X
-   unicast MAC filter     X X     X   X X X X X X X X X X X X X     X X               X X
-   multicast MAC filter   X X         X X X X X             X X     X X               X X
+   promiscuous mode       X X   X X   X X X X X X X X X     X X     X X           X   X X     X
+   allmulticast mode            X X     X X X X X X X X X X X X     X X           X   X X     X
+   unicast MAC filter     X X     X   X X X X X X X X X X X X X     X X               X X     X
+   multicast MAC filter   X X         X X X X X             X X     X X               X X     X
    RSS hash                     X   X X X X X X X   X X X X X X X X X X
    RSS key update                   X   X X X X X   X X X X X X X X   X
    RSS reta update                  X   X X X X X   X X X X X X X X   X
    VMDq                                 X X     X   X X     X X
    SR-IOV                   X       X   X X     X   X X             X X
    DCB                                  X X     X   X X
-   VLAN filter                    X   X X X X X X X X X X X X X     X X               X X
+   VLAN filter                    X   X X X X X X X X X X X X X     X X               X X     X
    ethertype filter                     X X     X   X X
    n-tuple filter                               X   X X
    SYN filter                                   X   X X
@@ -127,23 +127,23 @@  Most of these differences are summarized below.
    inner L4 checksum                X   X   X       X   X           X
    packet type parsing          X     X X   X   X X X   X   X X X X X X
    timesync                             X X     X   X X
-   basic stats            X X   X X X X X X X X X X X X X X X X X X X X       X   X X X X
-   extended stats                   X   X X X X X X X X X X X X X X                   X X
-   stats per queue              X                   X X     X X X X X X           X   X X
+   basic stats            X X   X X X X X X X X X X X X X X X X X X X X       X   X X X X     X
+   extended stats                   X   X X X X X X X X X X X X X X                   X X     X
+   stats per queue              X                   X X     X X X X X X           X   X X     X
    EEPROM dump                                  X   X X
    registers dump                               X X X X X X
    multiprocess aware                   X X X X     X X X X X X X X X X       X
-   BSD nic_uio                  X X   X X X X X X X X X X X X X X X                   X X
-   Linux UIO              X X   X X X X X X X X X X X X X X X X X X                   X X
-   Linux VFIO                   X X   X X X X X X X X X X X X X X X                   X X
+   BSD nic_uio                  X X   X X X X X X X X X X X X X X X                   X X     X
+   Linux UIO              X X   X X X X X X X X X X X X X X X X X X                   X X     X
+   Linux VFIO                   X X   X X X X X X X X X X X X X X X                   X X     X
    other kdrv                                                       X X           X
-   ARMv7                                                                      X       X X
-   ARMv8                                                                      X       X X
+   ARMv7                                                                      X       X X     X
+   ARMv8                                                                      X       X X     X
    Power8                                                           X X       X
    TILE-Gx                                                                    X
-   x86-32                       X X X X X X X X X X X X X X X X X X X X       X     X X X
-   x86-64                 X X   X X X X X X X X X X X X X X X X X X X X       X   X X X X
-   usage doc              X X   X     X                             X X       X   X   X
+   x86-32                       X X X X X X X X X X X X X X X X X X X X       X     X X X     X
+   x86-64                 X X   X X X X X X X X X X X X X X X X X X X X       X   X X X X     X
+   usage doc              X X   X     X                             X X       X   X   X       X
    design doc
    perf doc
-   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
+   ==================== = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = = =
diff --git a/doc/guides/rel_notes/release_16_07.rst b/doc/guides/rel_notes/release_16_07.rst
index 701e827..f26639c 100644
--- a/doc/guides/rel_notes/release_16_07.rst
+++ b/doc/guides/rel_notes/release_16_07.rst
@@ -34,6 +34,10 @@  This section should contain new features added in this release. Sample format:
 
   Refer to the previous release notes for examples.
 
+* **Virtio support for containers.**
+
+  Add a new virtual device, named eth_cvio, to support virtio for containers.
+
 
 Resolved Issues
 ---------------
diff --git a/drivers/net/virtio/rte_eth_virtio_vdev.c b/drivers/net/virtio/rte_eth_virtio_vdev.c
index 419acef..282c9ae 100644
--- a/drivers/net/virtio/rte_eth_virtio_vdev.c
+++ b/drivers/net/virtio/rte_eth_virtio_vdev.c
@@ -50,6 +50,8 @@ 
 #include <rte_mbuf.h>
 #include <rte_memory.h>
 #include <rte_eal_memconfig.h>
+#include <rte_malloc.h>
+#include <rte_kvargs.h>
 
 #include "virtio_pci.h"
 #include "virtio_logs.h"
@@ -692,7 +694,7 @@  static uint16_t
 vdev_get_queue_num(struct virtio_hw *hw,
 		   uint16_t queue_id __rte_unused)
 {
-	return hw->queue_num;
+	return hw->queue_size;
 }
 
 static void
@@ -832,7 +834,7 @@  vhost_user_backend_setup(struct virtio_hw *hw)
 static void
 virtio_vdev_init(struct rte_eth_dev_data *data, char *path,
 		 int nb_rx, int nb_tx, int nb_cq __attribute__ ((unused)),
-		 int queue_num, char *mac, char *ifname)
+		 int queue_size, char *mac, char *ifname)
 {
 	int i, r;
 	struct stat s;
@@ -847,7 +849,7 @@  virtio_vdev_init(struct rte_eth_dev_data *data, char *path,
 	hw->path = strdup(path);
 	hw->max_rx_queues = nb_rx;
 	hw->max_tx_queues = nb_tx;
-	hw->queue_num = queue_num;
+	hw->queue_size = queue_size;
 	hw->mac_specified = 0;
 	if (mac) {
 		r = sscanf(mac, "%x:%x:%x:%x:%x:%x", &tmp[0],
@@ -895,3 +897,183 @@  virtio_vdev_uninit(struct rte_eth_dev_data *data)
 
 	close(hw->vhostfd);
 }
+
+static const char *valid_args[] = {
+#define CVIO_ARG_RX_NUM         "rx"
+	CVIO_ARG_RX_NUM,
+#define CVIO_ARG_TX_NUM         "tx"
+	CVIO_ARG_TX_NUM,
+#define CVIO_ARG_CQ_NUM         "cq"
+	CVIO_ARG_CQ_NUM,
+#define CVIO_ARG_MAC            "mac"
+	CVIO_ARG_MAC,
+#define CVIO_ARG_PATH           "path"
+	CVIO_ARG_PATH,
+#define CVIO_ARG_QUEUE_SIZE     "queue_size"
+	CVIO_ARG_QUEUE_SIZE,
+#define CVIO_ARG_IFNAME         "ifname"
+	CVIO_ARG_IFNAME,
+	NULL
+};
+
+#define CVIO_DEF_CQ_EN	0
+#define CVIO_DEF_Q_NUM	1
+#define CVIO_DEF_Q_SZ	256
+
+static int
+get_string_arg(const char *key __rte_unused,
+	       const char *value, void *extra_args)
+{
+	if (!value || !extra_args)
+		return -EINVAL;
+
+	*(char **)extra_args = strdup(value);
+
+	return 0;
+}
+
+static int
+get_integer_arg(const char *key __rte_unused,
+		const char *value, void *extra_args)
+{
+	if (!value || !extra_args)
+		return -EINVAL;
+
+	*(uint64_t *)extra_args = strtoull(value, NULL, 0);
+
+	return 0;
+}
+
+static struct rte_eth_dev *
+cvio_eth_dev_alloc(const char *name)
+{
+	struct rte_eth_dev *eth_dev;
+	struct rte_eth_dev_data *data;
+	struct virtio_hw *hw;
+
+	eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL);
+	if (!eth_dev)
+		rte_panic("cannot alloc rte_eth_dev\n");
+
+	data = eth_dev->data;
+
+	hw = rte_zmalloc(NULL, sizeof(*hw), 0);
+	if (!hw)
+		rte_panic("malloc virtio_hw failed\n");
+
+	data->dev_private = hw;
+	data->numa_node = SOCKET_ID_ANY;
+	data->kdrv = RTE_KDRV_NONE;
+	data->dev_flags = RTE_ETH_DEV_DETACHABLE;
+	eth_dev->pci_dev = NULL;
+	eth_dev->driver = NULL;
+	return eth_dev;
+}
+
+/* Dev initialization routine. Invoked once for each virtio vdev at
+ * EAL init time, see rte_eal_dev_init().
+ * Returns 0 on success.
+ */
+static int
+rte_cvio_pmd_devinit(const char *name, const char *params)
+{
+	struct rte_kvargs *kvlist = NULL;
+	struct rte_eth_dev *eth_dev = NULL;
+	uint64_t nb_rx = CVIO_DEF_Q_NUM;
+	uint64_t nb_tx = CVIO_DEF_Q_NUM;
+	uint64_t nb_cq = CVIO_DEF_CQ_EN;
+	uint64_t queue_size = CVIO_DEF_Q_SZ;
+	char *sock_path = NULL;
+	char *mac_addr = NULL;
+	char *ifname = NULL;
+
+	if (!params || params[0] == '\0')
+		rte_panic("arg %s is mandatory for eth_cvio\n",
+			  CVIO_ARG_QUEUE_SIZE);
+
+	kvlist = rte_kvargs_parse(params, valid_args);
+	if (!kvlist)
+		rte_panic("error when parsing param\n");
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_PATH) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_PATH,
+				   &get_string_arg, &sock_path);
+	else
+		rte_panic("arg %s is mandatory for eth_cvio\n",
+			  CVIO_ARG_QUEUE_SIZE);
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_MAC) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_MAC,
+				   &get_string_arg, &mac_addr);
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_IFNAME) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_IFNAME,
+				   &get_string_arg, &ifname);
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_QUEUE_SIZE) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_QUEUE_SIZE,
+				   &get_integer_arg, &queue_size);
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_RX_NUM) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_RX_NUM,
+				   &get_integer_arg, &nb_rx);
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_TX_NUM) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_TX_NUM,
+				   &get_integer_arg, &nb_tx);
+
+	if (rte_kvargs_count(kvlist, CVIO_ARG_CQ_NUM) == 1)
+		rte_kvargs_process(kvlist, CVIO_ARG_CQ_NUM,
+				   &get_integer_arg, &nb_cq);
+
+	eth_dev = cvio_eth_dev_alloc(name);
+
+	virtio_vdev_init(eth_dev->data, sock_path, nb_rx, nb_tx, nb_cq,
+			 queue_size, mac_addr, ifname);
+	if (sock_path)
+		free(sock_path);
+	if (mac_addr)
+		free(mac_addr);
+	if (ifname)
+		free(ifname);
+
+	/* originally, this will be called in rte_eal_pci_probe() */
+	eth_virtio_dev_init(eth_dev);
+
+	return 0;
+}
+
+/** Called by rte_eth_dev_detach() */
+static int
+rte_cvio_pmd_devuninit(const char *name)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+
+	if (!name)
+		return -EINVAL;
+
+	PMD_DRV_LOG(INFO, PMD, "Un-Initializing %s\n", name);
+	eth_dev = rte_eth_dev_allocated(name);
+	if (!eth_dev)
+		return -ENODEV;
+
+	/* make sure the device is stopped, queues freed */
+	rte_eth_dev_close(eth_dev->data->port_id);
+
+	virtio_vdev_uninit(eth_dev->data);
+
+	rte_free(eth_dev->data->dev_private);
+	rte_free(eth_dev->data);
+	rte_eth_dev_release_port(eth_dev);
+
+	return 0;
+}
+
+static struct rte_driver rte_cvio_driver = {
+	.name   = "eth_cvio",
+	.type   = PMD_VDEV,
+	.init   = rte_cvio_pmd_devinit,
+	.uninit = rte_cvio_pmd_devuninit,
+};
+
+PMD_REGISTER_DRIVER(rte_cvio_driver);
diff --git a/drivers/net/virtio/virtio_ethdev.c b/drivers/net/virtio/virtio_ethdev.c
index 63a368a..9d28384 100644
--- a/drivers/net/virtio/virtio_ethdev.c
+++ b/drivers/net/virtio/virtio_ethdev.c
@@ -60,7 +60,6 @@ 
 #include "virtio_rxtx.h"
 
 
-static int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
 static int eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev);
 static int  virtio_dev_configure(struct rte_eth_dev *dev);
 static int  virtio_dev_start(struct rte_eth_dev *dev);
@@ -168,14 +167,14 @@  virtio_send_command(struct virtqueue *vq, struct virtio_pmd_ctrl *ctrl,
 	 * One RX packet for ACK.
 	 */
 	vq->vq_ring.desc[head].flags = VRING_DESC_F_NEXT;
-	vq->vq_ring.desc[head].addr = vq->virtio_net_hdr_mz->phys_addr;
+	vq->vq_ring.desc[head].addr = vq->virtio_net_hdr_mem;
 	vq->vq_ring.desc[head].len = sizeof(struct virtio_net_ctrl_hdr);
 	vq->vq_free_cnt--;
 	i = vq->vq_ring.desc[head].next;
 
 	for (k = 0; k < pkt_num; k++) {
 		vq->vq_ring.desc[i].flags = VRING_DESC_F_NEXT;
-		vq->vq_ring.desc[i].addr = vq->virtio_net_hdr_mz->phys_addr
+		vq->vq_ring.desc[i].addr = vq->virtio_net_hdr_mem
 			+ sizeof(struct virtio_net_ctrl_hdr)
 			+ sizeof(ctrl->status) + sizeof(uint8_t)*sum;
 		vq->vq_ring.desc[i].len = dlen[k];
@@ -185,7 +184,7 @@  virtio_send_command(struct virtqueue *vq, struct virtio_pmd_ctrl *ctrl,
 	}
 
 	vq->vq_ring.desc[i].flags = VRING_DESC_F_WRITE;
-	vq->vq_ring.desc[i].addr = vq->virtio_net_hdr_mz->phys_addr
+	vq->vq_ring.desc[i].addr = vq->virtio_net_hdr_mem
 			+ sizeof(struct virtio_net_ctrl_hdr);
 	vq->vq_ring.desc[i].len = sizeof(ctrl->status);
 	vq->vq_free_cnt--;
@@ -363,20 +362,27 @@  int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 		}
 	}
 
-	/*
-	 * Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
-	 * and only accepts 32 bit page frame number.
-	 * Check if the allocated physical memory exceeds 16TB.
-	 */
-	if ((mz->phys_addr + vq->vq_ring_size - 1) >> (VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
-		PMD_INIT_LOG(ERR, "vring address shouldn't be above 16TB!");
-		rte_free(vq);
-		return -ENOMEM;
-	}
-
 	memset(mz->addr, 0, sizeof(mz->len));
 	vq->mz = mz;
-	vq->vq_ring_mem = mz->phys_addr;
+	if (dev->dev_type == RTE_ETH_DEV_PCI) {
+		vq->vq_ring_mem = mz->phys_addr;
+
+		/*
+		 * Virtio PCI device VIRTIO_PCI_QUEUE_PF register is 32bit,
+		 * and only accepts 32 bit page frame number.
+		 * Check if the allocated physical memory exceeds 16TB.
+		 */
+		if ((mz->phys_addr + vq->vq_ring_size - 1) >>
+				(VIRTIO_PCI_QUEUE_ADDR_SHIFT + 32)) {
+			PMD_INIT_LOG(ERR, "vring address shouldn't be above 16TB!");
+			rte_free(vq);
+			return -ENOMEM;
+		}
+	}
+#ifdef RTE_VIRTIO_VDEV
+	else
+		vq->vq_ring_mem = (phys_addr_t)mz->addr; /* Use vaddr!!! */
+#endif
 	vq->vq_ring_virt_mem = mz->addr;
 	PMD_INIT_LOG(DEBUG, "vq->vq_ring_mem:      0x%"PRIx64, (uint64_t)mz->phys_addr);
 	PMD_INIT_LOG(DEBUG, "vq->vq_ring_virt_mem: 0x%"PRIx64, (uint64_t)(uintptr_t)mz->addr);
@@ -407,7 +413,12 @@  int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 			}
 		}
 		vq->virtio_net_hdr_mz = hdr_mz;
-		vq->virtio_net_hdr_mem = hdr_mz->phys_addr;
+		if (dev->dev_type == RTE_ETH_DEV_PCI)
+			vq->virtio_net_hdr_mem = hdr_mz->phys_addr;
+#ifdef RTE_VIRTIO_VDEV
+		else
+			vq->virtio_net_hdr_mem = (phys_addr_t)hdr_mz->addr;
+#endif
 
 		txr = hdr_mz->addr;
 		memset(txr, 0, vq_size * sizeof(*txr));
@@ -440,13 +451,24 @@  int virtio_dev_queue_setup(struct rte_eth_dev *dev,
 				return -ENOMEM;
 			}
 		}
-		vq->virtio_net_hdr_mem =
-			vq->virtio_net_hdr_mz->phys_addr;
+		if (dev->dev_type == RTE_ETH_DEV_PCI)
+			vq->virtio_net_hdr_mem = mz->phys_addr;
+#ifdef RTE_VIRTIO_VDEV
+		else
+			vq->virtio_net_hdr_mem = (phys_addr_t)mz->addr;
+#endif
 		memset(vq->virtio_net_hdr_mz->addr, 0, PAGE_SIZE);
 	}
 
 	hw->vtpci_ops->setup_queue(hw, vq);
 
+	if (dev->dev_type == RTE_ETH_DEV_PCI)
+		vq->offset = offsetof(struct rte_mbuf, buf_physaddr);
+#ifdef RTE_VIRTIO_VDEV
+	else
+		vq->offset = offsetof(struct rte_mbuf, buf_addr);
+#endif
+
 	*pvq = vq;
 	return 0;
 }
@@ -499,8 +521,9 @@  virtio_dev_close(struct rte_eth_dev *dev)
 		virtio_dev_stop(dev);
 
 	/* reset the NIC */
-	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
-		vtpci_irq_config(hw, VIRTIO_MSI_NO_VECTOR);
+	if (dev->dev_type == RTE_ETH_DEV_PCI)
+		if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
+			vtpci_irq_config(hw, VIRTIO_MSI_NO_VECTOR);
 	vtpci_reset(hw);
 	virtio_dev_free_mbufs(dev);
 	virtio_free_queues(dev);
@@ -1002,8 +1025,9 @@  virtio_interrupt_handler(__rte_unused struct rte_intr_handle *handle,
 	isr = vtpci_isr(hw);
 	PMD_DRV_LOG(INFO, "interrupt status = %#x", isr);
 
-	if (rte_intr_enable(&dev->pci_dev->intr_handle) < 0)
-		PMD_DRV_LOG(ERR, "interrupt enable failed");
+	if (dev->dev_type == RTE_ETH_DEV_PCI)
+		if (rte_intr_enable(&dev->pci_dev->intr_handle) < 0)
+			PMD_DRV_LOG(ERR, "interrupt enable failed");
 
 	if (isr & VIRTIO_PCI_ISR_CONFIG) {
 		if (virtio_dev_link_update(dev, 0) == 0)
@@ -1027,7 +1051,7 @@  rx_func_get(struct rte_eth_dev *eth_dev)
  * This function is based on probe() function in virtio_pci.c
  * It returns 0 on success.
  */
-static int
+int
 eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
 {
 	struct virtio_hw *hw = eth_dev->data->dev_private;
@@ -1057,9 +1081,11 @@  eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
 
 	pci_dev = eth_dev->pci_dev;
 
-	ret = vtpci_init(pci_dev, hw);
-	if (ret)
-		return ret;
+	if (eth_dev->dev_type == RTE_ETH_DEV_PCI) {
+		ret = vtpci_init(pci_dev, hw);
+		if (ret)
+			return ret;
+	}
 
 	/* Reset the device although not necessary at startup */
 	vtpci_reset(hw);
@@ -1073,10 +1099,12 @@  eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
 		return -1;
 
 	/* If host does not support status then disable LSC */
-	if (!vtpci_with_feature(hw, VIRTIO_NET_F_STATUS))
-		pci_dev->driver->drv_flags &= ~RTE_PCI_DRV_INTR_LSC;
+	if (eth_dev->dev_type == RTE_ETH_DEV_PCI) {
+		if (!vtpci_with_feature(hw, VIRTIO_NET_F_STATUS))
+			pci_dev->driver->drv_flags &= ~RTE_PCI_DRV_INTR_LSC;
 
-	rte_eth_copy_pci_info(eth_dev, pci_dev);
+		rte_eth_copy_pci_info(eth_dev, pci_dev);
+	}
 
 	rx_func_get(eth_dev);
 
@@ -1150,15 +1178,17 @@  eth_virtio_dev_init(struct rte_eth_dev *eth_dev)
 
 	PMD_INIT_LOG(DEBUG, "hw->max_rx_queues=%d   hw->max_tx_queues=%d",
 			hw->max_rx_queues, hw->max_tx_queues);
-	PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x",
-			eth_dev->data->port_id, pci_dev->id.vendor_id,
-			pci_dev->id.device_id);
-
-	/* Setup interrupt callback  */
-	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
-		rte_intr_callback_register(&pci_dev->intr_handle,
-				   virtio_interrupt_handler, eth_dev);
-
+	if (eth_dev->dev_type == RTE_ETH_DEV_PCI) {
+		PMD_INIT_LOG(DEBUG, "port %d vendorID=0x%x deviceID=0x%x",
+			     eth_dev->data->port_id, pci_dev->id.vendor_id,
+			     pci_dev->id.device_id);
+
+		/* Setup interrupt callback  */
+		if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
+			rte_intr_callback_register(&pci_dev->intr_handle,
+						   virtio_interrupt_handler,
+						   eth_dev);
+	}
 	virtio_dev_cq_start(eth_dev);
 
 	return 0;
@@ -1190,10 +1220,11 @@  eth_virtio_dev_uninit(struct rte_eth_dev *eth_dev)
 	eth_dev->data->mac_addrs = NULL;
 
 	/* reset interrupt callback  */
-	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
-		rte_intr_callback_unregister(&pci_dev->intr_handle,
-						virtio_interrupt_handler,
-						eth_dev);
+	if (eth_dev->dev_type == RTE_ETH_DEV_PCI)
+		if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
+			rte_intr_callback_unregister(&pci_dev->intr_handle,
+						     virtio_interrupt_handler,
+						     eth_dev);
 	rte_eal_pci_unmap_device(pci_dev);
 
 	PMD_INIT_LOG(DEBUG, "dev_uninit completed");
@@ -1258,11 +1289,13 @@  virtio_dev_configure(struct rte_eth_dev *dev)
 		return -ENOTSUP;
 	}
 
-	if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
-		if (vtpci_irq_config(hw, 0) == VIRTIO_MSI_NO_VECTOR) {
-			PMD_DRV_LOG(ERR, "failed to set config vector");
-			return -EBUSY;
-		}
+	if (dev->dev_type == RTE_ETH_DEV_PCI) {
+		if (pci_dev->driver->drv_flags & RTE_PCI_DRV_INTR_LSC)
+			if (vtpci_irq_config(hw, 0) == VIRTIO_MSI_NO_VECTOR) {
+				PMD_DRV_LOG(ERR, "failed to set config vector");
+				return -EBUSY;
+			}
+	}
 
 	return 0;
 }
@@ -1431,7 +1464,10 @@  virtio_dev_info_get(struct rte_eth_dev *dev, struct rte_eth_dev_info *dev_info)
 {
 	struct virtio_hw *hw = dev->data->dev_private;
 
-	dev_info->driver_name = dev->driver->pci_drv.name;
+	if (dev->dev_type == RTE_ETH_DEV_PCI)
+		dev_info->driver_name = dev->driver->pci_drv.name;
+	else
+		dev_info->driver_name = "cvirtio PMD";
 	dev_info->max_rx_queues = (uint16_t)hw->max_rx_queues;
 	dev_info->max_tx_queues = (uint16_t)hw->max_tx_queues;
 	dev_info->min_rx_bufsize = VIRTIO_MIN_RX_BUFSIZE;
diff --git a/drivers/net/virtio/virtio_ethdev.h b/drivers/net/virtio/virtio_ethdev.h
index 3f3b032..284afaa 100644
--- a/drivers/net/virtio/virtio_ethdev.h
+++ b/drivers/net/virtio/virtio_ethdev.h
@@ -113,6 +113,8 @@  uint16_t virtio_recv_pkts_vec(void *rx_queue, struct rte_mbuf **rx_pkts,
 uint16_t virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 		uint16_t nb_pkts);
 
+int eth_virtio_dev_init(struct rte_eth_dev *eth_dev);
+
 /*
  * The VIRTIO_NET_F_GUEST_TSO[46] features permit the host to send us
  * frames larger than 1514 bytes. We do not yet support software LRO
@@ -121,4 +123,5 @@  uint16_t virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 #define VTNET_LRO_FEATURES (VIRTIO_NET_F_GUEST_TSO4 | \
 			    VIRTIO_NET_F_GUEST_TSO6 | VIRTIO_NET_F_GUEST_ECN)
 
+
 #endif /* _VIRTIO_ETHDEV_H_ */
diff --git a/drivers/net/virtio/virtio_pci.h b/drivers/net/virtio/virtio_pci.h
index 68097e6..3b47332 100644
--- a/drivers/net/virtio/virtio_pci.h
+++ b/drivers/net/virtio/virtio_pci.h
@@ -264,7 +264,7 @@  struct virtio_hw {
 #define VHOST_KERNEL	0
 #define VHOST_USER	1
 	int		type; /* type of backend */
-	uint32_t	queue_num;
+	uint32_t	queue_size;
 	char		*path;
 	int		mac_specified;
 	int		vhostfd;
diff --git a/drivers/net/virtio/virtio_rxtx.c b/drivers/net/virtio/virtio_rxtx.c
index ef21d8e..9d7e537 100644
--- a/drivers/net/virtio/virtio_rxtx.c
+++ b/drivers/net/virtio/virtio_rxtx.c
@@ -193,8 +193,7 @@  virtqueue_enqueue_recv_refill(struct virtqueue *vq, struct rte_mbuf *cookie)
 
 	start_dp = vq->vq_ring.desc;
 	start_dp[idx].addr =
-		(uint64_t)(cookie->buf_physaddr + RTE_PKTMBUF_HEADROOM
-		- hw->vtnet_hdr_size);
+		MBUF_DATA_DMA_ADDR(cookie, vq->offset) - hw->vtnet_hdr_size;
 	start_dp[idx].len =
 		cookie->buf_len - RTE_PKTMBUF_HEADROOM + hw->vtnet_hdr_size;
 	start_dp[idx].flags =  VRING_DESC_F_WRITE;
@@ -265,7 +264,7 @@  virtqueue_enqueue_xmit(struct virtqueue *txvq, struct rte_mbuf *cookie,
 	}
 
 	do {
-		start_dp[idx].addr  = rte_mbuf_data_dma_addr(cookie);
+		start_dp[idx].addr  = MBUF_DATA_DMA_ADDR(cookie, txvq->offset);
 		start_dp[idx].len   = cookie->data_len;
 		start_dp[idx].flags = cookie->next ? VRING_DESC_F_NEXT : 0;
 		idx = start_dp[idx].next;
diff --git a/drivers/net/virtio/virtio_rxtx_simple.c b/drivers/net/virtio/virtio_rxtx_simple.c
index 8f5293d..599c42e 100644
--- a/drivers/net/virtio/virtio_rxtx_simple.c
+++ b/drivers/net/virtio/virtio_rxtx_simple.c
@@ -80,8 +80,8 @@  virtqueue_enqueue_recv_refill_simple(struct virtqueue *vq,
 	vq->sw_ring[desc_idx] = cookie;
 
 	start_dp = vq->vq_ring.desc;
-	start_dp[desc_idx].addr = (uint64_t)((uintptr_t)cookie->buf_physaddr +
-		RTE_PKTMBUF_HEADROOM - vq->hw->vtnet_hdr_size);
+	start_dp[desc_idx].addr = MBUF_DATA_DMA_ADDR(cookie, vq->offset) -
+		vq->hw->vtnet_hdr_size;
 	start_dp[desc_idx].len = cookie->buf_len -
 		RTE_PKTMBUF_HEADROOM + vq->hw->vtnet_hdr_size;
 
@@ -119,8 +119,8 @@  virtio_rxq_rearm_vec(struct virtqueue *rxvq)
 		*(uint64_t *)p = rxvq->mbuf_initializer;
 
 		start_dp[i].addr =
-			(uint64_t)((uintptr_t)sw_ring[i]->buf_physaddr +
-			RTE_PKTMBUF_HEADROOM - rxvq->hw->vtnet_hdr_size);
+			MBUF_DATA_DMA_ADDR(sw_ring[i], rxvq->offset) -
+			rxvq->hw->vtnet_hdr_size;
 		start_dp[i].len = sw_ring[i]->buf_len -
 			RTE_PKTMBUF_HEADROOM + rxvq->hw->vtnet_hdr_size;
 	}
@@ -366,7 +366,7 @@  virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 			txvq->vq_descx[desc_idx + i].cookie = tx_pkts[i];
 		for (i = 0; i < nb_tail; i++) {
 			start_dp[desc_idx].addr =
-				rte_mbuf_data_dma_addr(*tx_pkts);
+				MBUF_DATA_DMA_ADDR(*tx_pkts, txvq->offset);
 			start_dp[desc_idx].len = (*tx_pkts)->pkt_len;
 			tx_pkts++;
 			desc_idx++;
@@ -377,7 +377,8 @@  virtio_xmit_pkts_simple(void *tx_queue, struct rte_mbuf **tx_pkts,
 	for (i = 0; i < nb_commit; i++)
 		txvq->vq_descx[desc_idx + i].cookie = tx_pkts[i];
 	for (i = 0; i < nb_commit; i++) {
-		start_dp[desc_idx].addr = rte_mbuf_data_dma_addr(*tx_pkts);
+		start_dp[desc_idx].addr =
+			MBUF_DATA_DMA_ADDR(*tx_pkts, txvq->offset);
 		start_dp[desc_idx].len = (*tx_pkts)->pkt_len;
 		tx_pkts++;
 		desc_idx++;
diff --git a/drivers/net/virtio/virtqueue.h b/drivers/net/virtio/virtqueue.h
index 4e9239e..81b5283 100644
--- a/drivers/net/virtio/virtqueue.h
+++ b/drivers/net/virtio/virtqueue.h
@@ -66,6 +66,14 @@  struct rte_mbuf;
 
 #define VIRTQUEUE_MAX_NAME_SZ 32
 
+#ifdef RTE_VIRTIO_VDEV
+#define MBUF_DATA_DMA_ADDR(mb, offset) \
+	((uint64_t)((uintptr_t)(*(void **)((uintptr_t)mb + offset)) \
+			+ (mb)->data_off))
+#else /* RTE_VIRTIO_VDEV */
+#define MBUF_DATA_DMA_ADDR(mb, offset) rte_mbuf_data_dma_addr(mb)
+#endif /* RTE_VIRTIO_VDEV */
+
 #define VTNET_SQ_RQ_QUEUE_IDX 0
 #define VTNET_SQ_TQ_QUEUE_IDX 1
 #define VTNET_SQ_CQ_QUEUE_IDX 2
@@ -165,6 +173,7 @@  struct virtqueue {
 	void        *vq_ring_virt_mem;    /**< linear address of vring*/
 	unsigned int vq_ring_size;
 	phys_addr_t vq_ring_mem;          /**< physical address of vring */
+					  /**< virtual address for vdev. */
 
 	struct vring vq_ring;    /**< vring keeping desc, used and avail */
 	uint16_t    vq_free_cnt; /**< num of desc available */
@@ -183,6 +192,7 @@  struct virtqueue {
 	 */
 	uint16_t vq_used_cons_idx;
 	uint16_t vq_avail_idx;
+	uint16_t offset; /**< relative offset to obtain addr in mbuf */
 	uint64_t mbuf_initializer; /**< value to init mbufs. */
 	phys_addr_t virtio_net_hdr_mem; /**< hdr for each xmit packet */