[dpdk-dev,v3,2/2] vhost: Add VHOST PMD

Message ID: 1447046221-20811-3-git-send-email-mukawa@igel.co.jp (mailing list archive)
State: Changes Requested, archived

Commit Message

Tetsuya Mukawa Nov. 9, 2015, 5:17 a.m. UTC
  The patch introduces a new PMD. This PMD is implemented as a thin wrapper
around librte_vhost, which means librte_vhost is also needed to compile the
PMD. The vhost messages are handled only while a port is started, so start
the port first, then invoke QEMU.

The PMD has 2 parameters.
 - iface:  Specifies the path used to connect to a virtio-net device.
 - queues: Specifies the number of queues the virtio-net device has.
           (Default: 1)

Here is an example:
$ ./testpmd -c f -n 4 --vdev 'eth_vhost0,iface=/tmp/sock0,queues=1' -- -i

To connect to the above testpmd instance, here is an example QEMU command:

$ qemu-system-x86_64 \
        <snip>
        -chardev socket,id=chr0,path=/tmp/sock0 \
        -netdev vhost-user,id=net0,chardev=chr0,vhostforce,queues=1 \
        -device virtio-net-pci,netdev=net0

Signed-off-by: Tetsuya Mukawa <mukawa@igel.co.jp>
---
 config/common_linuxapp                      |   6 +
 doc/guides/nics/index.rst                   |   1 +
 doc/guides/rel_notes/release_2_2.rst        |   2 +
 drivers/net/Makefile                        |   4 +
 drivers/net/vhost/Makefile                  |  62 +++
 drivers/net/vhost/rte_eth_vhost.c           | 768 ++++++++++++++++++++++++++++
 drivers/net/vhost/rte_eth_vhost.h           |  65 +++
 drivers/net/vhost/rte_pmd_vhost_version.map |   8 +
 mk/rte.app.mk                               |   8 +-
 9 files changed, 923 insertions(+), 1 deletion(-)
 create mode 100644 drivers/net/vhost/Makefile
 create mode 100644 drivers/net/vhost/rte_eth_vhost.c
 create mode 100644 drivers/net/vhost/rte_eth_vhost.h
 create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
  

Comments

Yuanhan Liu Nov. 9, 2015, 6:21 a.m. UTC | #1
Hi Tetsuya,

Here are just some minor nits from a very rough first pass.

On Mon, Nov 09, 2015 at 02:17:01PM +0900, Tetsuya Mukawa wrote:
...
> +static uint16_t
> +eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
> +{
> +	struct vhost_queue *r = q;
> +	uint16_t nb_rx = 0;
> +
> +	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
> +		return 0;
> +
> +	rte_atomic32_set(&r->while_queuing, 1);
> +
> +	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
> +		goto out;
> +
> +	/* Dequeue packets from guest TX queue */
> +	nb_rx = (uint16_t)rte_vhost_dequeue_burst(r->device,
> +			r->virtqueue_id, r->mb_pool, bufs, nb_bufs);

Unnecessary cast, as rte_vhost_dequeue_burst is defined with a uint16_t
return type.
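
That is, the assignment can simply read:

	nb_rx = rte_vhost_dequeue_burst(r->device,
			r->virtqueue_id, r->mb_pool, bufs, nb_bufs);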

> +
> +	r->rx_pkts += nb_rx;
> +
> +out:
> +	rte_atomic32_set(&r->while_queuing, 0);
> +
> +	return nb_rx;
> +}
> +
> +static uint16_t
> +eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
> +{
> +	struct vhost_queue *r = q;
> +	uint16_t i, nb_tx = 0;
> +
> +	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
> +		return 0;
> +
> +	rte_atomic32_set(&r->while_queuing, 1);
> +
> +	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
> +		goto out;
> +
> +	/* Enqueue packets to guest RX queue */
> +	nb_tx = (uint16_t)rte_vhost_enqueue_burst(r->device,
> +			r->virtqueue_id, bufs, nb_bufs);

Ditto.

> +
> +	r->tx_pkts += nb_tx;
> +	r->err_pkts += nb_bufs - nb_tx;
> +
> +	for (i = 0; likely(i < nb_tx); i++)
> +		rte_pktmbuf_free(bufs[i]);
> +
> +out:
> +	rte_atomic32_set(&r->while_queuing, 0);
> +
> +	return nb_tx;
> +}
> +
> +static int
> +eth_dev_configure(struct rte_eth_dev *dev __rte_unused) { return 0; }

I personally would not sacrifice readability just to save a few lines of
code.
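
That is, write the stub out in full:

	static int
	eth_dev_configure(struct rte_eth_dev *dev __rte_unused)
	{
		return 0;
	}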

> +
> +static int
> +eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
> +		   uint16_t nb_rx_desc __rte_unused,
> +		   unsigned int socket_id,
> +		   const struct rte_eth_rxconf *rx_conf __rte_unused,
> +		   struct rte_mempool *mb_pool)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +	struct vhost_queue *vq;
> +
> +	if (internal->rx_vhost_queues[rx_queue_id] != NULL)
> +		rte_free(internal->rx_vhost_queues[rx_queue_id]);

Such a NULL check is unnecessary; rte_free() already handles NULL.
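
That is, the call can stand on its own:

	rte_free(internal->rx_vhost_queues[rx_queue_id]);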

> +
> +	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
> +			RTE_CACHE_LINE_SIZE, socket_id);
> +	if (vq == NULL) {
> +		RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n");
> +		return -ENOMEM;
> +	}
> +
> +	vq->mb_pool = mb_pool;
> +	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
> +	internal->rx_vhost_queues[rx_queue_id] = vq;
> +	dev->data->rx_queues[rx_queue_id] = vq;
> +	return 0;
> +}
> +
> +static int
> +eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
> +		   uint16_t nb_tx_desc __rte_unused,
> +		   unsigned int socket_id,
> +		   const struct rte_eth_txconf *tx_conf __rte_unused)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +	struct vhost_queue *vq;
> +
> +	if (internal->tx_vhost_queues[tx_queue_id] != NULL)
> +		rte_free(internal->tx_vhost_queues[tx_queue_id]);

Ditto.

> +
> +	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
> +			RTE_CACHE_LINE_SIZE, socket_id);
> +	if (vq == NULL) {
> +		RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n");
> +		return -ENOMEM;
> +	}
> +
> +	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
> +	internal->tx_vhost_queues[tx_queue_id] = vq;
> +	dev->data->tx_queues[tx_queue_id] = vq;
> +	return 0;
> +}
> +
> +
> +static void
> +eth_dev_info(struct rte_eth_dev *dev,
> +	     struct rte_eth_dev_info *dev_info)
> +{
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	dev_info->driver_name = drivername;
> +	dev_info->max_mac_addrs = 1;
> +	dev_info->max_rx_pktlen = (uint32_t)-1;
> +	dev_info->max_rx_queues = (uint16_t)internal->nb_rx_queues;
> +	dev_info->max_tx_queues = (uint16_t)internal->nb_tx_queues;
> +	dev_info->min_rx_bufsize = 0;
> +}
> +
> +static void
> +eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *igb_stats)
> +{
> +	unsigned i;
> +	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
> +	const struct pmd_internal *internal = dev->data->dev_private;
> +
> +	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
> +	     i < internal->nb_rx_queues; i++) {
> +		if (internal->rx_vhost_queues[i] == NULL)
> +			continue;
> +		igb_stats->q_ipackets[i] = internal->rx_vhost_queues[i]->rx_pkts;
> +		rx_total += igb_stats->q_ipackets[i];
> +	}
> +
> +	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
> +	     i < internal->nb_tx_queues; i++) {
> +		if (internal->tx_vhost_queues[i] == NULL)
> +			continue;
> +		igb_stats->q_opackets[i] = internal->tx_vhost_queues[i]->tx_pkts;
> +		igb_stats->q_errors[i] = internal->tx_vhost_queues[i]->err_pkts;
> +		tx_total += igb_stats->q_opackets[i];
> +		tx_err_total += igb_stats->q_errors[i];
> +	}
> +
> +	igb_stats->ipackets = rx_total;
> +	igb_stats->opackets = tx_total;
> +	igb_stats->oerrors = tx_err_total;
> +}
> +
> +static void
> +eth_stats_reset(struct rte_eth_dev *dev)
> +{
> +	unsigned i;
> +	struct pmd_internal *internal = dev->data->dev_private;
> +
> +	for (i = 0; i < internal->nb_rx_queues; i++) {
> +		if (internal->rx_vhost_queues[i] == NULL)
> +			continue;
> +		internal->rx_vhost_queues[i]->rx_pkts = 0;
> +	}
> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> +		if (internal->tx_vhost_queues[i] == NULL)
> +			continue;
> +		internal->tx_vhost_queues[i]->tx_pkts = 0;
> +		internal->tx_vhost_queues[i]->err_pkts = 0;
> +	}
> +}
> +
> +static void
> +eth_queue_release(void *q __rte_unused) { ; }
> +static int
> +eth_link_update(struct rte_eth_dev *dev __rte_unused,
> +		int wait_to_complete __rte_unused) { return 0; }

Ditto.

> +
> +static const struct eth_dev_ops ops = {
> +	.dev_start = eth_dev_start,
> +	.dev_stop = eth_dev_stop,
> +	.dev_configure = eth_dev_configure,
> +	.dev_infos_get = eth_dev_info,
> +	.rx_queue_setup = eth_rx_queue_setup,
> +	.tx_queue_setup = eth_tx_queue_setup,
> +	.rx_queue_release = eth_queue_release,
> +	.tx_queue_release = eth_queue_release,
> +	.link_update = eth_link_update,
> +	.stats_get = eth_stats_get,
> +	.stats_reset = eth_stats_reset,
> +};
> +
> +static int
> +eth_dev_vhost_create(const char *name, int index,
> +		     char *iface_name,
> +		     int16_t queues,
> +		     const unsigned numa_node)
> +{
> +	struct rte_eth_dev_data *data = NULL;
> +	struct pmd_internal *internal = NULL;
> +	struct rte_eth_dev *eth_dev = NULL;
> +	struct ether_addr *eth_addr = NULL;
> +
> +	RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n",
> +		numa_node);
> +
> +	/* now do all data allocation - for eth_dev structure, dummy pci driver
> +	 * and internal (private) data
> +	 */
> +	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
> +	if (data == NULL)
> +		goto error;
> +
> +	internal = rte_zmalloc_socket(name, sizeof(*internal), 0, numa_node);
> +	if (internal == NULL)
> +		goto error;
> +
> +	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
> +	if (eth_addr == NULL)
> +		goto error;
> +	*eth_addr = base_eth_addr;
> +	eth_addr->addr_bytes[5] = index;
> +
> +	/* reserve an ethdev entry */
> +	eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL);
> +	if (eth_dev == NULL)
> +		goto error;
> +
> +	/* now put it all together
> +	 * - store queue data in internal,
> +	 * - store numa_node info in ethdev data
> +	 * - point eth_dev_data to internals
> +	 * - and point eth_dev structure to new eth_dev_data structure
> +	 */
> +	internal->nb_rx_queues = queues;
> +	internal->nb_tx_queues = queues;
> +	internal->dev_name = strdup(name);
> +	if (internal->dev_name == NULL)
> +		goto error;
> +	internal->iface_name = strdup(iface_name);
> +	if (internal->iface_name == NULL)
> +		goto error;

If the allocation fails here, internal->dev_name is never freed.
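
One possible fix (just a sketch): internal comes from rte_zmalloc_socket(),
so both string pointers start out NULL, and free(NULL) is a no-op, so the
error path can simply release them unconditionally:

	error:
		if (internal) {
			free(internal->dev_name);
			free(internal->iface_name);
		}
		rte_free(data);
		rte_free(internal);
		rte_free(eth_addr);

		return -1;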

> +
> +	pthread_mutex_lock(&internal_list_lock);
> +	TAILQ_INSERT_TAIL(&internals_list, internal, next);
> +	pthread_mutex_unlock(&internal_list_lock);
> +
> +	data->dev_private = internal;
> +	data->port_id = eth_dev->data->port_id;
> +	memmove(data->name, eth_dev->data->name, sizeof(data->name));
> +	data->nb_rx_queues = queues;
> +	data->nb_tx_queues = queues;
> +	data->dev_link = pmd_link;
> +	data->mac_addrs = eth_addr;
> +
> +	/* We'll replace the 'data' originally allocated by eth_dev. So the
> +	 * vhost PMD resources won't be shared between multi processes.
> +	 */
> +	eth_dev->data = data;
> +	eth_dev->dev_ops = &ops;
> +	eth_dev->driver = NULL;
> +	eth_dev->data->dev_flags = RTE_ETH_DEV_DETACHABLE;
> +	eth_dev->data->kdrv = RTE_KDRV_NONE;
> +	eth_dev->data->drv_name = internal->dev_name;
> +	eth_dev->data->numa_node = numa_node;
> +
> +	/* finally assign rx and tx ops */
> +	eth_dev->rx_pkt_burst = eth_vhost_rx;
> +	eth_dev->tx_pkt_burst = eth_vhost_tx;
> +
> +	return data->port_id;
> +
> +error:
> +	rte_free(data);
> +	rte_free(internal);
> +	rte_free(eth_addr);
> +
> +	return -1;
> +}
...
...
> +
> +	if ((internal) && (internal->dev_name))
> +		free(internal->dev_name);
> +	if ((internal) && (internal->iface_name))
> +		free(internal->iface_name);
> +
> +	rte_free(eth_dev->data->mac_addrs);
> +	rte_free(eth_dev->data);
> +
> +	for (i = 0; i < internal->nb_rx_queues; i++) {
> +		if (internal->rx_vhost_queues[i] != NULL)
> +			rte_free(internal->rx_vhost_queues[i]);
> +	}
> +	for (i = 0; i < internal->nb_tx_queues; i++) {
> +		if (internal->tx_vhost_queues[i] != NULL)
> +			rte_free(internal->tx_vhost_queues[i]);

Ditto.

(Hopefully I can do a detailed review later, say next week.)

	--yliu
  
Tetsuya Mukawa Nov. 9, 2015, 6:27 a.m. UTC | #2
Hi Liu,

Thank you so much for your review.
I will fix them, then submit again this week.

Thanks,
Tetsuya


  
Stephen Hemminger Nov. 9, 2015, 10:22 p.m. UTC | #3
On Mon,  9 Nov 2015 14:17:01 +0900
Tetsuya Mukawa <mukawa@igel.co.jp> wrote:

> +	rte_atomic32_set(&r->while_queuing, 1);
> +
> +	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
> +		goto out;

Your special two-variable custom locking here is buggy.
If you hit the second atomic test, you will leave while_queuing set.
  
Tetsuya Mukawa Nov. 10, 2015, 3:14 a.m. UTC | #4
On 2015/11/10 7:22, Stephen Hemminger wrote:
> On Mon,  9 Nov 2015 14:17:01 +0900
> Tetsuya Mukawa <mukawa@igel.co.jp> wrote:
>
>> +	rte_atomic32_set(&r->while_queuing, 1);
>> +
>> +	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
>> +		goto out;
> Your special two-variable custom locking here is buggy.
> If you hit the second atomic test, you will leave while_queuing set.

Hi Stephen,

Thanks for reviewing.
I clear while_queuing as shown below:

+out:
+	rte_atomic32_set(&r->while_queuing, 0);
+
+	return nb_rx;
+}
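
For reference, destroy_device() does the other half of this two-flag
scheme: it clears allow_queuing and then waits until while_queuing drops
to 0, so a burst that already passed the first check drains before the
device is released. Roughly:

	rte_atomic32_set(&vq->allow_queuing, 0);
	while (rte_atomic32_read(&vq->while_queuing))
		rte_pause();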

Thanks,
tetsuya
  
Zhihong Wang Nov. 12, 2015, 12:52 p.m. UTC | #5
Hi Tetsuya,

In my test I created 2 vdevs using "--vdev 'eth_vhost0,iface=/tmp/sock0,queues=1' --vdev 'eth_vhost1,iface=/tmp/sock1,queues=1'", and the QEMU messages got handled in the wrong order.
The reason is that 2 threads are created to handle the messages from the 2 sockets, but their fds are SHARED, so each thread is reading from both sockets.

This can lead to incorrect behavior; in my case the VHOST_USER_SET_MEM_TABLE message sometimes got handled after VRING initialization, which led to destroy_device().

Detailed log shown below: threads 69351 & 69352 are both reading fd 25. Thanks to Yuanhan for helping with the debugging!


Thanks
Zhihong


-----------------------------------------------------------------------------------------------------------------

---->  debug: setting up new vq conn for fd: 23, tid: 69352
VHOST_CONFIG: new virtio connection is 25
VHOST_CONFIG: new device, handle is 0
---->  debug: vserver_message_handler thread id: 69352, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_OWNER
---->  debug: vserver_message_handler thread id: 69352, fd: 25
VHOST_CONFIG: read message VHOST_USER_GET_FEATURES
---->  debug: vserver_message_handler thread id: 69352, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_CALL
VHOST_CONFIG: vring call idx:0 file:26
---->  debug: vserver_message_handler thread id: 69352, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_CALL
VHOST_CONFIG: vring call idx:1 file:27
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_CALL
VHOST_CONFIG: vring call idx:0 file:28
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_CALL
VHOST_CONFIG: vring call idx:1 file:26
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_FEATURES
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_MEM_TABLE
---->  debug: device_fh: 0: user_set_mem_table
VHOST_CONFIG: mapped region 0 fd:27 to 0x7ff6c0000000 sz:0xa0000 off:0x0
VHOST_CONFIG: mapped region 1 fd:29 to 0x7ff680000000 sz:0x40000000 off:0xc0000
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_NUM
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_BASE
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_ADDR
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_KICK
VHOST_CONFIG: vring kick idx:0 file:30
---->  debug: vserver_message_handler thread id: 69352, fd: 25
VHOST_CONFIG: virtio is not ready for processing.
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_BASE
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_ADDR
---->  debug: vserver_message_handler thread id: 69351, fd: 25
VHOST_CONFIG: read message VHOST_USER_SET_VRING_KICK
VHOST_CONFIG: vring kick idx:1 file:31
VHOST_CONFIG: virtio is now ready for processing.
PMD: New connection established
VHOST_CONFIG: read message VHOST_USER_SET_VRING_NUM

-----------------------------------------------------------------------------------------------------------------

> ...
> +
> +static void *vhost_driver_session(void *param __rte_unused)
> +{
> +	static struct virtio_net_device_ops *vhost_ops;
> +
> +	vhost_ops = rte_zmalloc(NULL, sizeof(*vhost_ops), 0);
> +	if (vhost_ops == NULL)
> +		rte_panic("Can't allocate memory\n");
> +
> +	/* set vhost arguments */
> +	vhost_ops->new_device = new_device;
> +	vhost_ops->destroy_device = destroy_device;
> +	if (rte_vhost_driver_pmd_callback_register(vhost_ops) < 0)
> +		rte_panic("Can't register callbacks\n");
> +
> +	/* start event handling */
> +	rte_vhost_driver_session_start();
> +
> +	rte_free(vhost_ops);
> +	pthread_exit(0);
> +}
> +
> +static void vhost_driver_session_start(struct pmd_internal *internal)
> +{
> +	int ret;
> +
> +	ret = pthread_create(&internal->session_th,
> +			NULL, vhost_driver_session, NULL);
> +	if (ret)
> +		rte_panic("Can't create a thread\n");
> +}
> +
> ...
  
Tetsuya Mukawa Nov. 13, 2015, 3:09 a.m. UTC | #6
On 2015/11/12 21:52, Wang, Zhihong wrote:
> Hi Tetsuya,
>
> In my test I created 2 vdevs using "--vdev 'eth_vhost0,iface=/tmp/sock0,queues=1' --vdev 'eth_vhost1,iface=/tmp/sock1,queues=1'", and the QEMU messages got handled in the wrong order.
> The reason is that 2 threads are created to handle the messages from the 2 sockets, but their fds are SHARED, so each thread is reading from both sockets.
>
> This can lead to incorrect behavior; in my case the VHOST_USER_SET_MEM_TABLE message sometimes got handled after VRING initialization, which led to destroy_device().
>
> Detailed log shown below: threads 69351 & 69352 are both reading fd 25. Thanks to Yuanhan for helping with the debugging!
>

Hi Zhihong and Yuanhan,

Thank you so much for debugging the issue.
I will fix the vhost PMD so that it does not create multiple message
handling threads.
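
Roughly, the idea is to guard the thread creation so that only the first
device creation starts the session thread (just a sketch; the guard names
are illustrative):

	static pthread_t session_th;
	static pthread_once_t session_once = PTHREAD_ONCE_INIT;

	static void
	session_start(void)
	{
		if (pthread_create(&session_th, NULL,
				vhost_driver_session, NULL))
			rte_panic("Can't create a thread\n");
	}

	/* called per vdev; only the first call creates the thread */
	pthread_once(&session_once, session_start);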

I am going to submit the PMD today.
Could you please check it again using the latest one?

Tetsuya


  
Zhihong Wang Nov. 13, 2015, 3:50 a.m. UTC | #7
> -----Original Message-----
> From: Tetsuya Mukawa [mailto:mukawa@igel.co.jp]
> Sent: Friday, November 13, 2015 11:10 AM
> To: Wang, Zhihong <zhihong.wang@intel.com>; dev@dpdk.org; Liu, Yuanhan
> <yuanhan.liu@intel.com>
> Cc: ann.zhuangyanying@huawei.com
> Subject: Re: [dpdk-dev] [PATCH v3 2/2] vhost: Add VHOST PMD
> 
> On 2015/11/12 21:52, Wang, Zhihong wrote:
> > Hi Tetsuya,
> >
> > In my test I created 2 vdevs using "--vdev
> > 'eth_vhost0,iface=/tmp/sock0,queues=1' --vdev
> > 'eth_vhost1,iface=/tmp/sock1,queues=1'", and the QEMU messages got handled
> > in the wrong order.
> > The reason is that 2 threads are created to handle the messages from the 2
> > sockets, but their fds are SHARED, so each thread is reading from both
> > sockets.
> >
> > This can lead to incorrect behavior; in my case the
> > VHOST_USER_SET_MEM_TABLE message sometimes got handled after VRING
> > initialization, which led to destroy_device().
> >
> > Detailed log shown below: threads 69351 & 69352 are both reading fd 25.
> > Thanks to Yuanhan for helping with the debugging!
> >
> 
> Hi Zhihong and Yuanhan,
> 
> Thank you so much for debugging the issue.
> I will fix the vhost PMD so that it does not create multiple message
> handling threads.
> 
> I am going to submit the PMD today.
> Could you please check it again using the latest one?
> 

Looking forward to it!


Rich Lane Nov. 13, 2015, 4:03 a.m. UTC | #8
>
> +       if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
> +               ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
> +                                        &open_iface, &iface_name);
> +               if (ret < 0)
> +                       goto out_free;
> +       }
>

I noticed that the strdup in eth_dev_vhost_create crashes if you don't pass
the iface option, so this should probably return an error if the option
doesn't exist.
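
E.g., something like this around the quoted block (just a sketch):

	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
					 &open_iface, &iface_name);
		if (ret < 0)
			goto out_free;
	} else {
		/* iface is mandatory; fail here instead of crashing later */
		ret = -1;
		goto out_free;
	}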
  
Tetsuya Mukawa Nov. 13, 2015, 4:29 a.m. UTC | #9
On 2015/11/13 13:03, Rich Lane wrote:
>> +       if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
>> +               ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
>> +                                        &open_iface, &iface_name);
>> +               if (ret < 0)
>> +                       goto out_free;
>> +       }
>>
> I noticed that the strdup in eth_dev_vhost_create crashes if you don't pass
> the iface option, so this should probably return an error if the option
> doesn't exist.
>

Hi Lane,

Yes, you are correct. Thanks for checking!
I will fix it also.

Tetsuya
  
Tetsuya Mukawa Nov. 13, 2015, 5:20 a.m. UTC | #10
The patch introduces a new PMD. This PMD is implemented as a thin wrapper
around librte_vhost.

* Known issue.
We may see issues while handling the RESET_OWNER message.
This handling is done in the vhost library, so it is not part of the vhost PMD.
For now, we are waiting for a fix in QEMU.

PATCH v4 changes:
 - Rebase on latest DPDK tree.
 - Fix coding style.
 - Fix code not to invoke multiple message handling threads.
 - Fix code to handle vdev parameters correctly.
 - Remove needless cast.
 - Remove needless if-condition before rte_free().

PATCH v3 changes:
 - Rebase on latest master.
 - Specify correct queue_id in RX/TX functions.

PATCH v2 changes:
 - Remove the below patch that fixes the vhost library.
   It was applied as a separate patch.
   - vhost: fix crash with multiqueue enabled
 - Fix typos.
   (Thanks to Thomas Monjalon)
 - Rebase on latest tree with above Bernard's patches.

PATCH v1 changes:
 - Support vhost multiple queues.
 - Rebase on "remove pci driver from vdevs".
 - Optimize RX/TX functions.
 - Fix resource leaks.
 - Fix compile issue.
 - Add patch to fix vhost library.

RFC PATCH v3 changes:
 - Optimize performance.
   In RX/TX functions, change code to access only per-core data.
 - Add the below API to allow users to use vhost library APIs for a port
   managed by the vhost PMD. There are a few limitations. See
   "rte_eth_vhost.h", and the usage sketch after this list.
    - rte_eth_vhost_portid2vdev()
   To support this functionality, the vhost library is also changed.
   Anyway, if users don't use the vhost PMD, they can still fully use the
   vhost library APIs.
 - Add code to support vhost multiple queues.
   Actually, the multiple queues functionality is not enabled so far.
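
A usage sketch for the API above (assuming, per "rte_eth_vhost.h" in this
series, that it maps an ethdev port id to the underlying virtio_net
device):

	struct virtio_net *vdev;

	/* port_id: id of a port created by this vhost PMD */
	vdev = rte_eth_vhost_portid2vdev(port_id);
	if (vdev != NULL)
		RTE_LOG(INFO, PMD, "port is backed by vhost device %s\n",
			vdev->ifname);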

RFC PATCH v2 changes:
 - Fix issues reported by checkpatch.pl
   (Thanks to Stephen Hemminger)


Tetsuya Mukawa (2):
  vhost: Add callback and private data for vhost PMD
  vhost: Add VHOST PMD

 config/common_linuxapp                        |   6 +
 doc/guides/nics/index.rst                     |   1 +
 doc/guides/rel_notes/release_2_2.rst          |   2 +
 drivers/net/Makefile                          |   4 +
 drivers/net/vhost/Makefile                    |  62 ++
 drivers/net/vhost/rte_eth_vhost.c             | 783 ++++++++++++++++++++++++++
 drivers/net/vhost/rte_eth_vhost.h             |  65 +++
 drivers/net/vhost/rte_pmd_vhost_version.map   |   8 +
 lib/librte_vhost/rte_vhost_version.map        |   6 +
 lib/librte_vhost/rte_virtio_net.h             |   3 +
 lib/librte_vhost/vhost_user/virtio-net-user.c |  13 +-
 lib/librte_vhost/virtio-net.c                 |  60 +-
 lib/librte_vhost/virtio-net.h                 |   4 +-
 mk/rte.app.mk                                 |   8 +-
 14 files changed, 1011 insertions(+), 14 deletions(-)
 create mode 100644 drivers/net/vhost/Makefile
 create mode 100644 drivers/net/vhost/rte_eth_vhost.c
 create mode 100644 drivers/net/vhost/rte_eth_vhost.h
 create mode 100644 drivers/net/vhost/rte_pmd_vhost_version.map
  
Yuanhan Liu Nov. 13, 2015, 5:32 a.m. UTC | #11
On Fri, Nov 13, 2015 at 02:20:29PM +0900, Tetsuya Mukawa wrote:
> The patch introduces a new PMD. This PMD is implemented as a thin wrapper
> around librte_vhost.
> 
> * Known issue.
> We may see issues while handling the RESET_OWNER message.
> This handling is done in the vhost library, so it is not part of the vhost PMD.
> For now, we are waiting for a fix in QEMU.

Fix patches have already been applied. Please help test :)

	--yliu
  
Tetsuya Mukawa Nov. 13, 2015, 5:37 a.m. UTC | #12
On 2015/11/13 14:32, Yuanhan Liu wrote:
> On Fri, Nov 13, 2015 at 02:20:29PM +0900, Tetsuya Mukawa wrote:
>> The patch introduces a new PMD. This PMD is implemented as a thin wrapper
>> around librte_vhost.
>>
>> * Known issue.
>> We may see issues while handling the RESET_OWNER message.
>> This handling is done in the vhost library, so it is not part of the vhost PMD.
>> For now, we are waiting for a fix in QEMU.
> Fix patches have already been applied. Please help test :)
>
> 	--yliu

Thanks!
I have checked it, and it worked!

Tetsuya
  
Tetsuya Mukawa Nov. 13, 2015, 6:50 a.m. UTC | #13
On 2015/11/13 14:32, Yuanhan Liu wrote:
> On Fri, Nov 13, 2015 at 02:20:29PM +0900, Tetsuya Mukawa wrote:
>> The patch introduces a new PMD. This PMD is implemented as a thin wrapper
>> around librte_vhost.
>>
>> * Known issue.
>> We may see issues while handling the RESET_OWNER message.
>> This handling is done in the vhost library, so it is not part of the vhost PMD.
>> For now, we are waiting for a fix in QEMU.
> Fix patches have already been applied. Please help test :)
>
> 	--yliu

Hi Yuanhan,

It seems there might be another issue, related to "vq->callfd", in the
vhost library.
We may be missing something needed to handle the value correctly.

Anyway, here are the steps.
1. Apply vhost PMD patch.
(I guess you don't need it to reproduce the issue, but to reproduce it,
using the PMD may be easy)
2. Start testpmd on host with vhost-user PMD.
3. Start QEMU with virtio-net device.
4. Login QEMU.
5. Bind the virtio-net device to igb_uio.
6. Start testpmd in QEMU.
7. Quit testpmd in QEMU.
8. Start testpmd again in QEMU.

It seems that when the last command is executed, testpmd on the host
doesn't receive the SET_VRING_CALL message from QEMU.
Because of this, testpmd on the host assumes the virtio-net device is not
ready. (I confirmed that virtio_is_ready() failed on the host.)

According to the QEMU source code, SET_VRING_KICK is sent when a
virtqueue starts, but SET_VRING_CALL is sent when a virtqueue is
initialized.
I am not sure exactly, but perhaps "vq->callfd" should remain valid while
the connection is established?

Also, I've found a workaround.
Please execute the following after step 7.

8. Bind the virtio-net device to virtio-pci kernel driver.
9. Bind the virtio-net device to igb_uio.
10. Start testpmd in QEMU.

When step 8 is executed, the connection will be re-established, and testpmd
on the host will be able to receive SET_VRING_CALL.
Then testpmd on the host can start.

Thanks,
Tetsuya
  
Yuanhan Liu Nov. 17, 2015, 1:26 p.m. UTC | #14
On Fri, Nov 13, 2015 at 03:50:16PM +0900, Tetsuya Mukawa wrote:
> On 2015/11/13 14:32, Yuanhan Liu wrote:
> > On Fri, Nov 13, 2015 at 02:20:29PM +0900, Tetsuya Mukawa wrote:
> >> The patch introduces a new PMD. This PMD is implemented as a thin wrapper
> >> around librte_vhost.
> >>
> >> * Known issue.
> >> We may see issues while handling the RESET_OWNER message.
> >> This handling is done in the vhost library, so it is not part of the vhost PMD.
> >> For now, we are waiting for a fix in QEMU.
> > Fix patches have already been applied. Please help test :)
> >
> > 	--yliu
> 
> Hi Yuanhan,
> 
> It seems there might be another issue, related to "vq->callfd", in the
> vhost library.
> We may be missing something needed to handle the value correctly.
> 
> Anyway, here are the steps.
> 1. Apply vhost PMD patch.
> (I guess you don't need it to reproduce the issue, but to reproduce it,
> using the PMD may be easy)
> 2. Start testpmd on host with vhost-user PMD.
> 3. Start QEMU with virtio-net device.
> 4. Login QEMU.
> 5. Bind the virtio-net device to igb_uio.
> 6. Start testpmd in QEMU.
> 7. Quit testpmd in QEMU.
> 8. Start testpmd again in QEMU.
> 
> It seems that when the last command is executed, testpmd on the host
> doesn't receive the SET_VRING_CALL message from QEMU.
> Because of this, testpmd on the host assumes the virtio-net device is not
> ready. (I confirmed that virtio_is_ready() failed on the host.)
> 
> According to the QEMU source code, SET_VRING_KICK is sent when a
> virtqueue starts, but SET_VRING_CALL is sent when a virtqueue is
> initialized.
> I am not sure exactly, but perhaps "vq->callfd" should remain valid while
> the connection is established?

Yes, it stays valid as long as we don't reset it from another
set_vring_call. So, we should not reset it in reset_device().
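
Something along these lines in virtio-net.c should do (just a sketch;
helper names are illustrative, assuming the vring init path is what
currently clears callfd):

	static void
	reset_vring_queue(struct vhost_virtqueue *vq, int qp_idx)
	{
		int callfd;

		callfd = vq->callfd;
		init_vring_queue(vq, qp_idx);
		vq->callfd = callfd;
	}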

	--yliu
  
Tetsuya Mukawa Nov. 19, 2015, 1:20 a.m. UTC | #15
On 2015/11/17 22:26, Yuanhan Liu wrote:
> On Fri, Nov 13, 2015 at 03:50:16PM +0900, Tetsuya Mukawa wrote:
>> On 2015/11/13 14:32, Yuanhan Liu wrote:
>>> On Fri, Nov 13, 2015 at 02:20:29PM +0900, Tetsuya Mukawa wrote:
>>>> The patch introduces a new PMD. This PMD is implemented as a thin wrapper
>>>> around librte_vhost.
>>>>
>>>> * Known issue.
>>>> We may see issues while handling the RESET_OWNER message.
>>>> This handling is done in the vhost library, so it is not part of the vhost PMD.
>>>> For now, we are waiting for a fix in QEMU.
>>> Fix patches have already been applied. Please help test :)
>>>
>>> 	--yliu
>> Hi Yuanhan,
>>
>> It seems there might be another issue, related to "vq->callfd", in the
>> vhost library.
>> We may be missing something needed to handle the value correctly.
>>
>> Anyway, here are the steps.
>> 1. Apply vhost PMD patch.
>> (I guess you don't need it to reproduce the issue, but to reproduce it,
>> using the PMD may be easy)
>> 2. Start testpmd on host with vhost-user PMD.
>> 3. Start QEMU with virtio-net device.
>> 4. Login QEMU.
>> 5. Bind the virtio-net device to igb_uio.
>> 6. Start testpmd in QEMU.
>> 7. Quit testpmd in QEMU.
>> 8. Start testpmd again in QEMU.
>>
>> It seems that when the last command is executed, testpmd on the host
>> doesn't receive the SET_VRING_CALL message from QEMU.
>> Because of this, testpmd on the host assumes the virtio-net device is not
>> ready. (I confirmed that virtio_is_ready() failed on the host.)
>>
>> According to the QEMU source code, SET_VRING_KICK is sent when a
>> virtqueue starts, but SET_VRING_CALL is sent when a virtqueue is
>> initialized.
>> I am not sure exactly, but perhaps "vq->callfd" should remain valid while
>> the connection is established?
> Yes, it stays valid as long as we don't reset it from another
> set_vring_call. So, we should not reset it in reset_device().
>
> 	--yliu

Hi Yuanhan,

Thanks for checking.
I will submit the patch for this today.

Tetsuya

  

Patch

diff --git a/config/common_linuxapp b/config/common_linuxapp
index 7248262..a264c11 100644
--- a/config/common_linuxapp
+++ b/config/common_linuxapp
@@ -458,6 +458,12 @@  CONFIG_RTE_LIBRTE_VHOST_NUMA=n
 CONFIG_RTE_LIBRTE_VHOST_DEBUG=n
 
 #
+# Compile vhost PMD
+# To compile, CONFIG_RTE_LIBRTE_VHOST should be enabled.
+#
+CONFIG_RTE_LIBRTE_PMD_VHOST=y
+
+#
 #Compile Xen domain0 support
 #
 CONFIG_RTE_LIBRTE_XEN_DOM0=n
diff --git a/doc/guides/nics/index.rst b/doc/guides/nics/index.rst
index 2d4936d..57d1041 100644
--- a/doc/guides/nics/index.rst
+++ b/doc/guides/nics/index.rst
@@ -47,6 +47,7 @@  Network Interface Controller Drivers
     mlx4
     mlx5
     virtio
+    vhost
     vmxnet3
     pcap_ring
 
diff --git a/doc/guides/rel_notes/release_2_2.rst b/doc/guides/rel_notes/release_2_2.rst
index 59dda59..4b5644d 100644
--- a/doc/guides/rel_notes/release_2_2.rst
+++ b/doc/guides/rel_notes/release_2_2.rst
@@ -90,6 +90,8 @@  New Features
 
 * **Added vhost-user multiple queue support.**
 
+* **Added vhost PMD.**
+
 * **Added port hotplug support to vmxnet3.**
 
 * **Added port hotplug support to xenvirt.**
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index 6da1ce2..66eb63d 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -50,5 +50,9 @@  DIRS-$(CONFIG_RTE_LIBRTE_VIRTIO_PMD) += virtio
 DIRS-$(CONFIG_RTE_LIBRTE_VMXNET3_PMD) += vmxnet3
 DIRS-$(CONFIG_RTE_LIBRTE_PMD_XENVIRT) += xenvirt
 
+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
+DIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += vhost
+endif # $(CONFIG_RTE_LIBRTE_VHOST)
+
 include $(RTE_SDK)/mk/rte.sharelib.mk
 include $(RTE_SDK)/mk/rte.subdir.mk
diff --git a/drivers/net/vhost/Makefile b/drivers/net/vhost/Makefile
new file mode 100644
index 0000000..8186a80
--- /dev/null
+++ b/drivers/net/vhost/Makefile
@@ -0,0 +1,62 @@ 
+#   BSD LICENSE
+#
+#   Copyright (c) 2010-2015 Intel Corporation.
+#   All rights reserved.
+#
+#   Redistribution and use in source and binary forms, with or without
+#   modification, are permitted provided that the following conditions
+#   are met:
+#
+#     * Redistributions of source code must retain the above copyright
+#       notice, this list of conditions and the following disclaimer.
+#     * Redistributions in binary form must reproduce the above copyright
+#       notice, this list of conditions and the following disclaimer in
+#       the documentation and/or other materials provided with the
+#       distribution.
+#     * Neither the name of Intel corporation nor the names of its
+#       contributors may be used to endorse or promote products derived
+#       from this software without specific prior written permission.
+#
+#   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+#   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+#   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+#   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+#   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+#   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+#   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+#   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+#   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+#   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+#   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+include $(RTE_SDK)/mk/rte.vars.mk
+
+#
+# library name
+#
+LIB = librte_pmd_vhost.a
+
+CFLAGS += -O3
+CFLAGS += $(WERROR_FLAGS)
+
+EXPORT_MAP := rte_pmd_vhost_version.map
+
+LIBABIVER := 1
+
+#
+# all source are stored in SRCS-y
+#
+SRCS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += rte_eth_vhost.c
+
+#
+# Export include files
+#
+SYMLINK-y-include += rte_eth_vhost.h
+
+# this lib depends upon:
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_mbuf
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_ether
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_kvargs
+DEPDIRS-$(CONFIG_RTE_LIBRTE_PMD_VHOST) += lib/librte_vhost
+
+include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/drivers/net/vhost/rte_eth_vhost.c b/drivers/net/vhost/rte_eth_vhost.c
new file mode 100644
index 0000000..ff983b5
--- /dev/null
+++ b/drivers/net/vhost/rte_eth_vhost.c
@@ -0,0 +1,768 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright (c) 2015 IGEL Co., Ltd.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IGEL Co.,Ltd. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include <unistd.h>
+#include <pthread.h>
+
+#include <rte_mbuf.h>
+#include <rte_ethdev.h>
+#include <rte_malloc.h>
+#include <rte_memcpy.h>
+#include <rte_dev.h>
+#include <rte_kvargs.h>
+#include <rte_virtio_net.h>
+
+#include "rte_eth_vhost.h"
+
+#define ETH_VHOST_IFACE_ARG		"iface"
+#define ETH_VHOST_QUEUES_ARG		"queues"
+
+static const char *drivername = "VHOST PMD";
+
+static const char *valid_arguments[] = {
+	ETH_VHOST_IFACE_ARG,
+	ETH_VHOST_QUEUES_ARG,
+	NULL
+};
+
+static struct ether_addr base_eth_addr = {
+	.addr_bytes = {
+		0x56 /* V */,
+		0x48 /* H */,
+		0x4F /* O */,
+		0x53 /* S */,
+		0x54 /* T */,
+		0x00
+	}
+};
+
+struct vhost_queue {
+	rte_atomic32_t allow_queuing;
+	rte_atomic32_t while_queuing;
+	struct virtio_net *device;
+	struct pmd_internal *internal;
+	struct rte_mempool *mb_pool;
+	uint16_t virtqueue_id;
+	uint64_t rx_pkts;
+	uint64_t tx_pkts;
+	uint64_t err_pkts;
+};
+
+struct pmd_internal {
+	TAILQ_ENTRY(pmd_internal) next;
+	char *dev_name;
+	char *iface_name;
+	unsigned nb_rx_queues;
+	unsigned nb_tx_queues;
+
+	struct vhost_queue *rx_vhost_queues[RTE_MAX_QUEUES_PER_PORT];
+	struct vhost_queue *tx_vhost_queues[RTE_MAX_QUEUES_PER_PORT];
+
+	volatile uint16_t once;
+	pthread_t session_th;
+};
+
+TAILQ_HEAD(pmd_internal_head, pmd_internal);
+static struct pmd_internal_head internals_list =
+	TAILQ_HEAD_INITIALIZER(internals_list);
+
+static pthread_mutex_t internal_list_lock = PTHREAD_MUTEX_INITIALIZER;
+
+static struct rte_eth_link pmd_link = {
+		.link_speed = 10000,
+		.link_duplex = ETH_LINK_FULL_DUPLEX,
+		.link_status = 0
+};
+
+static uint16_t
+eth_vhost_rx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
+{
+	struct vhost_queue *r = q;
+	uint16_t nb_rx = 0;
+
+	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
+		return 0;
+
+	rte_atomic32_set(&r->while_queuing, 1);
+
+	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
+		goto out;
+
+	/* Dequeue packets from guest TX queue */
+	nb_rx = rte_vhost_dequeue_burst(r->device,
+			r->virtqueue_id, r->mb_pool, bufs, nb_bufs);
+
+	r->rx_pkts += nb_rx;
+
+out:
+	rte_atomic32_set(&r->while_queuing, 0);
+
+	return nb_rx;
+}
+
+static uint16_t
+eth_vhost_tx(void *q, struct rte_mbuf **bufs, uint16_t nb_bufs)
+{
+	struct vhost_queue *r = q;
+	uint16_t i, nb_tx = 0;
+
+	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
+		return 0;
+
+	rte_atomic32_set(&r->while_queuing, 1);
+
+	if (unlikely(rte_atomic32_read(&r->allow_queuing) == 0))
+		goto out;
+
+	/* Enqueue packets to guest RX queue */
+	nb_tx = rte_vhost_enqueue_burst(r->device,
+			r->virtqueue_id, bufs, nb_bufs);
+
+	r->tx_pkts += nb_tx;
+	r->err_pkts += nb_bufs - nb_tx;
+
+	for (i = 0; likely(i < nb_tx); i++)
+		rte_pktmbuf_free(bufs[i]);
+
+out:
+	rte_atomic32_set(&r->while_queuing, 0);
+
+	return nb_tx;
+}
+
+static int
+eth_dev_configure(struct rte_eth_dev *dev __rte_unused) { return 0; }
+
+static inline struct pmd_internal *
+find_internal_resource(char *ifname)
+{
+	int found = 0;
+	struct pmd_internal *internal;
+
+	if (ifname == NULL)
+		return NULL;
+
+	pthread_mutex_lock(&internal_list_lock);
+
+	TAILQ_FOREACH(internal, &internals_list, next) {
+		if (!strcmp(internal->iface_name, ifname)) {
+			found = 1;
+			break;
+		}
+	}
+
+	pthread_mutex_unlock(&internal_list_lock);
+
+	if (!found)
+		return NULL;
+
+	return internal;
+}
+
+static int
+new_device(struct virtio_net *dev)
+{
+	struct rte_eth_dev *eth_dev;
+	struct pmd_internal *internal;
+	struct vhost_queue *vq;
+	unsigned i;
+
+	if (dev == NULL) {
+		RTE_LOG(INFO, PMD, "Invalid argument\n");
+		return -1;
+	}
+
+	internal = find_internal_resource(dev->ifname);
+	if (internal == NULL) {
+		RTE_LOG(INFO, PMD, "Invalid device name\n");
+		return -1;
+	}
+
+	if ((dev->virt_qp_nb < internal->nb_rx_queues) ||
+			(dev->virt_qp_nb < internal->nb_tx_queues)) {
+		RTE_LOG(INFO, PMD, "Not enough queues\n");
+		return -1;
+	}
+
+	eth_dev = rte_eth_dev_allocated(internal->dev_name);
+	if (eth_dev == NULL) {
+		RTE_LOG(INFO, PMD, "Failed to find an ethdev\n");
+		return -1;
+	}
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = internal->rx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		vq->device = dev;
+		vq->internal = internal;
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = internal->tx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		vq->device = dev;
+		vq->internal = internal;
+	}
+
+	dev->flags |= VIRTIO_DEV_RUNNING;
+	dev->pmd_priv = eth_dev;
+	eth_dev->data->dev_link.link_status = 1;
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = internal->rx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		rte_atomic32_set(&vq->allow_queuing, 1);
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = internal->tx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		rte_atomic32_set(&vq->allow_queuing, 1);
+	}
+	RTE_LOG(INFO, PMD, "New connection established\n");
+
+	return 0;
+}
+
+static void
+destroy_device(volatile struct virtio_net *dev)
+{
+	struct rte_eth_dev *eth_dev;
+	struct pmd_internal *internal;
+	struct vhost_queue *vq;
+	unsigned i;
+
+	if (dev == NULL) {
+		RTE_LOG(INFO, PMD, "Invalid argument\n");
+		return;
+	}
+
+	eth_dev = (struct rte_eth_dev *)dev->pmd_priv;
+	if (eth_dev == NULL) {
+		RTE_LOG(INFO, PMD, "Failed to find an ethdev\n");
+		return;
+	}
+
+	internal = eth_dev->data->dev_private;
+
+	/* Wait until rx/tx_pkt_burst stops accessing vhost device */
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = internal->rx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		rte_atomic32_set(&vq->allow_queuing, 0);
+		while (rte_atomic32_read(&vq->while_queuing))
+			rte_pause();
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = internal->tx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		rte_atomic32_set(&vq->allow_queuing, 0);
+		while (rte_atomic32_read(&vq->while_queuing))
+			rte_pause();
+	}
+
+	eth_dev->data->dev_link.link_status = 0;
+
+	dev->pmd_priv = NULL;
+	dev->flags &= ~VIRTIO_DEV_RUNNING;
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		vq = internal->rx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		vq->device = NULL;
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		vq = internal->tx_vhost_queues[i];
+		if (vq == NULL)
+			continue;
+		vq->device = NULL;
+	}
+
+	RTE_LOG(INFO, PMD, "Connection closed\n");
+}
+
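+/* Session thread body: register the connect/disconnect callbacks, then
+ * block in rte_vhost_driver_session_start(); the cleanup below is only
+ * reached if that call ever returns.
+ */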
+static void *
+vhost_driver_session(void *param __rte_unused)
+{
+	struct virtio_net_device_ops *vhost_ops;
+
+	vhost_ops = rte_zmalloc(NULL, sizeof(*vhost_ops), 0);
+	if (vhost_ops == NULL)
+		rte_panic("Can't allocate memory\n");
+
+	/* set vhost arguments */
+	vhost_ops->new_device = new_device;
+	vhost_ops->destroy_device = destroy_device;
+	if (rte_vhost_driver_pmd_callback_register(vhost_ops) < 0)
+		rte_panic("Can't register callbacks\n");
+
+	/* start event handling */
+	rte_vhost_driver_session_start();
+
+	rte_free(vhost_ops);
+	pthread_exit(0);
+}
+
+static void
+vhost_driver_session_start(struct pmd_internal *internal)
+{
+	int ret;
+
+	ret = pthread_create(&internal->session_th,
+			NULL, vhost_driver_session, NULL);
+	if (ret)
+		rte_panic("Can't create a thread\n");
+}
+
+static void
+vhost_driver_session_stop(struct pmd_internal *internal)
+{
+	int ret;
+
+	ret = pthread_cancel(internal->session_th);
+	if (ret)
+		rte_panic("Can't cancel the thread\n");
+
+	ret = pthread_join(internal->session_th, NULL);
+	if (ret)
+		rte_panic("Can't join the thread\n");
+}
+
+static int
+eth_dev_start(struct rte_eth_dev *dev)
+{
+	int ret;
+	struct pmd_internal *internal = dev->data->dev_private;
+
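+	/* Register the socket and spawn the session thread only on the
+	 * first start; eth_dev_stop() flips "once" back so a later start
+	 * can register again.
+	 */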
+	if (rte_atomic16_cmpset(&internal->once, 0, 1)) {
+		ret = rte_vhost_driver_register(internal->iface_name);
+		if (ret)
+			return ret;
+
+		vhost_driver_session_start(internal);
+	}
+	return 0;
+}
+
+static void
+eth_dev_stop(struct rte_eth_dev *dev)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	if (rte_atomic16_cmpset(&internal->once, 1, 0)) {
+		rte_vhost_driver_unregister(internal->iface_name);
+		vhost_driver_session_stop(internal);
+	}
+}
+
+static int
+eth_rx_queue_setup(struct rte_eth_dev *dev, uint16_t rx_queue_id,
+		   uint16_t nb_rx_desc __rte_unused,
+		   unsigned int socket_id,
+		   const struct rte_eth_rxconf *rx_conf __rte_unused,
+		   struct rte_mempool *mb_pool)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+	struct vhost_queue *vq;
+
+	if (internal->rx_vhost_queues[rx_queue_id] != NULL)
+		rte_free(internal->rx_vhost_queues[rx_queue_id]);
+
+	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
+			RTE_CACHE_LINE_SIZE, socket_id);
+	if (vq == NULL) {
+		RTE_LOG(ERR, PMD, "Failed to allocate memory for rx queue\n");
+		return -ENOMEM;
+	}
+
+	vq->mb_pool = mb_pool;
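+	/* PMD RX queue N drains the guest's TX virtqueue (index N * 2 + 1) */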
+	vq->virtqueue_id = rx_queue_id * VIRTIO_QNUM + VIRTIO_TXQ;
+	internal->rx_vhost_queues[rx_queue_id] = vq;
+	dev->data->rx_queues[rx_queue_id] = vq;
+	return 0;
+}
+
+static int
+eth_tx_queue_setup(struct rte_eth_dev *dev, uint16_t tx_queue_id,
+		   uint16_t nb_tx_desc __rte_unused,
+		   unsigned int socket_id,
+		   const struct rte_eth_txconf *tx_conf __rte_unused)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+	struct vhost_queue *vq;
+
+	if (internal->tx_vhost_queues[tx_queue_id] != NULL)
+		rte_free(internal->tx_vhost_queues[tx_queue_id]);
+
+	vq = rte_zmalloc_socket(NULL, sizeof(struct vhost_queue),
+			RTE_CACHE_LINE_SIZE, socket_id);
+	if (vq == NULL) {
+		RTE_LOG(ERR, PMD, "Failed to allocate memory for tx queue\n");
+		return -ENOMEM;
+	}
+
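+	/* PMD TX queue N feeds the guest's RX virtqueue (index N * 2) */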
+	vq->virtqueue_id = tx_queue_id * VIRTIO_QNUM + VIRTIO_RXQ;
+	internal->tx_vhost_queues[tx_queue_id] = vq;
+	dev->data->tx_queues[tx_queue_id] = vq;
+	return 0;
+}
+
+static void
+eth_dev_info(struct rte_eth_dev *dev,
+	     struct rte_eth_dev_info *dev_info)
+{
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	dev_info->driver_name = drivername;
+	dev_info->max_mac_addrs = 1;
+	dev_info->max_rx_pktlen = (uint32_t)-1;
+	dev_info->max_rx_queues = (uint16_t)internal->nb_rx_queues;
+	dev_info->max_tx_queues = (uint16_t)internal->nb_tx_queues;
+	dev_info->min_rx_bufsize = 0;
+}
+
+static void
+eth_stats_get(struct rte_eth_dev *dev, struct rte_eth_stats *stats)
+{
+	unsigned i;
+	unsigned long rx_total = 0, tx_total = 0, tx_err_total = 0;
+	const struct pmd_internal *internal = dev->data->dev_private;
+
+	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
+	     i < internal->nb_rx_queues; i++) {
+		if (internal->rx_vhost_queues[i] == NULL)
+			continue;
+		stats->q_ipackets[i] = internal->rx_vhost_queues[i]->rx_pkts;
+		rx_total += stats->q_ipackets[i];
+	}
+
+	for (i = 0; i < RTE_ETHDEV_QUEUE_STAT_CNTRS &&
+	     i < internal->nb_tx_queues; i++) {
+		if (internal->tx_vhost_queues[i] == NULL)
+			continue;
+		stats->q_opackets[i] = internal->tx_vhost_queues[i]->tx_pkts;
+		stats->q_errors[i] = internal->tx_vhost_queues[i]->err_pkts;
+		tx_total += stats->q_opackets[i];
+		tx_err_total += stats->q_errors[i];
+	}
+
+	stats->ipackets = rx_total;
+	stats->opackets = tx_total;
+	stats->oerrors = tx_err_total;
+}
+
+static void
+eth_stats_reset(struct rte_eth_dev *dev)
+{
+	unsigned i;
+	struct pmd_internal *internal = dev->data->dev_private;
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		if (internal->rx_vhost_queues[i] == NULL)
+			continue;
+		internal->rx_vhost_queues[i]->rx_pkts = 0;
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		if (internal->tx_vhost_queues[i] == NULL)
+			continue;
+		internal->tx_vhost_queues[i]->tx_pkts = 0;
+		internal->tx_vhost_queues[i]->err_pkts = 0;
+	}
+}
+
+static void
+eth_queue_release(void *q __rte_unused) { ; }
+static int
+eth_link_update(struct rte_eth_dev *dev __rte_unused,
+		int wait_to_complete __rte_unused) { return 0; }
+
+static const struct eth_dev_ops ops = {
+	.dev_start = eth_dev_start,
+	.dev_stop = eth_dev_stop,
+	.dev_configure = eth_dev_configure,
+	.dev_infos_get = eth_dev_info,
+	.rx_queue_setup = eth_rx_queue_setup,
+	.tx_queue_setup = eth_tx_queue_setup,
+	.rx_queue_release = eth_queue_release,
+	.tx_queue_release = eth_queue_release,
+	.link_update = eth_link_update,
+	.stats_get = eth_stats_get,
+	.stats_reset = eth_stats_reset,
+};
+
+static int
+eth_dev_vhost_create(const char *name, int index,
+		     char *iface_name,
+		     uint16_t queues,
+		     const unsigned numa_node)
+{
+	struct rte_eth_dev_data *data = NULL;
+	struct pmd_internal *internal = NULL;
+	struct rte_eth_dev *eth_dev = NULL;
+	struct ether_addr *eth_addr = NULL;
+
+	RTE_LOG(INFO, PMD, "Creating VHOST-USER backend on numa socket %u\n",
+		numa_node);
+
+	/* now do all data allocation - for eth_dev structure, dummy pci driver
+	 * and internal (private) data
+	 */
+	data = rte_zmalloc_socket(name, sizeof(*data), 0, numa_node);
+	if (data == NULL)
+		goto error;
+
+	internal = rte_zmalloc_socket(name, sizeof(*internal), 0, numa_node);
+	if (internal == NULL)
+		goto error;
+
+	eth_addr = rte_zmalloc_socket(name, sizeof(*eth_addr), 0, numa_node);
+	if (eth_addr == NULL)
+		goto error;
+	*eth_addr = base_eth_addr;
+	eth_addr->addr_bytes[5] = index;
+
+	/* reserve an ethdev entry */
+	eth_dev = rte_eth_dev_allocate(name, RTE_ETH_DEV_VIRTUAL);
+	if (eth_dev == NULL)
+		goto error;
+
+	/* now put it all together
+	 * - store queue data in internal,
+	 * - store numa_node info in ethdev data
+	 * - point eth_dev_data to internals
+	 * - and point eth_dev structure to new eth_dev_data structure
+	 */
+	internal->nb_rx_queues = queues;
+	internal->nb_tx_queues = queues;
+	internal->dev_name = strdup(name);
+	if (internal->dev_name == NULL)
+		goto error;
+	internal->iface_name = strdup(iface_name);
+	if (internal->iface_name == NULL)
+		goto error;
+
+	pthread_mutex_lock(&internal_list_lock);
+	TAILQ_INSERT_TAIL(&internals_list, internal, next);
+	pthread_mutex_unlock(&internal_list_lock);
+
+	data->dev_private = internal;
+	data->port_id = eth_dev->data->port_id;
+	memmove(data->name, eth_dev->data->name, sizeof(data->name));
+	data->nb_rx_queues = queues;
+	data->nb_tx_queues = queues;
+	data->dev_link = pmd_link;
+	data->mac_addrs = eth_addr;
+
+	/* We'll replace the 'data' originally allocated by eth_dev. So the
+	 * vhost PMD resources won't be shared between multi processes.
+	 */
+	eth_dev->data = data;
+	eth_dev->dev_ops = &ops;
+	eth_dev->driver = NULL;
+	eth_dev->data->dev_flags = RTE_ETH_DEV_DETACHABLE;
+	eth_dev->data->kdrv = RTE_KDRV_NONE;
+	eth_dev->data->drv_name = internal->dev_name;
+	eth_dev->data->numa_node = numa_node;
+
+	/* finally assign rx and tx ops */
+	eth_dev->rx_pkt_burst = eth_vhost_rx;
+	eth_dev->tx_pkt_burst = eth_vhost_tx;
+
+	return data->port_id;
+
+error:
+	rte_free(data);
+	rte_free(internal);
+	rte_free(eth_addr);
+
+	return -1;
+}
+
+static inline int
+open_iface(const char *key __rte_unused, const char *value, void *extra_args)
+{
+	const char **iface_name = extra_args;
+
+	if (value == NULL)
+		return -1;
+
+	*iface_name = value;
+
+	return 0;
+}
+
+static inline int
+open_queues(const char *key __rte_unused, const char *value, void *extra_args)
+{
+	uint16_t *q = extra_args;
+
+	if ((value == NULL) || (extra_args == NULL))
+		return -EINVAL;
+
+	*q = (uint16_t)strtoul(value, NULL, 0);
+	if ((*q == USHRT_MAX) && (errno == ERANGE))
+		return -1;
+
+	if (*q > RTE_MAX_QUEUES_PER_PORT)
+		return -1;
+
+	return 0;
+}
+
+static int
+rte_pmd_vhost_devinit(const char *name, const char *params)
+{
+	struct rte_kvargs *kvlist = NULL;
+	int ret = 0;
+	int index;
+	char *iface_name;
+	uint16_t queues;
+
+	RTE_LOG(INFO, PMD, "Initializing pmd_vhost for %s\n", name);
+
+	kvlist = rte_kvargs_parse(params, valid_arguments);
+	if (kvlist == NULL)
+		return -1;
+
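+	/* The device index is the numeric suffix of the vdev name,
+	 * e.g. "eth_vhost0" -> 0; it seeds the last MAC address octet.
+	 */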
+	if (strlen(name) < strlen("eth_vhost")) {
+		ret = -1;
+		goto out_free;
+	}
+
+	errno = 0;
+	index = strtol(name + strlen("eth_vhost"), NULL, 0);
+	if (errno == ERANGE) {
+		ret = -1;
+		goto out_free;
+	}
+
+	if (rte_kvargs_count(kvlist, ETH_VHOST_IFACE_ARG) == 1) {
+		ret = rte_kvargs_process(kvlist, ETH_VHOST_IFACE_ARG,
+					 &open_iface, &iface_name);
+		if (ret < 0)
+			goto out_free;
+	} else {
+		/* iface is mandatory; without it there is nothing to register */
+		ret = -1;
+		goto out_free;
+	}
+
+	if (rte_kvargs_count(kvlist, ETH_VHOST_QUEUES_ARG) == 1) {
+		ret = rte_kvargs_process(kvlist, ETH_VHOST_QUEUES_ARG,
+					 &open_queues, &queues);
+		if (ret < 0)
+			goto out_free;
+
+	} else
+		queues = 1;
+
+	if (eth_dev_vhost_create(name, index,
+			iface_name, queues, rte_socket_id()) < 0)
+		ret = -1;
+
+out_free:
+	rte_kvargs_free(kvlist);
+	return ret;
+}
+
+static int
+rte_pmd_vhost_devuninit(const char *name)
+{
+	struct rte_eth_dev *eth_dev = NULL;
+	struct pmd_internal *internal;
+	unsigned int i;
+
+	RTE_LOG(INFO, PMD, "Uninitializing pmd_vhost for %s\n", name);
+
+	if (name == NULL)
+		return -EINVAL;
+
+	/* find an ethdev entry */
+	eth_dev = rte_eth_dev_allocated(name);
+	if (eth_dev == NULL)
+		return -ENODEV;
+
+	internal = eth_dev->data->dev_private;
+
+	pthread_mutex_lock(&internal_list_lock);
+	TAILQ_REMOVE(&internals_list, internal, next);
+	pthread_mutex_unlock(&internal_list_lock);
+
+	eth_dev_stop(eth_dev);
+
+	if (internal->dev_name)
+		free(internal->dev_name);
+	if (internal->iface_name)
+		free(internal->iface_name);
+
+	rte_free(eth_dev->data->mac_addrs);
+	rte_free(eth_dev->data);
+
+	for (i = 0; i < internal->nb_rx_queues; i++) {
+		if (internal->rx_vhost_queues[i] != NULL)
+			rte_free(internal->rx_vhost_queues[i]);
+	}
+	for (i = 0; i < internal->nb_tx_queues; i++) {
+		if (internal->tx_vhost_queues[i] != NULL)
+			rte_free(internal->tx_vhost_queues[i]);
+	}
+	rte_free(internal);
+
+	rte_eth_dev_release_port(eth_dev);
+
+	return 0;
+}
+
+static struct rte_driver pmd_vhost_drv = {
+	.name = "eth_vhost",
+	.type = PMD_VDEV,
+	.init = rte_pmd_vhost_devinit,
+	.uninit = rte_pmd_vhost_devuninit,
+};
+
+struct virtio_net *
+rte_eth_vhost_portid2vdev(uint16_t port_id)
+{
+	struct rte_eth_dev *eth_dev;
+
+	if (rte_eth_dev_is_valid_port(port_id) == 0)
+		return NULL;
+
+	eth_dev = &rte_eth_devices[port_id];
+	if (strncmp("eth_vhost", eth_dev->data->drv_name,
+				strlen("eth_vhost")) == 0) {
+		struct pmd_internal *internal;
+		struct vhost_queue *vq;
+
+		internal = eth_dev->data->dev_private;
+		vq = internal->rx_vhost_queues[0];
+		if ((vq != NULL) && (vq->device != NULL))
+			return vq->device;
+	}
+
+	return NULL;
+}
+
+PMD_REGISTER_DRIVER(pmd_vhost_drv);
diff --git a/drivers/net/vhost/rte_eth_vhost.h b/drivers/net/vhost/rte_eth_vhost.h
new file mode 100644
index 0000000..22a880f
--- /dev/null
+++ b/drivers/net/vhost/rte_eth_vhost.h
@@ -0,0 +1,65 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright(c) 2015 IGEL Co., Ltd.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of IGEL Co., Ltd. nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#ifndef _RTE_ETH_VHOST_H_
+#define _RTE_ETH_VHOST_H_
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#include <rte_virtio_net.h>
+
+/**
+ * The function converts the specified port_id to a virtio device structure.
+ * The returned device can be used with the vhost library APIs.
+ * To use the vhost library APIs and the vhost PMD in parallel, the
+ * following API must not be called, because the vhost PMD calls it itself:
+ * - rte_vhost_driver_session_start()
+ * Once a device is managed by the vhost PMD, the following API must not be
+ * called either:
+ * - rte_vhost_driver_unregister()
+ * To unregister the device, use the Port Hotplug APIs instead.
+ *
+ * @param port_id
+ *  port number
+ * @return
+ *  virtio net device structure corresponding to the specified port;
+ *  NULL is returned in error cases.
+ */
+struct virtio_net *rte_eth_vhost_portid2vdev(uint16_t port_id);
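+
+/*
+ * Minimal usage sketch (hypothetical application code; assumes "port_id"
+ * identifies a port created by this PMD):
+ *
+ *	struct virtio_net *vdev = rte_eth_vhost_portid2vdev(port_id);
+ *
+ *	if (vdev != NULL)
+ *		printf("backed by vhost iface %s\n", vdev->ifname);
+ */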
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif
diff --git a/drivers/net/vhost/rte_pmd_vhost_version.map b/drivers/net/vhost/rte_pmd_vhost_version.map
new file mode 100644
index 0000000..bf0361a
--- /dev/null
+++ b/drivers/net/vhost/rte_pmd_vhost_version.map
@@ -0,0 +1,8 @@ 
+DPDK_2.2 {
+
+	global:
+
+	rte_eth_vhost_portid2vdev;
+
+	local: *;
+};
diff --git a/mk/rte.app.mk b/mk/rte.app.mk
index 724efa7..1af4bb3 100644
--- a/mk/rte.app.mk
+++ b/mk/rte.app.mk
@@ -148,7 +148,13 @@  _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_PCAP)       += -lrte_pmd_pcap
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_AF_PACKET)  += -lrte_pmd_af_packet
 _LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_NULL)       += -lrte_pmd_null
 
-endif # ! $(CONFIG_RTE_BUILD_SHARED_LIB)
+ifeq ($(CONFIG_RTE_LIBRTE_VHOST),y)
+
+_LDLIBS-$(CONFIG_RTE_LIBRTE_PMD_VHOST)      += -lrte_pmd_vhost
+
+endif # $(CONFIG_RTE_LIBRTE_VHOST)
+
+endif # ! $(CONFIG_RTE_BUILD_SHARED_LIB)
 
 endif # ! CONFIG_RTE_BUILD_COMBINE_LIBS