[dpdk-dev,05/18] mbuf: add function to get packet type from data

Message ID 1467733310-20875-6-git-send-email-olivier.matz@6wind.com (mailing list archive)
State Superseded, archived
Headers

Commit Message

Olivier Matz July 5, 2016, 3:41 p.m. UTC
  Introduce the function rte_pktmbuf_get_ptype() that parses an
mbuf and returns its packet type. For now, the following packet
types are parsed:
   L2: Ether
   L3: IPv4, IPv6
   L4: TCP, UDP, SCTP

The goal here is to provide a reference implementation for packet type
parsing. This function will be used by testpmd in the next commits, making
it possible to compare its result with the value given by the hardware.

This function will also be useful when implementing Rx offload support
in the virtio pmd. Indeed, the virtio protocol gives the csum start and
offset, but it neither gives the L4 protocol nor tells whether the
checksum applies to the inner or the outer header. This information has
to be known to properly set the ol_flags in the mbuf.

Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
Signed-off-by: Jean Dao <jean.dao@6wind.com>
Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
---
 doc/guides/rel_notes/release_16_11.rst |   5 +
 lib/librte_mbuf/Makefile               |   5 +-
 lib/librte_mbuf/rte_mbuf_ptype.c       | 234 +++++++++++++++++++++++++++++++++
 lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
 lib/librte_mbuf/rte_mbuf_version.map   |   1 +
 5 files changed, 286 insertions(+), 2 deletions(-)
 create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
  

Comments

Cunming Liang July 6, 2016, 6:44 a.m. UTC | #1
Hi Olivier,

On 7/5/2016 11:41 PM, Olivier Matz wrote:
> Introduce the function rte_pktmbuf_get_ptype() that parses a
> mbuf and returns its packet type. For now, the following packet
> types are parsed:
>     L2: Ether
>     L3: IPv4, IPv6
>     L4: TCP, UDP, SCTP
>
> The goal here is to provide a reference implementation for packet type
> parsing. This function will be used by testpmd in next commits, allowing
> to compare its result with the value given by the hardware.
>
> This function will also be useful when implementing Rx offload support
> in virtio pmd. Indeed, the virtio protocol gives the csum start and
> offset, but it does not give the L4 protocol nor it tells if the
> checksum is relevant for inner or outer. This information has to be
> known to properly set the ol_flags in mbuf.
>
> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> Signed-off-by: Jean Dao <jean.dao@6wind.com>
> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> ---
>   doc/guides/rel_notes/release_16_11.rst |   5 +
>   lib/librte_mbuf/Makefile               |   5 +-
>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234 +++++++++++++++++++++++++++++++++
>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
>   5 files changed, 286 insertions(+), 2 deletions(-)
>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
>
> [...]
> +
> +/* parse mbuf data to get packet type */
> +uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
> +	struct rte_mbuf_hdr_lens *hdr_lens)
> +{
> +	struct rte_mbuf_hdr_lens local_hdr_lens;
> +	const struct ether_hdr *eh;
> +	struct ether_hdr eh_copy;
> +	uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
> +	uint32_t off = 0;
> +	uint16_t proto;
> +
> +	if (hdr_lens == NULL)
> +		hdr_lens = &local_hdr_lens;
> +
> +	eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
> +	if (unlikely(eh == NULL))
> +		return 0;
> +	proto = eh->ether_type;
> +	off = sizeof(*eh);
> +	hdr_lens->l2_len = off;
> +
> +	if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
> +		const struct ipv4_hdr *ip4h;
> +		struct ipv4_hdr ip4h_copy;
> +
> +		ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
> +		if (unlikely(ip4h == NULL))
> +			return pkt_type;
> +
> +		pkt_type |= ptype_l3_ip(ip4h->version_ihl);
> +		hdr_lens->l3_len = ip4_hlen(ip4h);
> +		off += hdr_lens->l3_len;
> +		if (ip4h->fragment_offset &
> +				rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
> +					IPV4_HDR_MF_FLAG)) {
> +			pkt_type |= RTE_PTYPE_L4_FRAG;
> +			hdr_lens->l4_len = 0;
> +			return pkt_type;
> +		}
> +		proto = ip4h->next_proto_id;
> +		pkt_type |= ptype_l4(proto);
> +	} else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
> +		const struct ipv6_hdr *ip6h;
> +		struct ipv6_hdr ip6h_copy;
> +		int frag = 0;
> +
> +		ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
> +		if (unlikely(ip6h == NULL))
> +			return pkt_type;
> +
> +		proto = ip6h->proto;
> +		hdr_lens->l3_len = sizeof(*ip6h);
> +		off += hdr_lens->l3_len;
> +		pkt_type |= ptype_l3_ip6(proto);
> +		if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
> +			proto = skip_ip6_ext(proto, m, &off, &frag);
> +			hdr_lens->l3_len = off - hdr_lens->l2_len;
> +		}
> +		if (proto == 0)
> +			return pkt_type;
> +		if (frag) {
> +			pkt_type |= RTE_PTYPE_L4_FRAG;
> +			hdr_lens->l4_len = 0;
> +			return pkt_type;
> +		}
> +		pkt_type |= ptype_l4(proto);
> +	}
> +
> +	if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
> +		hdr_lens->l4_len = sizeof(struct udp_hdr);
> +	} else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
> +		const struct tcp_hdr *th;
> +		struct tcp_hdr th_copy;
> +
> +		th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
> +		if (unlikely(th == NULL))
> +			return pkt_type & (RTE_PTYPE_L2_MASK |
> +				RTE_PTYPE_L3_MASK);
> +		hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
> +	} else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
> +		hdr_lens->l4_len = sizeof(struct sctp_hdr);
> +	} else {
> +		hdr_lens->l4_len = 0;
> +	}
> +
> +	return pkt_type;
> +}
> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h
> index 4a34678..f468520 100644
> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> @@ -545,6 +545,49 @@ extern "C" {
>   		RTE_PTYPE_INNER_L3_MASK |				\
>   		RTE_PTYPE_INNER_L4_MASK))
>   
> +struct rte_mbuf;
> +
> +/**
> + * Structure containing header lengths associated to a packet.
> + */
> +struct rte_mbuf_hdr_lens {
> +	uint8_t l2_len;
> +	uint8_t l3_len;
> +	uint8_t l4_len;
> +	uint8_t tunnel_len;
> +	uint8_t inner_l2_len;
> +	uint8_t inner_l3_len;
> +	uint8_t inner_l4_len;
> +};
[LC] The header parsing graph is usually not unique. This definition
may be fine for basic IP and L4 tunnels.
However, it can't scale to other cases, e.g. QinQ, MAC-in-MAC, MPLS
L2/L3 tunnels.
The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
"struct rte_mbuf_hdr_lens" form a pair for one specific parsing scheme.
In this case, the fixed function supports only the following:

+ * Supported packet types are:
+ *   L2: Ether
+ *   L3: IPv4, IPv6
+ *   L4: TCP, UDP, SCTP

Of course, more packet type detection logic can be added in the future. But
the more types are supported, the higher the cost.

One alternative is to allow registering parser pairs. The application
decides whether to use the predefined scheme (provided by the DPDK library)
or to define its own parsing logic.
In this way, a scheme can make assumptions for its specific case and
skip unnecessary graph detection.
In addition, besides the SW parser, the HW parser (identified by
packet_type in the mbuf) can be turned on/off in the same manner.

Thanks.
> +
> +/**
> + * Parse an Ethernet packet to get its packet type.
> + *
> + * This function parses the network headers in mbuf data and return its
> + * packet type.
> + *
> + * If it is provided by the user, it also fills a rte_mbuf_hdr_lens
> + * structure that contains the lengths of the parsed network
> + * headers. Each length field is valid only if the associated packet
> + * type is set. For instance, hdr_lens->l2_len is valid only if
> + * (retval & RTE_PTYPE_L2_MASK) != RTE_PTYPE_UNKNOWN.
> + *
> + * Supported packet types are:
> + *   L2: Ether
> + *   L3: IPv4, IPv6
> + *   L4: TCP, UDP, SCTP
> + *
> + * @param m
> + *   The packet mbuf to be parsed.
> + * @param hdr_lens
> + *   A pointer to a structure where the header lengths will be returned,
> + *   or NULL.
> + * @return
> + *   The packet type of the packet.
> + */
> +uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
> +	struct rte_mbuf_hdr_lens *hdr_lens);
> +
>   #ifdef __cplusplus
>   }
>   #endif
> diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
> index 79e4dd8..416af8e 100644
> --- a/lib/librte_mbuf/rte_mbuf_version.map
> +++ b/lib/librte_mbuf/rte_mbuf_version.map
> @@ -23,5 +23,6 @@ DPDK_16.11 {
>   	global:
>   
>   	__rte_pktmbuf_read;
> +	rte_pktmbuf_get_ptype;
>   
>   } DPDK_2.1;
  
Olivier Matz July 6, 2016, 7:42 a.m. UTC | #2
Hi Cunming,

On 07/06/2016 08:44 AM, Liang, Cunming wrote:
> Hi Olivier,
> 
> On 7/5/2016 11:41 PM, Olivier Matz wrote:
>> Introduce the function rte_pktmbuf_get_ptype() that parses a
>> mbuf and returns its packet type. For now, the following packet
>> types are parsed:
>>     L2: Ether
>>     L3: IPv4, IPv6
>>     L4: TCP, UDP, SCTP
>>
>> The goal here is to provide a reference implementation for packet type
>> parsing. This function will be used by testpmd in next commits, allowing
>> to compare its result with the value given by the hardware.
>>
>> This function will also be useful when implementing Rx offload support
>> in virtio pmd. Indeed, the virtio protocol gives the csum start and
>> offset, but it does not give the L4 protocol nor it tells if the
>> checksum is relevant for inner or outer. This information has to be
>> known to properly set the ol_flags in mbuf.
>>
>> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
>> Signed-off-by: Jean Dao <jean.dao@6wind.com>
>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>> ---
>>   doc/guides/rel_notes/release_16_11.rst |   5 +
>>   lib/librte_mbuf/Makefile               |   5 +-
>>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
>> +++++++++++++++++++++++++++++++++
>>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
>>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
>>   5 files changed, 286 insertions(+), 2 deletions(-)
>>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
>>
>> [...]
>> +
>> +/* parse mbuf data to get packet type */
>> +uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
>> +    struct rte_mbuf_hdr_lens *hdr_lens)
>> +{
>> +    struct rte_mbuf_hdr_lens local_hdr_lens;
>> +    const struct ether_hdr *eh;
>> +    struct ether_hdr eh_copy;
>> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
>> +    uint32_t off = 0;
>> +    uint16_t proto;
>> +
>> +    if (hdr_lens == NULL)
>> +        hdr_lens = &local_hdr_lens;
>> +
>> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
>> +    if (unlikely(eh == NULL))
>> +        return 0;
>> +    proto = eh->ether_type;
>> +    off = sizeof(*eh);
>> +    hdr_lens->l2_len = off;
>> +
>> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
>> +        const struct ipv4_hdr *ip4h;
>> +        struct ipv4_hdr ip4h_copy;
>> +
>> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
>> +        if (unlikely(ip4h == NULL))
>> +            return pkt_type;
>> +
>> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
>> +        hdr_lens->l3_len = ip4_hlen(ip4h);
>> +        off += hdr_lens->l3_len;
>> +        if (ip4h->fragment_offset &
>> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
>> +                    IPV4_HDR_MF_FLAG)) {
>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
>> +            hdr_lens->l4_len = 0;
>> +            return pkt_type;
>> +        }
>> +        proto = ip4h->next_proto_id;
>> +        pkt_type |= ptype_l4(proto);
>> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
>> +        const struct ipv6_hdr *ip6h;
>> +        struct ipv6_hdr ip6h_copy;
>> +        int frag = 0;
>> +
>> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
>> +        if (unlikely(ip6h == NULL))
>> +            return pkt_type;
>> +
>> +        proto = ip6h->proto;
>> +        hdr_lens->l3_len = sizeof(*ip6h);
>> +        off += hdr_lens->l3_len;
>> +        pkt_type |= ptype_l3_ip6(proto);
>> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
>> +            proto = skip_ip6_ext(proto, m, &off, &frag);
>> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
>> +        }
>> +        if (proto == 0)
>> +            return pkt_type;
>> +        if (frag) {
>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
>> +            hdr_lens->l4_len = 0;
>> +            return pkt_type;
>> +        }
>> +        pkt_type |= ptype_l4(proto);
>> +    }
>> +
>> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
>> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
>> +        const struct tcp_hdr *th;
>> +        struct tcp_hdr th_copy;
>> +
>> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
>> +        if (unlikely(th == NULL))
>> +            return pkt_type & (RTE_PTYPE_L2_MASK |
>> +                RTE_PTYPE_L3_MASK);
>> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
>> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
>> +    } else {
>> +        hdr_lens->l4_len = 0;
>> +    }
>> +
>> +    return pkt_type;
>> +}
>> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
>> b/lib/librte_mbuf/rte_mbuf_ptype.h
>> index 4a34678..f468520 100644
>> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
>> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
>> @@ -545,6 +545,49 @@ extern "C" {
>>           RTE_PTYPE_INNER_L3_MASK |                \
>>           RTE_PTYPE_INNER_L4_MASK))
>>   +struct rte_mbuf;
>> +
>> +/**
>> + * Structure containing header lengths associated to a packet.
>> + */
>> +struct rte_mbuf_hdr_lens {
>> +    uint8_t l2_len;
>> +    uint8_t l3_len;
>> +    uint8_t l4_len;
>> +    uint8_t tunnel_len;
>> +    uint8_t inner_l2_len;
>> +    uint8_t inner_l3_len;
>> +    uint8_t inner_l4_len;
>> +};
> [LC] The header parsing graph usually is not unique. The definition
> maybe nice for the basic IP and L4 tunnel.
> However it can't scale out to other cases, e.g. qinq, mac-in-mac, mpls
> l2/l3 tunnel.
> The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
> "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
> In this case, the fixed function is to support below.
> 
> + * Supported packet types are:
> + *   L2: Ether
> + *   L3: IPv4, IPv6
> + *   L4: TCP, UDP, SCTP
> 
> Of course, it can add more packet type detection logic in future. But
> the more support, the higher the cost.
> 
> One of the alternative way is to allow registering parser pair. APP
> decides to choose the predefined scheme(by DPDK LIB), or to self-define
> the parsing logic.
> In this way, the scheme can do some assumption for the specific case and
> ignore some useless graph detection.
> In addition, besides the SW parser, the HW parser(identified by
> packet_type in mbuf) can be turn on/off by leveraging the same manner.


Sorry, I'm not sure I fully understand what you are saying. If I
understand correctly, you would like to have something more flexible that
supports registering the protocols to be recognized?

I'm not sure a function with a dynamic registration method would
really increase performance compared to a complete static function.
Actually, we will never support tons of protocols, since each layer's
packet type is 4 bits, and since a packet type requires that at least
one hw supports it.

As described in the cover letter, the 2 main goals of this patchset are
to provide a reference implementation for packet type recognition, and
enable the support of virtio offloads (I'll send the patchset soon).
This function is adapted to these 2 usages. Are you thinking of another
use-case that would not be covered?

Regards,
Olivier
  
Chilikin, Andrey July 6, 2016, 11:59 a.m. UTC | #3
Hi Olivier,

> -----Original Message-----
> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Olivier MATZ
> Sent: Wednesday, July 6, 2016 8:43 AM
> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
> from data
> 
> Hi Cunming,
> 
> On 07/06/2016 08:44 AM, Liang, Cunming wrote:
> > Hi Olivier,
> >
> > On 7/5/2016 11:41 PM, Olivier Matz wrote:
> >> Introduce the function rte_pktmbuf_get_ptype() that parses a mbuf and
> >> returns its packet type. For now, the following packet types are
> >> parsed:
> >>     L2: Ether
> >>     L3: IPv4, IPv6
> >>     L4: TCP, UDP, SCTP
> >>
> >> The goal here is to provide a reference implementation for packet
> >> type parsing. This function will be used by testpmd in next commits,
> >> allowing to compare its result with the value given by the hardware.
> >>
> >> This function will also be useful when implementing Rx offload
> >> support in virtio pmd. Indeed, the virtio protocol gives the csum
> >> start and offset, but it does not give the L4 protocol nor it tells
> >> if the checksum is relevant for inner or outer. This information has
> >> to be known to properly set the ol_flags in mbuf.
> >>
> >> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> >> Signed-off-by: Jean Dao <jean.dao@6wind.com>
> >> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> >> ---
> >>   doc/guides/rel_notes/release_16_11.rst |   5 +
> >>   lib/librte_mbuf/Makefile               |   5 +-
> >>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
> >> +++++++++++++++++++++++++++++++++
> >>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
> >>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
> >>   5 files changed, 286 insertions(+), 2 deletions(-)
> >>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
> >>
> >> [...]
> >> +
> >> +/* parse mbuf data to get packet type */ uint32_t
> >> +rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
> >> +    struct rte_mbuf_hdr_lens *hdr_lens) {
> >> +    struct rte_mbuf_hdr_lens local_hdr_lens;
> >> +    const struct ether_hdr *eh;
> >> +    struct ether_hdr eh_copy;
> >> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
> >> +    uint32_t off = 0;
> >> +    uint16_t proto;
> >> +
> >> +    if (hdr_lens == NULL)
> >> +        hdr_lens = &local_hdr_lens;
> >> +
> >> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
> >> +    if (unlikely(eh == NULL))
> >> +        return 0;
> >> +    proto = eh->ether_type;
> >> +    off = sizeof(*eh);
> >> +    hdr_lens->l2_len = off;
> >> +
> >> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
> >> +        const struct ipv4_hdr *ip4h;
> >> +        struct ipv4_hdr ip4h_copy;
> >> +
> >> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
> >> +        if (unlikely(ip4h == NULL))
> >> +            return pkt_type;
> >> +
> >> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
> >> +        hdr_lens->l3_len = ip4_hlen(ip4h);
> >> +        off += hdr_lens->l3_len;
> >> +        if (ip4h->fragment_offset &
> >> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
> >> +                    IPV4_HDR_MF_FLAG)) {
> >> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >> +            hdr_lens->l4_len = 0;
> >> +            return pkt_type;
> >> +        }
> >> +        proto = ip4h->next_proto_id;
> >> +        pkt_type |= ptype_l4(proto);
> >> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
> >> +        const struct ipv6_hdr *ip6h;
> >> +        struct ipv6_hdr ip6h_copy;
> >> +        int frag = 0;
> >> +
> >> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
> >> +        if (unlikely(ip6h == NULL))
> >> +            return pkt_type;
> >> +
> >> +        proto = ip6h->proto;
> >> +        hdr_lens->l3_len = sizeof(*ip6h);
> >> +        off += hdr_lens->l3_len;
> >> +        pkt_type |= ptype_l3_ip6(proto);
> >> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
> >> +            proto = skip_ip6_ext(proto, m, &off, &frag);
> >> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
> >> +        }
> >> +        if (proto == 0)
> >> +            return pkt_type;
> >> +        if (frag) {
> >> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >> +            hdr_lens->l4_len = 0;
> >> +            return pkt_type;
> >> +        }
> >> +        pkt_type |= ptype_l4(proto);
> >> +    }
> >> +
> >> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
> >> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
> >> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
> >> +        const struct tcp_hdr *th;
> >> +        struct tcp_hdr th_copy;
> >> +
> >> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
> >> +        if (unlikely(th == NULL))
> >> +            return pkt_type & (RTE_PTYPE_L2_MASK |
> >> +                RTE_PTYPE_L3_MASK);
> >> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
> >> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
> >> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
> >> +    } else {
> >> +        hdr_lens->l4_len = 0;
> >> +    }
> >> +
> >> +    return pkt_type;
> >> +}
> >> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
> >> b/lib/librte_mbuf/rte_mbuf_ptype.h
> >> index 4a34678..f468520 100644
> >> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> >> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> >> @@ -545,6 +545,49 @@ extern "C" {
> >>           RTE_PTYPE_INNER_L3_MASK |                \
> >>           RTE_PTYPE_INNER_L4_MASK))
> >>   +struct rte_mbuf;
> >> +
> >> +/**
> >> + * Structure containing header lengths associated to a packet.
> >> + */
> >> +struct rte_mbuf_hdr_lens {
> >> +    uint8_t l2_len;
> >> +    uint8_t l3_len;
> >> +    uint8_t l4_len;
> >> +    uint8_t tunnel_len;
> >> +    uint8_t inner_l2_len;
> >> +    uint8_t inner_l3_len;
> >> +    uint8_t inner_l4_len;
> >> +};
> > [LC] The header parsing graph usually is not unique. The definition
> > maybe nice for the basic IP and L4 tunnel.
> > However it can't scale out to other cases, e.g. qinq, mac-in-mac, mpls
> > l2/l3 tunnel.
> > The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
> > "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
> > In this case, the fixed function is to support below.
> >
> > + * Supported packet types are:
> > + *   L2: Ether
> > + *   L3: IPv4, IPv6
> > + *   L4: TCP, UDP, SCTP
> >
> > Of course, it can add more packet type detection logic in future. But
> > the more support, the higher the cost.
> >
> > One of the alternative way is to allow registering parser pair. APP
> > decides to choose the predefined scheme(by DPDK LIB), or to
> > self-define the parsing logic.
> > In this way, the scheme can do some assumption for the specific case
> > and ignore some useless graph detection.
> > In addition, besides the SW parser, the HW parser(identified by
> > packet_type in mbuf) can be turn on/off by leveraging the same manner.
> 
> Sorry, I'm not sure I'm fully getting what you are saying. If I understand well,
> you would like to have something more flexible that supports the registration of
> protocol to be recognized?
> 
> I'm not sure having a function with a dynamic registration method would really
> increase the performance compared to a static complete function.
> Actually, we will never support a tons of protocols since each layer packet type
> is 4 bits, and since it requires that at least one hw supports it.

This patch will be very useful as a reference implementation, but it also highlights an issue with the current implementation of packet type reporting by HW and SW - as you just mentioned, there are only 4 bits for each layer. As these 4 bits are used as an enumeration, it is impossible to report multiple headers located on the same layer. MPLS is one example: different packets could have different numbers of MPLS labels, but it is impossible to report this using the current packet_type structure.

It is possible, however, to program HW to report user (application) specific packet types. For example, for IPoMPLS with one MPLS label, HW will report packet type A, but for IPoMPLS with two MPLS labels, HW will report packet type B. In this case, instead of defining and supporting tons of statically defined (or enumerated) combinations of protocol headers, the application will register the packet types it expects from HW in addition to the standard packet types. At the moment, the high bits of packet_type are reserved, so one possible solution would be to use the highest bit to indicate that this is a user-defined packet_type, specific to the application. It could then be used with both the HW and the SW parser. For example, packet_type 0x8000000A is IPoMPLS with one MPLS label, 0x8000000B is IPoMPLS with two MPLS labels, and so on.

Regards,
Andrey
> 
> As described in the cover letter, the 2 main goals of this patchset are to provide
> a reference implementation for packet type recognition, and enable the
> support of virtio offloads (I'll send the patchset soon).
> This function is adapted to these 2 usages. Are you thinking of another use-case
> that would not be covered?

> 
> Regards,
> Olivier
  
Olivier Matz July 6, 2016, 12:08 p.m. UTC | #4
Hi Andrey,

On 07/06/2016 01:59 PM, Chilikin, Andrey wrote:
> Hi Oliver,
> 
>> -----Original Message-----
>> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Olivier MATZ
>> Sent: Wednesday, July 6, 2016 8:43 AM
>> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
>> from data
>>
>> Hi Cunming,
>>
>> On 07/06/2016 08:44 AM, Liang, Cunming wrote:
>>> Hi Olivier,
>>>
>>> On 7/5/2016 11:41 PM, Olivier Matz wrote:
>>>> Introduce the function rte_pktmbuf_get_ptype() that parses a mbuf and
>>>> returns its packet type. For now, the following packet types are
>>>> parsed:
>>>>     L2: Ether
>>>>     L3: IPv4, IPv6
>>>>     L4: TCP, UDP, SCTP
>>>>
>>>> The goal here is to provide a reference implementation for packet
>>>> type parsing. This function will be used by testpmd in next commits,
>>>> allowing to compare its result with the value given by the hardware.
>>>>
>>>> This function will also be useful when implementing Rx offload
>>>> support in virtio pmd. Indeed, the virtio protocol gives the csum
>>>> start and offset, but it does not give the L4 protocol nor it tells
>>>> if the checksum is relevant for inner or outer. This information has
>>>> to be known to properly set the ol_flags in mbuf.
>>>>
>>>> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
>>>> Signed-off-by: Jean Dao <jean.dao@6wind.com>
>>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>>>> ---
>>>>   doc/guides/rel_notes/release_16_11.rst |   5 +
>>>>   lib/librte_mbuf/Makefile               |   5 +-
>>>>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
>>>> +++++++++++++++++++++++++++++++++
>>>>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
>>>>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
>>>>   5 files changed, 286 insertions(+), 2 deletions(-)
>>>>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
>>>>
>>>> [...]
>>>> +
>>>> +/* parse mbuf data to get packet type */ uint32_t
>>>> +rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
>>>> +    struct rte_mbuf_hdr_lens *hdr_lens) {
>>>> +    struct rte_mbuf_hdr_lens local_hdr_lens;
>>>> +    const struct ether_hdr *eh;
>>>> +    struct ether_hdr eh_copy;
>>>> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
>>>> +    uint32_t off = 0;
>>>> +    uint16_t proto;
>>>> +
>>>> +    if (hdr_lens == NULL)
>>>> +        hdr_lens = &local_hdr_lens;
>>>> +
>>>> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
>>>> +    if (unlikely(eh == NULL))
>>>> +        return 0;
>>>> +    proto = eh->ether_type;
>>>> +    off = sizeof(*eh);
>>>> +    hdr_lens->l2_len = off;
>>>> +
>>>> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
>>>> +        const struct ipv4_hdr *ip4h;
>>>> +        struct ipv4_hdr ip4h_copy;
>>>> +
>>>> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
>>>> +        if (unlikely(ip4h == NULL))
>>>> +            return pkt_type;
>>>> +
>>>> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
>>>> +        hdr_lens->l3_len = ip4_hlen(ip4h);
>>>> +        off += hdr_lens->l3_len;
>>>> +        if (ip4h->fragment_offset &
>>>> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
>>>> +                    IPV4_HDR_MF_FLAG)) {
>>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
>>>> +            hdr_lens->l4_len = 0;
>>>> +            return pkt_type;
>>>> +        }
>>>> +        proto = ip4h->next_proto_id;
>>>> +        pkt_type |= ptype_l4(proto);
>>>> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
>>>> +        const struct ipv6_hdr *ip6h;
>>>> +        struct ipv6_hdr ip6h_copy;
>>>> +        int frag = 0;
>>>> +
>>>> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
>>>> +        if (unlikely(ip6h == NULL))
>>>> +            return pkt_type;
>>>> +
>>>> +        proto = ip6h->proto;
>>>> +        hdr_lens->l3_len = sizeof(*ip6h);
>>>> +        off += hdr_lens->l3_len;
>>>> +        pkt_type |= ptype_l3_ip6(proto);
>>>> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
>>>> +            proto = skip_ip6_ext(proto, m, &off, &frag);
>>>> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
>>>> +        }
>>>> +        if (proto == 0)
>>>> +            return pkt_type;
>>>> +        if (frag) {
>>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
>>>> +            hdr_lens->l4_len = 0;
>>>> +            return pkt_type;
>>>> +        }
>>>> +        pkt_type |= ptype_l4(proto);
>>>> +    }
>>>> +
>>>> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
>>>> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
>>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
>>>> +        const struct tcp_hdr *th;
>>>> +        struct tcp_hdr th_copy;
>>>> +
>>>> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
>>>> +        if (unlikely(th == NULL))
>>>> +            return pkt_type & (RTE_PTYPE_L2_MASK |
>>>> +                RTE_PTYPE_L3_MASK);
>>>> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
>>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
>>>> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
>>>> +    } else {
>>>> +        hdr_lens->l4_len = 0;
>>>> +    }
>>>> +
>>>> +    return pkt_type;
>>>> +}
>>>> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> b/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> index 4a34678..f468520 100644
>>>> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> @@ -545,6 +545,49 @@ extern "C" {
>>>>           RTE_PTYPE_INNER_L3_MASK |                \
>>>>           RTE_PTYPE_INNER_L4_MASK))
>>>>   +struct rte_mbuf;
>>>> +
>>>> +/**
>>>> + * Structure containing header lengths associated to a packet.
>>>> + */
>>>> +struct rte_mbuf_hdr_lens {
>>>> +    uint8_t l2_len;
>>>> +    uint8_t l3_len;
>>>> +    uint8_t l4_len;
>>>> +    uint8_t tunnel_len;
>>>> +    uint8_t inner_l2_len;
>>>> +    uint8_t inner_l3_len;
>>>> +    uint8_t inner_l4_len;
>>>> +};
>>> [LC] The header parsing graph usually is not unique. The definition
>>> maybe nice for the basic IP and L4 tunnel.
>>> However it can't scale out to other cases, e.g. qinq, mac-in-mac, mpls
>>> l2/l3 tunnel.
>>> The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
>>> "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
>>> In this case, the fixed function is to support below.
>>>
>>> + * Supported packet types are:
>>> + *   L2: Ether
>>> + *   L3: IPv4, IPv6
>>> + *   L4: TCP, UDP, SCTP
>>>
>>> Of course, it can add more packet type detection logic in future. But
>>> the more support, the higher the cost.
>>>
>>> One of the alternative way is to allow registering parser pair. APP
>>> decides to choose the predefined scheme(by DPDK LIB), or to
>>> self-define the parsing logic.
>>> In this way, the scheme can do some assumption for the specific case
>>> and ignore some useless graph detection.
>>> In addition, besides the SW parser, the HW parser(identified by
>>> packet_type in mbuf) can be turn on/off by leveraging the same manner.
>>
>> Sorry, I'm not sure I'm fully getting what you are saying. If I understand well,
>> you would like to have something more flexible that supports the registration of
>> protocol to be recognized?
>>
>> I'm not sure having a function with a dynamic registration method would really
>> increase the performance compared to a static complete function.
>> Actually, we will never support a tons of protocols since each layer packet type
>> is 4 bits, and since it requires that at least one hw supports it.
> 
> This patch will be very useful as a reference implementation, but it also highlights an issue with the current implementation of packet types reporting by HW and SW - as you just mentioned there are only 4 bits per each layer. As these 4 bit are used as a enumeration it is impossible to reports multiple headers located on the same layer. MPLS is one example, different packets could have different numbers of MPLS labels, but it is impossible to report using current packet_type structure.
> 
> It is possible, however, to  program HW to report user (application) specific packet types. For example, for IPoMPLS with one MPLS label, HW will report packet type A, but for IPoMPLS with two MPLS labels HW will reports packet type B. In this case, instead of defining and supporting tons of statically defined (or enumerated) protocol headers combinations, application will register packet types it expects from HW in addition to standard packet types. At the moment we  have high bits of packet_type reserved, so one possible solution would be to use the highest bit to indicate that this is user defined packet_type, specific to the application. Then it could be used with HW and with SW parser. For example, packet_type 0x8000000A is IPoMPLS with one MPLS label, 0x8000000B is IPoMPLS with two MPLS labels and so on.

Thank you for the explanation. From your description, I wonder if the
flow director API recently proposed by Adrien [1] wouldn't solve this issue?

[1] http://dpdk.org/ml/archives/dev/2016-July/043365.html

Regards,
Olivier
  
Chilikin, Andrey July 6, 2016, 12:21 p.m. UTC | #5
Hi Olivier,

> -----Original Message-----
> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> Sent: Wednesday, July 6, 2016 1:09 PM
> To: Chilikin, Andrey <andrey.chilikin@intel.com>; Liang, Cunming
> <cunming.liang@intel.com>; dev@dpdk.org
> Cc: Ananyev, Konstantin <konstantin.ananyev@intel.com>
> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
> from data
> 
> Hi Andrey,
> 
> On 07/06/2016 01:59 PM, Chilikin, Andrey wrote:
> > Hi Oliver,
> >
> >> -----Original Message-----
> >> From: dev [mailto:dev-bounces@dpdk.org] On Behalf Of Olivier MATZ
> >> Sent: Wednesday, July 6, 2016 8:43 AM
> >> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get
> >> packet type from data
> >>
> >> Hi Cunming,
> >>
> >> On 07/06/2016 08:44 AM, Liang, Cunming wrote:
> >>> Hi Olivier,
> >>>
> >>> On 7/5/2016 11:41 PM, Olivier Matz wrote:
> >>>> Introduce the function rte_pktmbuf_get_ptype() that parses a mbuf
> >>>> and returns its packet type. For now, the following packet types
> >>>> are
> >>>> parsed:
> >>>>     L2: Ether
> >>>>     L3: IPv4, IPv6
> >>>>     L4: TCP, UDP, SCTP
> >>>>
> >>>> The goal here is to provide a reference implementation for packet
> >>>> type parsing. This function will be used by testpmd in next
> >>>> commits, allowing to compare its result with the value given by the
> hardware.
> >>>>
> >>>> This function will also be useful when implementing Rx offload
> >>>> support in virtio pmd. Indeed, the virtio protocol gives the csum
> >>>> start and offset, but it does not give the L4 protocol nor it tells
> >>>> if the checksum is relevant for inner or outer. This information
> >>>> has to be known to properly set the ol_flags in mbuf.
> >>>>
> >>>> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> >>>> Signed-off-by: Jean Dao <jean.dao@6wind.com>
> >>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> >>>> ---
> >>>>   doc/guides/rel_notes/release_16_11.rst |   5 +
> >>>>   lib/librte_mbuf/Makefile               |   5 +-
> >>>>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
> >>>> +++++++++++++++++++++++++++++++++
> >>>>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
> >>>>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
> >>>>   5 files changed, 286 insertions(+), 2 deletions(-)
> >>>>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
> >>>>
> >>>> [...]
> >>>> +
> >>>> +/* parse mbuf data to get packet type */ uint32_t
> >>>> +rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
> >>>> +    struct rte_mbuf_hdr_lens *hdr_lens) {
> >>>> +    struct rte_mbuf_hdr_lens local_hdr_lens;
> >>>> +    const struct ether_hdr *eh;
> >>>> +    struct ether_hdr eh_copy;
> >>>> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
> >>>> +    uint32_t off = 0;
> >>>> +    uint16_t proto;
> >>>> +
> >>>> +    if (hdr_lens == NULL)
> >>>> +        hdr_lens = &local_hdr_lens;
> >>>> +
> >>>> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
> >>>> +    if (unlikely(eh == NULL))
> >>>> +        return 0;
> >>>> +    proto = eh->ether_type;
> >>>> +    off = sizeof(*eh);
> >>>> +    hdr_lens->l2_len = off;
> >>>> +
> >>>> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
> >>>> +        const struct ipv4_hdr *ip4h;
> >>>> +        struct ipv4_hdr ip4h_copy;
> >>>> +
> >>>> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
> >>>> +        if (unlikely(ip4h == NULL))
> >>>> +            return pkt_type;
> >>>> +
> >>>> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
> >>>> +        hdr_lens->l3_len = ip4_hlen(ip4h);
> >>>> +        off += hdr_lens->l3_len;
> >>>> +        if (ip4h->fragment_offset &
> >>>> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
> >>>> +                    IPV4_HDR_MF_FLAG)) {
> >>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >>>> +            hdr_lens->l4_len = 0;
> >>>> +            return pkt_type;
> >>>> +        }
> >>>> +        proto = ip4h->next_proto_id;
> >>>> +        pkt_type |= ptype_l4(proto);
> >>>> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
> >>>> +        const struct ipv6_hdr *ip6h;
> >>>> +        struct ipv6_hdr ip6h_copy;
> >>>> +        int frag = 0;
> >>>> +
> >>>> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
> >>>> +        if (unlikely(ip6h == NULL))
> >>>> +            return pkt_type;
> >>>> +
> >>>> +        proto = ip6h->proto;
> >>>> +        hdr_lens->l3_len = sizeof(*ip6h);
> >>>> +        off += hdr_lens->l3_len;
> >>>> +        pkt_type |= ptype_l3_ip6(proto);
> >>>> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT)
> {
> >>>> +            proto = skip_ip6_ext(proto, m, &off, &frag);
> >>>> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
> >>>> +        }
> >>>> +        if (proto == 0)
> >>>> +            return pkt_type;
> >>>> +        if (frag) {
> >>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >>>> +            hdr_lens->l4_len = 0;
> >>>> +            return pkt_type;
> >>>> +        }
> >>>> +        pkt_type |= ptype_l4(proto);
> >>>> +    }
> >>>> +
> >>>> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
> >>>> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
> >>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
> >>>> +        const struct tcp_hdr *th;
> >>>> +        struct tcp_hdr th_copy;
> >>>> +
> >>>> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
> >>>> +        if (unlikely(th == NULL))
> >>>> +            return pkt_type & (RTE_PTYPE_L2_MASK |
> >>>> +                RTE_PTYPE_L3_MASK);
> >>>> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
> >>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
> >>>> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
> >>>> +    } else {
> >>>> +        hdr_lens->l4_len = 0;
> >>>> +    }
> >>>> +
> >>>> +    return pkt_type;
> >>>> +}
> >>>> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> b/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> index 4a34678..f468520 100644
> >>>> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> @@ -545,6 +545,49 @@ extern "C" {
> >>>>           RTE_PTYPE_INNER_L3_MASK |                \
> >>>>           RTE_PTYPE_INNER_L4_MASK))
> >>>>   +struct rte_mbuf;
> >>>> +
> >>>> +/**
> >>>> + * Structure containing header lengths associated to a packet.
> >>>> + */
> >>>> +struct rte_mbuf_hdr_lens {
> >>>> +    uint8_t l2_len;
> >>>> +    uint8_t l3_len;
> >>>> +    uint8_t l4_len;
> >>>> +    uint8_t tunnel_len;
> >>>> +    uint8_t inner_l2_len;
> >>>> +    uint8_t inner_l3_len;
> >>>> +    uint8_t inner_l4_len;
> >>>> +};
> >>> [LC] The header parsing graph usually is not unique. The definition
> >>> maybe nice for the basic IP and L4 tunnel.
> >>> However it can't scale out to other cases, e.g. qinq, mac-in-mac,
> >>> mpls
> >>> l2/l3 tunnel.
> >>> The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
> >>> "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
> >>> In this case, the fixed function is to support below.
> >>>
> >>> + * Supported packet types are:
> >>> + *   L2: Ether
> >>> + *   L3: IPv4, IPv6
> >>> + *   L4: TCP, UDP, SCTP
> >>>
> >>> Of course, it can add more packet type detection logic in future.
> >>> But the more support, the higher the cost.
> >>>
> >>> One of the alternative way is to allow registering parser pair. APP
> >>> decides to choose the predefined scheme(by DPDK LIB), or to
> >>> self-define the parsing logic.
> >>> In this way, the scheme can do some assumption for the specific case
> >>> and ignore some useless graph detection.
> >>> In addition, besides the SW parser, the HW parser(identified by
> >>> packet_type in mbuf) can be turn on/off by leveraging the same manner.
> >>
> >> Sorry, I'm not sure I'm fully getting what you are saying. If I
> >> understand well, you would like to have something more flexible that
> >> supports the registration of protocol to be recognized?
> >>
> >> I'm not sure having a function with a dynamic registration method
> >> would really increase the performance compared to a static complete
> function.
> >> Actually, we will never support a tons of protocols since each layer
> >> packet type is 4 bits, and since it requires that at least one hw supports it.
> >
> > This patch will be very useful as a reference implementation, but it also
> highlights an issue with the current implementation of packet types reporting
> by HW and SW - as you just mentioned there are only 4 bits per each layer. As
> these 4 bit are used as a enumeration it is impossible to reports multiple
> headers located on the same layer. MPLS is one example, different packets
> could have different numbers of MPLS labels, but it is impossible to report
> using current packet_type structure.
> >
> > It is possible, however, to  program HW to report user (application) specific
> packet types. For example, for IPoMPLS with one MPLS label, HW will report
> packet type A, but for IPoMPLS with two MPLS labels HW will reports packet
> type B. In this case, instead of defining and supporting tons of statically defined
> (or enumerated) protocol headers combinations, application will register
> packet types it expects from HW in addition to standard packet types. At the
> moment we  have high bits of packet_type reserved, so one possible solution
> would be to use the highest bit to indicate that this is user defined packet_type,
> specific to the application. Then it could be used with HW and with SW parser.
> For example, packet_type 0x8000000A is IPoMPLS with one MPLS label,
> 0x8000000B is IPoMPLS with two MPLS labels and so on.
> 
> Thank you for the explanation. From your description, I wonder if the flow
> director API recently [1] proposed by Adrien wouldn't solve this issue?
> 
> [1] http://dpdk.org/ml/archives/dev/2016-July/043365.html

I'm reviewing Adrien's proposal (it is a big document :)) and will reply with my comments once I have finished reviewing it.

Regards,
Andrey  

> Regards,
> Olivier
  
Cunming Liang July 7, 2016, 8:19 a.m. UTC | #6
Hi Olivier,

> -----Original Message-----
> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> Sent: Wednesday, July 06, 2016 3:43 PM
> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
> from data
> 
> Hi Cunming,
> 
> On 07/06/2016 08:44 AM, Liang, Cunming wrote:
> > Hi Olivier,
> >
> > On 7/5/2016 11:41 PM, Olivier Matz wrote:
> >> Introduce the function rte_pktmbuf_get_ptype() that parses a
> >> mbuf and returns its packet type. For now, the following packet
> >> types are parsed:
> >>     L2: Ether
> >>     L3: IPv4, IPv6
> >>     L4: TCP, UDP, SCTP
> >>
> >> The goal here is to provide a reference implementation for packet type
> >> parsing. This function will be used by testpmd in next commits, allowing
> >> to compare its result with the value given by the hardware.
> >>
> >> This function will also be useful when implementing Rx offload support
> >> in virtio pmd. Indeed, the virtio protocol gives the csum start and
> >> offset, but it does not give the L4 protocol nor it tells if the
> >> checksum is relevant for inner or outer. This information has to be
> >> known to properly set the ol_flags in mbuf.
> >>
> >> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> >> Signed-off-by: Jean Dao <jean.dao@6wind.com>
> >> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> >> ---
> >>   doc/guides/rel_notes/release_16_11.rst |   5 +
> >>   lib/librte_mbuf/Makefile               |   5 +-
> >>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
> >> +++++++++++++++++++++++++++++++++
> >>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
> >>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
> >>   5 files changed, 286 insertions(+), 2 deletions(-)
> >>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
> >>
> >> [...]
> >> +
> >> +/* parse mbuf data to get packet type */
> >> +uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
> >> +    struct rte_mbuf_hdr_lens *hdr_lens)
> >> +{
> >> +    struct rte_mbuf_hdr_lens local_hdr_lens;
> >> +    const struct ether_hdr *eh;
> >> +    struct ether_hdr eh_copy;
> >> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
> >> +    uint32_t off = 0;
> >> +    uint16_t proto;
> >> +
> >> +    if (hdr_lens == NULL)
> >> +        hdr_lens = &local_hdr_lens;
> >> +
> >> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
> >> +    if (unlikely(eh == NULL))
> >> +        return 0;
> >> +    proto = eh->ether_type;
> >> +    off = sizeof(*eh);
> >> +    hdr_lens->l2_len = off;
> >> +
> >> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
> >> +        const struct ipv4_hdr *ip4h;
> >> +        struct ipv4_hdr ip4h_copy;
> >> +
> >> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
> >> +        if (unlikely(ip4h == NULL))
> >> +            return pkt_type;
> >> +
> >> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
> >> +        hdr_lens->l3_len = ip4_hlen(ip4h);
> >> +        off += hdr_lens->l3_len;
> >> +        if (ip4h->fragment_offset &
> >> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
> >> +                    IPV4_HDR_MF_FLAG)) {
> >> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >> +            hdr_lens->l4_len = 0;
> >> +            return pkt_type;
> >> +        }
> >> +        proto = ip4h->next_proto_id;
> >> +        pkt_type |= ptype_l4(proto);
> >> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
> >> +        const struct ipv6_hdr *ip6h;
> >> +        struct ipv6_hdr ip6h_copy;
> >> +        int frag = 0;
> >> +
> >> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
> >> +        if (unlikely(ip6h == NULL))
> >> +            return pkt_type;
> >> +
> >> +        proto = ip6h->proto;
> >> +        hdr_lens->l3_len = sizeof(*ip6h);
> >> +        off += hdr_lens->l3_len;
> >> +        pkt_type |= ptype_l3_ip6(proto);
> >> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
> >> +            proto = skip_ip6_ext(proto, m, &off, &frag);
> >> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
> >> +        }
> >> +        if (proto == 0)
> >> +            return pkt_type;
> >> +        if (frag) {
> >> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >> +            hdr_lens->l4_len = 0;
> >> +            return pkt_type;
> >> +        }
> >> +        pkt_type |= ptype_l4(proto);
> >> +    }
> >> +
> >> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
> >> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
> >> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
> >> +        const struct tcp_hdr *th;
> >> +        struct tcp_hdr th_copy;
> >> +
> >> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
> >> +        if (unlikely(th == NULL))
> >> +            return pkt_type & (RTE_PTYPE_L2_MASK |
> >> +                RTE_PTYPE_L3_MASK);
> >> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
> >> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
> >> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
> >> +    } else {
> >> +        hdr_lens->l4_len = 0;
> >> +    }
> >> +
> >> +    return pkt_type;
> >> +}
> >> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
> >> b/lib/librte_mbuf/rte_mbuf_ptype.h
> >> index 4a34678..f468520 100644
> >> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> >> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> >> @@ -545,6 +545,49 @@ extern "C" {
> >>           RTE_PTYPE_INNER_L3_MASK |                \
> >>           RTE_PTYPE_INNER_L4_MASK))
> >>   +struct rte_mbuf;
> >> +
> >> +/**
> >> + * Structure containing header lengths associated to a packet.
> >> + */
> >> +struct rte_mbuf_hdr_lens {
> >> +    uint8_t l2_len;
> >> +    uint8_t l3_len;
> >> +    uint8_t l4_len;
> >> +    uint8_t tunnel_len;
> >> +    uint8_t inner_l2_len;
> >> +    uint8_t inner_l3_len;
> >> +    uint8_t inner_l4_len;
> >> +};
> > [LC] The header parsing graph usually is not unique. The definition
> > maybe nice for the basic IP and L4 tunnel.
> > However it can't scale out to other cases, e.g. qinq, mac-in-mac, mpls
> > l2/l3 tunnel.
> > The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
> > "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
> > In this case, the fixed function is to support below.
> >
> > + * Supported packet types are:
> > + *   L2: Ether
> > + *   L3: IPv4, IPv6
> > + *   L4: TCP, UDP, SCTP
> >
> > Of course, it can add more packet type detection logic in future. But
> > the more support, the higher the cost.
> >
> > One of the alternative way is to allow registering parser pair. APP
> > decides to choose the predefined scheme(by DPDK LIB), or to self-define
> > the parsing logic.
> > In this way, the scheme can do some assumption for the specific case and
> > ignore some useless graph detection.
> > In addition, besides the SW parser, the HW parser(identified by
> > packet_type in mbuf) can be turn on/off by leveraging the same manner.
> 
> 
> Sorry, I'm not sure I'm fully getting what you are saying. If I
> understand well, you would like to have something more flexible that
> supports the registration of protocol to be recognized?
[LC] Not at that granularity, but at the level of the entire parsing routine.
rte_pktmbuf_get_ptype() would remain the common API, but it could present different behaviors.
Usually, different scenarios are interested in different sets of packets.
For a specific case, the parser can do some speculative pre-checking as an optimization.

> 
> I'm not sure having a function with a dynamic registration method would
> really increase the performance compared to a static complete function.
[LC] No, it won't. But the overhead is small; compare with rx_pkt_burst (which is also a callback).
If someone is only interested in IPv4-NoFrag-TCP streams, the easiest way may not be layer-by-layer detection.
The straightforward way may be: 1) load n bytes, 2) compare against a mask, 3) update ptype.
We do need a standard way to do SW detection, and the current version is perfect for that.
My point is that we can provide a simple mechanism to allow other ways, under the same unified API.

> Actually, we will never support a tons of protocols since each layer
> packet type is 4 bits, and since it requires that at least one hw
> supports it.
[LC] Agreed, that is the case today. But it may become dynamic in the future, with the packet type definition acting as a template.
> 
> As described in the cover letter, the 2 main goals of this patchset are
> to provide a reference implementation for packet type recognition, and
> enable the support of virtio offloads (I'll send the patchset soon).
> This function is adapted to these 2 usages. Are you thinking of another
> use-case that would not be covered?
[LC] That's excellent work. Furthermore, I believe it can actually cover all ethdevs.
When the HW can't report a requested packet type, it can fall back to your SW parser.
If the automatic switch can be transparent, that's perfect. Maybe via an rx callback that updates the ptype in the mbuf?

Thanks
> 
> Regards,
> Olivier
  
Olivier Matz July 7, 2016, 3:48 p.m. UTC | #7
Hi Cunming,

Thank you for your feedback.

On 07/07/2016 10:19 AM, Liang, Cunming wrote:
> Hi Olivier,
> 
>> -----Original Message-----
>> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
>> Sent: Wednesday, July 06, 2016 3:43 PM
>> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
>> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
>> from data
>>
>> Hi Cunming,
>>
>> On 07/06/2016 08:44 AM, Liang, Cunming wrote:
>>> Hi Olivier,
>>>
>>> On 7/5/2016 11:41 PM, Olivier Matz wrote:
>>>> Introduce the function rte_pktmbuf_get_ptype() that parses a
>>>> mbuf and returns its packet type. For now, the following packet
>>>> types are parsed:
>>>>     L2: Ether
>>>>     L3: IPv4, IPv6
>>>>     L4: TCP, UDP, SCTP
>>>>
>>>> The goal here is to provide a reference implementation for packet type
>>>> parsing. This function will be used by testpmd in next commits, allowing
>>>> to compare its result with the value given by the hardware.
>>>>
>>>> This function will also be useful when implementing Rx offload support
>>>> in virtio pmd. Indeed, the virtio protocol gives the csum start and
>>>> offset, but it does not give the L4 protocol nor it tells if the
>>>> checksum is relevant for inner or outer. This information has to be
>>>> known to properly set the ol_flags in mbuf.
>>>>
>>>> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
>>>> Signed-off-by: Jean Dao <jean.dao@6wind.com>
>>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
>>>> ---
>>>>   doc/guides/rel_notes/release_16_11.rst |   5 +
>>>>   lib/librte_mbuf/Makefile               |   5 +-
>>>>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
>>>> +++++++++++++++++++++++++++++++++
>>>>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
>>>>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
>>>>   5 files changed, 286 insertions(+), 2 deletions(-)
>>>>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
>>>>
>>>> [...]
>>>> +
>>>> +/* parse mbuf data to get packet type */
>>>> +uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
>>>> +    struct rte_mbuf_hdr_lens *hdr_lens)
>>>> +{
>>>> +    struct rte_mbuf_hdr_lens local_hdr_lens;
>>>> +    const struct ether_hdr *eh;
>>>> +    struct ether_hdr eh_copy;
>>>> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
>>>> +    uint32_t off = 0;
>>>> +    uint16_t proto;
>>>> +
>>>> +    if (hdr_lens == NULL)
>>>> +        hdr_lens = &local_hdr_lens;
>>>> +
>>>> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
>>>> +    if (unlikely(eh == NULL))
>>>> +        return 0;
>>>> +    proto = eh->ether_type;
>>>> +    off = sizeof(*eh);
>>>> +    hdr_lens->l2_len = off;
>>>> +
>>>> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
>>>> +        const struct ipv4_hdr *ip4h;
>>>> +        struct ipv4_hdr ip4h_copy;
>>>> +
>>>> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
>>>> +        if (unlikely(ip4h == NULL))
>>>> +            return pkt_type;
>>>> +
>>>> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
>>>> +        hdr_lens->l3_len = ip4_hlen(ip4h);
>>>> +        off += hdr_lens->l3_len;
>>>> +        if (ip4h->fragment_offset &
>>>> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
>>>> +                    IPV4_HDR_MF_FLAG)) {
>>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
>>>> +            hdr_lens->l4_len = 0;
>>>> +            return pkt_type;
>>>> +        }
>>>> +        proto = ip4h->next_proto_id;
>>>> +        pkt_type |= ptype_l4(proto);
>>>> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
>>>> +        const struct ipv6_hdr *ip6h;
>>>> +        struct ipv6_hdr ip6h_copy;
>>>> +        int frag = 0;
>>>> +
>>>> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
>>>> +        if (unlikely(ip6h == NULL))
>>>> +            return pkt_type;
>>>> +
>>>> +        proto = ip6h->proto;
>>>> +        hdr_lens->l3_len = sizeof(*ip6h);
>>>> +        off += hdr_lens->l3_len;
>>>> +        pkt_type |= ptype_l3_ip6(proto);
>>>> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
>>>> +            proto = skip_ip6_ext(proto, m, &off, &frag);
>>>> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
>>>> +        }
>>>> +        if (proto == 0)
>>>> +            return pkt_type;
>>>> +        if (frag) {
>>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
>>>> +            hdr_lens->l4_len = 0;
>>>> +            return pkt_type;
>>>> +        }
>>>> +        pkt_type |= ptype_l4(proto);
>>>> +    }
>>>> +
>>>> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
>>>> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
>>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
>>>> +        const struct tcp_hdr *th;
>>>> +        struct tcp_hdr th_copy;
>>>> +
>>>> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
>>>> +        if (unlikely(th == NULL))
>>>> +            return pkt_type & (RTE_PTYPE_L2_MASK |
>>>> +                RTE_PTYPE_L3_MASK);
>>>> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
>>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
>>>> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
>>>> +    } else {
>>>> +        hdr_lens->l4_len = 0;
>>>> +    }
>>>> +
>>>> +    return pkt_type;
>>>> +}
>>>> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> b/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> index 4a34678..f468520 100644
>>>> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
>>>> @@ -545,6 +545,49 @@ extern "C" {
>>>>           RTE_PTYPE_INNER_L3_MASK |                \
>>>>           RTE_PTYPE_INNER_L4_MASK))
>>>>   +struct rte_mbuf;
>>>> +
>>>> +/**
>>>> + * Structure containing header lengths associated to a packet.
>>>> + */
>>>> +struct rte_mbuf_hdr_lens {
>>>> +    uint8_t l2_len;
>>>> +    uint8_t l3_len;
>>>> +    uint8_t l4_len;
>>>> +    uint8_t tunnel_len;
>>>> +    uint8_t inner_l2_len;
>>>> +    uint8_t inner_l3_len;
>>>> +    uint8_t inner_l4_len;
>>>> +};
>>> [LC] The header parsing graph usually is not unique. The definition
>>> maybe nice for the basic IP and L4 tunnel.
>>> However it can't scale out to other cases, e.g. qinq, mac-in-mac, mpls
>>> l2/l3 tunnel.
>>> The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
>>> "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
>>> In this case, the fixed function is to support below.
>>>
>>> + * Supported packet types are:
>>> + *   L2: Ether
>>> + *   L3: IPv4, IPv6
>>> + *   L4: TCP, UDP, SCTP
>>>
>>> Of course, it can add more packet type detection logic in future. But
>>> the more support, the higher the cost.
>>>
>>> One of the alternative way is to allow registering parser pair. APP
>>> decides to choose the predefined scheme(by DPDK LIB), or to self-define
>>> the parsing logic.
>>> In this way, the scheme can do some assumption for the specific case and
>>> ignore some useless graph detection.
>>> In addition, besides the SW parser, the HW parser(identified by
>>> packet_type in mbuf) can be turn on/off by leveraging the same manner.
>>
>>
>> Sorry, I'm not sure I'm fully getting what you are saying. If I
>> understand well, you would like to have something more flexible that
>> supports the registration of protocol to be recognized?
> [LC] Not on that granularity, but on the entire parsing routine.
> rte_pktmbuf_get_ptype() as the common API, and can present in different behavior.
> Usually in different scenario, the interested packet set is different.
> For the specific case, can do some speculation pre-checking on the optimization perspective.
> 
>>
>> I'm not sure having a function with a dynamic registration method would
>> really increase the performance compared to a static complete function.
> [LC] No, it won't. But the overhead is not much, refer to rx_pkt_burst(is a callback either).
> If someone only interest for IPv4-NoFrag-TCP stream, the easiest way maybe not layer by layer detection.
> The straight forward way maybe, 1) load n bytes 2) compare mask 3) update ptype.
> We require a normal way to do SW detection, current version is perfect.
> My point is, we can provide a simple mechanism to allow other way, and under the same unified API.

Again, sorry, I'm not perfectly sure I understand what you are saying.

What you describe (mask packet data, then compare with a value) seems
very similar to what ovs does. Do you mean we should have an API for that?

I think once we have masked+compared the data, we may know much more
than just a packet_type.



> 
>> Actually, we will never support a tons of protocols since each layer
>> packet type is 4 bits, and since it requires that at least one hw
>> supports it.
> [LC] Agree, it is today. But maybe dynamic in future, packet type definition as a template.
>>
>> As described in the cover letter, the 2 main goals of this patchset are
>> to provide a reference implementation for packet type recognition, and
>> enable the support of virtio offloads (I'll send the patchset soon).
>> This function is adapted to these 2 usages. Are you thinking of another
>> use-case that would not be covered?
> [LC] That's excellent work.  Furthermore I believe it can cover all ethdev actually.
> When HW can't report some demand packet type, then fallback to your SW parser version.
> If the auto-switch can be transparent, that's perfect. Maybe rx callback and update ptype in mbuf?

I was also thinking about calling rte_pktmbuf_get_ptype() from a driver.
I think drivers should not access mbuf data if it's not absolutely
required.
Calling rte_pktmbuf_get_ptype() from inside an rx callback seems easily
feasible; it may be useful for applications that mostly rely on
packet_type to select an action.


Regards,
Olivier
  
Cunming Liang July 8, 2016, 10:08 a.m. UTC | #8
Hi Olivier,

> -----Original Message-----
> From: Olivier Matz [mailto:olivier.matz@6wind.com]
> Sent: Thursday, July 07, 2016 11:49 PM
> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
> from data
> 
> Hi Cunming,
> 
> Thank you for your feedback.
> 
> On 07/07/2016 10:19 AM, Liang, Cunming wrote:
> > Hi Olivier,
> >
> >> -----Original Message-----
> >> From: Olivier MATZ [mailto:olivier.matz@6wind.com]
> >> Sent: Wednesday, July 06, 2016 3:43 PM
> >> To: Liang, Cunming <cunming.liang@intel.com>; dev@dpdk.org
> >> Subject: Re: [dpdk-dev] [PATCH 05/18] mbuf: add function to get packet type
> >> from data
> >>
> >> Hi Cunming,
> >>
> >> On 07/06/2016 08:44 AM, Liang, Cunming wrote:
> >>> Hi Olivier,
> >>>
> >>> On 7/5/2016 11:41 PM, Olivier Matz wrote:
> >>>> Introduce the function rte_pktmbuf_get_ptype() that parses a
> >>>> mbuf and returns its packet type. For now, the following packet
> >>>> types are parsed:
> >>>>     L2: Ether
> >>>>     L3: IPv4, IPv6
> >>>>     L4: TCP, UDP, SCTP
> >>>>
> >>>> The goal here is to provide a reference implementation for packet type
> >>>> parsing. This function will be used by testpmd in next commits, allowing
> >>>> to compare its result with the value given by the hardware.
> >>>>
> >>>> This function will also be useful when implementing Rx offload support
> >>>> in virtio pmd. Indeed, the virtio protocol gives the csum start and
> >>>> offset, but it does not give the L4 protocol nor it tells if the
> >>>> checksum is relevant for inner or outer. This information has to be
> >>>> known to properly set the ol_flags in mbuf.
> >>>>
> >>>> Signed-off-by: Didier Pallard <didier.pallard@6wind.com>
> >>>> Signed-off-by: Jean Dao <jean.dao@6wind.com>
> >>>> Signed-off-by: Olivier Matz <olivier.matz@6wind.com>
> >>>> ---
> >>>>   doc/guides/rel_notes/release_16_11.rst |   5 +
> >>>>   lib/librte_mbuf/Makefile               |   5 +-
> >>>>   lib/librte_mbuf/rte_mbuf_ptype.c       | 234
> >>>> +++++++++++++++++++++++++++++++++
> >>>>   lib/librte_mbuf/rte_mbuf_ptype.h       |  43 ++++++
> >>>>   lib/librte_mbuf/rte_mbuf_version.map   |   1 +
> >>>>   5 files changed, 286 insertions(+), 2 deletions(-)
> >>>>   create mode 100644 lib/librte_mbuf/rte_mbuf_ptype.c
> >>>>
> >>>> [...]
> >>>> +
> >>>> +/* parse mbuf data to get packet type */
> >>>> +uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
> >>>> +    struct rte_mbuf_hdr_lens *hdr_lens)
> >>>> +{
> >>>> +    struct rte_mbuf_hdr_lens local_hdr_lens;
> >>>> +    const struct ether_hdr *eh;
> >>>> +    struct ether_hdr eh_copy;
> >>>> +    uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
> >>>> +    uint32_t off = 0;
> >>>> +    uint16_t proto;
> >>>> +
> >>>> +    if (hdr_lens == NULL)
> >>>> +        hdr_lens = &local_hdr_lens;
> >>>> +
> >>>> +    eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
> >>>> +    if (unlikely(eh == NULL))
> >>>> +        return 0;
> >>>> +    proto = eh->ether_type;
> >>>> +    off = sizeof(*eh);
> >>>> +    hdr_lens->l2_len = off;
> >>>> +
> >>>> +    if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
> >>>> +        const struct ipv4_hdr *ip4h;
> >>>> +        struct ipv4_hdr ip4h_copy;
> >>>> +
> >>>> +        ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
> >>>> +        if (unlikely(ip4h == NULL))
> >>>> +            return pkt_type;
> >>>> +
> >>>> +        pkt_type |= ptype_l3_ip(ip4h->version_ihl);
> >>>> +        hdr_lens->l3_len = ip4_hlen(ip4h);
> >>>> +        off += hdr_lens->l3_len;
> >>>> +        if (ip4h->fragment_offset &
> >>>> +                rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
> >>>> +                    IPV4_HDR_MF_FLAG)) {
> >>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >>>> +            hdr_lens->l4_len = 0;
> >>>> +            return pkt_type;
> >>>> +        }
> >>>> +        proto = ip4h->next_proto_id;
> >>>> +        pkt_type |= ptype_l4(proto);
> >>>> +    } else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
> >>>> +        const struct ipv6_hdr *ip6h;
> >>>> +        struct ipv6_hdr ip6h_copy;
> >>>> +        int frag = 0;
> >>>> +
> >>>> +        ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
> >>>> +        if (unlikely(ip6h == NULL))
> >>>> +            return pkt_type;
> >>>> +
> >>>> +        proto = ip6h->proto;
> >>>> +        hdr_lens->l3_len = sizeof(*ip6h);
> >>>> +        off += hdr_lens->l3_len;
> >>>> +        pkt_type |= ptype_l3_ip6(proto);
> >>>> +        if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
> >>>> +            proto = skip_ip6_ext(proto, m, &off, &frag);
> >>>> +            hdr_lens->l3_len = off - hdr_lens->l2_len;
> >>>> +        }
> >>>> +        if (proto == 0)
> >>>> +            return pkt_type;
> >>>> +        if (frag) {
> >>>> +            pkt_type |= RTE_PTYPE_L4_FRAG;
> >>>> +            hdr_lens->l4_len = 0;
> >>>> +            return pkt_type;
> >>>> +        }
> >>>> +        pkt_type |= ptype_l4(proto);
> >>>> +    }
> >>>> +
> >>>> +    if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
> >>>> +        hdr_lens->l4_len = sizeof(struct udp_hdr);
> >>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
> >>>> +        const struct tcp_hdr *th;
> >>>> +        struct tcp_hdr th_copy;
> >>>> +
> >>>> +        th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
> >>>> +        if (unlikely(th == NULL))
> >>>> +            return pkt_type & (RTE_PTYPE_L2_MASK |
> >>>> +                RTE_PTYPE_L3_MASK);
> >>>> +        hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
> >>>> +    } else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
> >>>> +        hdr_lens->l4_len = sizeof(struct sctp_hdr);
> >>>> +    } else {
> >>>> +        hdr_lens->l4_len = 0;
> >>>> +    }
> >>>> +
> >>>> +    return pkt_type;
> >>>> +}
> >>>> diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> b/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> index 4a34678..f468520 100644
> >>>> --- a/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> +++ b/lib/librte_mbuf/rte_mbuf_ptype.h
> >>>> @@ -545,6 +545,49 @@ extern "C" {
> >>>>           RTE_PTYPE_INNER_L3_MASK |                \
> >>>>           RTE_PTYPE_INNER_L4_MASK))
> >>>>   +struct rte_mbuf;
> >>>> +
> >>>> +/**
> >>>> + * Structure containing header lengths associated to a packet.
> >>>> + */
> >>>> +struct rte_mbuf_hdr_lens {
> >>>> +    uint8_t l2_len;
> >>>> +    uint8_t l3_len;
> >>>> +    uint8_t l4_len;
> >>>> +    uint8_t tunnel_len;
> >>>> +    uint8_t inner_l2_len;
> >>>> +    uint8_t inner_l3_len;
> >>>> +    uint8_t inner_l4_len;
> >>>> +};
> >>> [LC] The header parsing graph usually is not unique. The definition
> >>> maybe nice for the basic IP and L4 tunnel.
> >>> However it can't scale out to other cases, e.g. qinq, mac-in-mac, mpls
> >>> l2/l3 tunnel.
> >>> The parsing logic of "rte_pktmbuf_get_ptype()" and the definition of
> >>> "struct rte_mbuf_hdr_lens" consist a pair for one specific parser scheme.
> >>> In this case, the fixed function is to support below.
> >>>
> >>> + * Supported packet types are:
> >>> + *   L2: Ether
> >>> + *   L3: IPv4, IPv6
> >>> + *   L4: TCP, UDP, SCTP
> >>>
> >>> Of course, it can add more packet type detection logic in future. But
> >>> the more support, the higher the cost.
> >>>
> >>> One of the alternative way is to allow registering parser pair. APP
> >>> decides to choose the predefined scheme(by DPDK LIB), or to self-define
> >>> the parsing logic.
> >>> In this way, the scheme can do some assumption for the specific case and
> >>> ignore some useless graph detection.
> >>> In addition, besides the SW parser, the HW parser(identified by
> >>> packet_type in mbuf) can be turn on/off by leveraging the same manner.
> >>
> >>
> >> Sorry, I'm not sure I'm fully getting what you are saying. If I
> >> understand well, you would like to have something more flexible that
> >> supports the registration of protocol to be recognized?
> > [LC] Not on that granularity, but on the entire parsing routine.
> > rte_pktmbuf_get_ptype() as the common API, and can present in different
> behavior.
> > Usually in different scenario, the interested packet set is different.
> > For the specific case, can do some speculation pre-checking on the optimization
> perspective.
> >
> >>
> >> I'm not sure having a function with a dynamic registration method would
> >> really increase the performance compared to a static complete function.
> > [LC] No, it won't. But the overhead is not much, refer to rx_pkt_burst(is a
> callback either).
> > If someone only interest for IPv4-NoFrag-TCP stream, the easiest way maybe
> not layer by layer detection.
> > The straight forward way maybe, 1) load n bytes 2) compare mask 3) update
> ptype.
> > We require a normal way to do SW detection, current version is perfect.
> > My point is, we can provide a simple mechanism to allow other way, and under
> the same unified API.
> 
> Again, sorry, I'm not perfectly sure I understand what you are saying.
> 
> What you describe (mask packet data, then compare with a value) seems
> very similar to what ovs does. Do you mean we should have an API for that?
[LC] No. Sorry for the confusion.
If one function can detect all kinds of packets at low cost, that's perfect.
But the packet types of interest differ from case to case (IPDC, wireless, metro Ethernet, etc.).
Considering the possible tradeoff between performance and completeness,
allowing a dedicated parser tuned for a special purpose is an alternative.

> 
> I think once we have masked+compared the data, we may know much more
> than just a packet_type.
[LC] Detecting the packet layer by layer is the normal way, but in some cases it isn't necessary.
For example, assume a network that uses VXLAN-GPE.
Detecting the packet layer by layer requires two steps: checking the UDP port, then the VXLAN NP field.
In fact, by comparing the UDP+VXLAN headers (16B) as a whole against a mask once, you can tell whether or not it's VXLAN with an inner Ethernet frame.

It's probably not a perfect example. A SW parser is not cheap, and in some special cases there is room to optimize it. One possible approach, in pseudo code, is shown below.

/*
 * A packet type parser: a name plus the callback that does the parsing.
 * hdr_lens is passed as void *; each scheme below casts it to its own
 * header-lengths structure.
 */
struct rte_ptype_parser {
	char name[128];
	uint32_t (*get_ptype)(const struct rte_mbuf *m, void *hdr_lens);
};

struct rte_ptype_parser def_parser = 
{
	.name = "ipdc"; 
	.get_ptype = ipdc_get_ptype;
};

/*
 * Common entry point: delegates the parsing to the selected parser.
 * The layout behind hdr_lens is defined by that parser's scheme.
 */
uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
		void *hdr_lens)
{
	/* point at the parser instead of copying it, so that "->" is valid */
	const struct rte_ptype_parser *parser = &def_parser;

	[...]
	parser->get_ptype(m, hdr_lens);
	[...]
}

/*
 * scheme for ipdc: header-lengths layout that ipdc_get_ptype() casts its
 * hdr_lens argument to (same fields as struct rte_mbuf_hdr_lens in the patch)
 */
struct ipdc_hdr_lens {
	uint8_t l2_len;
	uint8_t l3_len;
	uint8_t l4_len;
	uint8_t tunnel_len;
	uint8_t inner_l2_len;
	uint8_t inner_l3_len;
	uint8_t inner_l4_len;
};
/* ipdc parser; hdr_lens is expected to point to a struct ipdc_hdr_lens */
uint32_t ipdc_get_ptype(const struct rte_mbuf *m, void *hdr_lens)
{
	/* hdr_lens is a pointer, so ihl has to be a pointer as well */
	struct ipdc_hdr_lens *ihl = hdr_lens;

	/* parser logic optimized for typical IP datacenter packet */
	[...]
}

/*
 * scheme for l2mpls: header-lengths layout that l2mpls_get_ptype() casts
 * its hdr_lens argument to
 */
struct l2mpls_hdr_lens {
	uint8_t l2_len;
	uint8_t mpls_len;            /* total length for multi-layer */
	uint8_t inner_l2_len;
	uint8_t inner_l3_len;
};
/* l2mpls parser; hdr_lens is expected to point to a struct l2mpls_hdr_lens */
uint32_t l2mpls_get_ptype(const struct rte_mbuf *m, void *hdr_lens)
{
	/* hdr_lens is a pointer, so ihl has to be a pointer as well */
	struct l2mpls_hdr_lens *ihl = hdr_lens;

	/* parser logic optimized for typical L2MPLS */
	[...]
}

> 
> 
> 
> >
> >> Actually, we will never support a tons of protocols since each layer
> >> packet type is 4 bits, and since it requires that at least one hw
> >> supports it.
> > [LC] Agree, it is today. But maybe dynamic in future, packet type definition as a
> template.
> >>
> >> As described in the cover letter, the 2 main goals of this patchset are
> >> to provide a reference implementation for packet type recognition, and
> >> enable the support of virtio offloads (I'll send the patchset soon).
> >> This function is adapted to these 2 usages. Are you thinking of another
> >> use-case that would not be covered?
> > [LC] That's excellent work.  Furthermore I believe it can cover all ethdev actually.
> > When HW can't report some demand packet type, then fallback to your SW
> parser version.
> > If the auto-switch can be transparent, that's perfect. Maybe rx callback and
> update ptype in mbuf?
> 
> I was also thinking about calling rte_pktmbuf_get_ptype() from a driver.
> I think drivers should not access to mbuf data if it's not absolutely
> required.
> Calling rte_pktmbuf_get_ptype() from inside a rx callback seems easily
> feasible, it may be useful for applications that mostly relies on
> packet_type to select an action.
> 
> 
> Regards,
> Olivier
  

Patch

diff --git a/doc/guides/rel_notes/release_16_11.rst b/doc/guides/rel_notes/release_16_11.rst
index 9b4d533..0ef8a87 100644
--- a/doc/guides/rel_notes/release_16_11.rst
+++ b/doc/guides/rel_notes/release_16_11.rst
@@ -39,6 +39,11 @@  New Features
   Added a new function ``rte_pktmbuf_read()`` to read the packet data from an
   mbuf chain, linearizing if required.
 
+* **Added a function to get the packet type from packet data.**
+
+  Added a new function ``rte_pktmbuf_get_ptype()`` to parse an Ethernet packet
+  in an mbuf chain and retrieve its packet type by software.
+
 Resolved Issues
 ---------------
 
diff --git a/lib/librte_mbuf/Makefile b/lib/librte_mbuf/Makefile
index 27e037c..15bbc78 100644
--- a/lib/librte_mbuf/Makefile
+++ b/lib/librte_mbuf/Makefile
@@ -41,12 +41,13 @@  EXPORT_MAP := rte_mbuf_version.map
 LIBABIVER := 2
 
 # all source are stored in SRCS-y
-SRCS-$(CONFIG_RTE_LIBRTE_MBUF) := rte_mbuf.c
+SRCS-$(CONFIG_RTE_LIBRTE_MBUF) := rte_mbuf.c rte_mbuf_ptype.c
 
 # install includes
 SYMLINK-$(CONFIG_RTE_LIBRTE_MBUF)-include := rte_mbuf.h rte_mbuf_ptype.h
 
 # this lib needs eal
-DEPDIRS-$(CONFIG_RTE_LIBRTE_MBUF) += lib/librte_eal lib/librte_mempool
+DEPDIRS-$(CONFIG_RTE_LIBRTE_MBUF) += lib/librte_eal lib/librte_mempool \
+                                     lib/librte_net
 
 include $(RTE_SDK)/mk/rte.lib.mk
diff --git a/lib/librte_mbuf/rte_mbuf_ptype.c b/lib/librte_mbuf/rte_mbuf_ptype.c
new file mode 100644
index 0000000..73284ae
--- /dev/null
+++ b/lib/librte_mbuf/rte_mbuf_ptype.c
@@ -0,0 +1,234 @@ 
+/*-
+ *   BSD LICENSE
+ *
+ *   Copyright 2016 6WIND S.A.
+ *   All rights reserved.
+ *
+ *   Redistribution and use in source and binary forms, with or without
+ *   modification, are permitted provided that the following conditions
+ *   are met:
+ *
+ *     * Redistributions of source code must retain the above copyright
+ *       notice, this list of conditions and the following disclaimer.
+ *     * Redistributions in binary form must reproduce the above copyright
+ *       notice, this list of conditions and the following disclaimer in
+ *       the documentation and/or other materials provided with the
+ *       distribution.
+ *     * Neither the name of Intel Corporation nor the names of its
+ *       contributors may be used to endorse or promote products derived
+ *       from this software without specific prior written permission.
+ *
+ *   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
+ *   "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+ *   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
+ *   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT
+ *   OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
+ *   SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
+ *   LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ *   DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ *   THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ *   (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+ *   OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+
+#include <stdint.h>
+
+#include <rte_mbuf.h>
+#include <rte_mbuf_ptype.h>
+#include <rte_byteorder.h>
+#include <rte_ether.h>
+#include <rte_ip.h>
+#include <rte_tcp.h>
+#include <rte_udp.h>
+#include <rte_sctp.h>
+
+/* get l3 packet type from ip6 next protocol */
+static uint32_t
+ptype_l3_ip6(uint8_t ip6_proto)
+{
+	static const uint32_t ip6_ext_proto_map[256] = {
+		[IPPROTO_HOPOPTS] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6,
+		[IPPROTO_ROUTING] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6,
+		[IPPROTO_FRAGMENT] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6,
+		[IPPROTO_ESP] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6,
+		[IPPROTO_AH] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6,
+		[IPPROTO_DSTOPTS] = RTE_PTYPE_L3_IPV6_EXT - RTE_PTYPE_L3_IPV6,
+	};
+
+	return RTE_PTYPE_L3_IPV6 + ip6_ext_proto_map[ip6_proto];
+}
+
+/* get l3 packet type from ip version and header length */
+static uint32_t
+ptype_l3_ip(uint8_t ipv_ihl)
+{
+	static const uint32_t ptype_l3_ip_proto_map[256] = {
+		[0x45] = RTE_PTYPE_L3_IPV4,
+		[0x46] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x47] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x48] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x49] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x4A] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x4B] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x4C] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x4D] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x4E] = RTE_PTYPE_L3_IPV4_EXT,
+		[0x4F] = RTE_PTYPE_L3_IPV4_EXT,
+	};
+
+	return ptype_l3_ip_proto_map[ipv_ihl];
+}
+
+/* get l4 packet type from proto */
+static uint32_t
+ptype_l4(uint8_t proto)
+{
+	static const uint32_t ptype_l4_proto[256] = {
+		[IPPROTO_UDP] = RTE_PTYPE_L4_UDP,
+		[IPPROTO_TCP] = RTE_PTYPE_L4_TCP,
+		[IPPROTO_SCTP] = RTE_PTYPE_L4_SCTP,
+	};
+
+	return ptype_l4_proto[proto];
+}
+
+/* get the ipv4 header length */
+static uint8_t
+ip4_hlen(const struct ipv4_hdr *hdr)
+{
+	return (hdr->version_ihl & 0xf) * 4;
+}
+
+/* parse ipv6 extension headers, update offset and return next proto */
+static uint16_t
+skip_ip6_ext(uint16_t proto, const struct rte_mbuf *m, uint32_t *off,
+	int *frag)
+{
+	struct ext_hdr {
+		uint8_t next_hdr;
+		uint8_t len;
+	};
+	const struct ext_hdr *xh;
+	struct ext_hdr xh_copy;
+	unsigned int i;
+
+	*frag = 0;
+
+#define MAX_EXT_HDRS 5
+	for (i = 0; i < MAX_EXT_HDRS; i++) {
+		switch (proto) {
+		case IPPROTO_HOPOPTS:
+		case IPPROTO_ROUTING:
+		case IPPROTO_DSTOPTS:
+			xh = rte_pktmbuf_read(m, *off, sizeof(*xh),
+				&xh_copy);
+			if (xh == NULL)
+				return 0;
+			*off += (xh->len + 1) * 8;
+			proto = xh->next_hdr;
+			break;
+		case IPPROTO_FRAGMENT:
+			xh = rte_pktmbuf_read(m, *off, sizeof(*xh),
+				&xh_copy);
+			if (xh == NULL)
+				return 0;
+			*off += 8;
+			proto = xh->next_hdr;
+			*frag = 1;
+			return proto; /* this is always the last ext hdr */
+		case IPPROTO_NONE:
+			return 0;
+		default:
+			return proto;
+		}
+	}
+	return 0;
+}
+
+/* parse mbuf data to get packet type */
+uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
+	struct rte_mbuf_hdr_lens *hdr_lens)
+{
+	struct rte_mbuf_hdr_lens local_hdr_lens;
+	const struct ether_hdr *eh;
+	struct ether_hdr eh_copy;
+	uint32_t pkt_type = RTE_PTYPE_L2_ETHER;
+	uint32_t off = 0;
+	uint16_t proto;
+
+	if (hdr_lens == NULL)
+		hdr_lens = &local_hdr_lens;
+
+	eh = rte_pktmbuf_read(m, off, sizeof(*eh), &eh_copy);
+	if (unlikely(eh == NULL))
+		return 0;
+	proto = eh->ether_type;
+	off = sizeof(*eh);
+	hdr_lens->l2_len = off;
+
+	if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv4)) {
+		const struct ipv4_hdr *ip4h;
+		struct ipv4_hdr ip4h_copy;
+
+		ip4h = rte_pktmbuf_read(m, off, sizeof(*ip4h), &ip4h_copy);
+		if (unlikely(ip4h == NULL))
+			return pkt_type;
+
+		pkt_type |= ptype_l3_ip(ip4h->version_ihl);
+		hdr_lens->l3_len = ip4_hlen(ip4h);
+		off += hdr_lens->l3_len;
+		if (ip4h->fragment_offset &
+				rte_cpu_to_be_16(IPV4_HDR_OFFSET_MASK |
+					IPV4_HDR_MF_FLAG)) {
+			pkt_type |= RTE_PTYPE_L4_FRAG;
+			hdr_lens->l4_len = 0;
+			return pkt_type;
+		}
+		proto = ip4h->next_proto_id;
+		pkt_type |= ptype_l4(proto);
+	} else if (proto == rte_cpu_to_be_16(ETHER_TYPE_IPv6)) {
+		const struct ipv6_hdr *ip6h;
+		struct ipv6_hdr ip6h_copy;
+		int frag = 0;
+
+		ip6h = rte_pktmbuf_read(m, off, sizeof(*ip6h), &ip6h_copy);
+		if (unlikely(ip6h == NULL))
+			return pkt_type;
+
+		proto = ip6h->proto;
+		hdr_lens->l3_len = sizeof(*ip6h);
+		off += hdr_lens->l3_len;
+		pkt_type |= ptype_l3_ip6(proto);
+		if ((pkt_type & RTE_PTYPE_L3_MASK) == RTE_PTYPE_L3_IPV6_EXT) {
+			proto = skip_ip6_ext(proto, m, &off, &frag);
+			hdr_lens->l3_len = off - hdr_lens->l2_len;
+		}
+		if (proto == 0)
+			return pkt_type;
+		if (frag) {
+			pkt_type |= RTE_PTYPE_L4_FRAG;
+			hdr_lens->l4_len = 0;
+			return pkt_type;
+		}
+		pkt_type |= ptype_l4(proto);
+	}
+
+	if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_UDP) {
+		hdr_lens->l4_len = sizeof(struct udp_hdr);
+	} else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_TCP) {
+		const struct tcp_hdr *th;
+		struct tcp_hdr th_copy;
+
+		th = rte_pktmbuf_read(m, off, sizeof(*th), &th_copy);
+		if (unlikely(th == NULL))
+			return pkt_type & (RTE_PTYPE_L2_MASK |
+				RTE_PTYPE_L3_MASK);
+		hdr_lens->l4_len = (th->data_off & 0xf0) >> 2;
+	} else if ((pkt_type & RTE_PTYPE_L4_MASK) == RTE_PTYPE_L4_SCTP) {
+		hdr_lens->l4_len = sizeof(struct sctp_hdr);
+	} else {
+		hdr_lens->l4_len = 0;
+	}
+
+	return pkt_type;
+}
diff --git a/lib/librte_mbuf/rte_mbuf_ptype.h b/lib/librte_mbuf/rte_mbuf_ptype.h
index 4a34678..f468520 100644
--- a/lib/librte_mbuf/rte_mbuf_ptype.h
+++ b/lib/librte_mbuf/rte_mbuf_ptype.h
@@ -545,6 +545,49 @@  extern "C" {
 		RTE_PTYPE_INNER_L3_MASK |				\
 		RTE_PTYPE_INNER_L4_MASK))
 
+struct rte_mbuf;
+
+/**
+ * Structure containing header lengths associated to a packet.
+ */
+struct rte_mbuf_hdr_lens {
+	uint8_t l2_len;
+	uint8_t l3_len;
+	uint8_t l4_len;
+	uint8_t tunnel_len;
+	uint8_t inner_l2_len;
+	uint8_t inner_l3_len;
+	uint8_t inner_l4_len;
+};
+
+/**
+ * Parse an Ethernet packet to get its packet type.
+ *
+ * This function parses the network headers in mbuf data and returns its
+ * packet type.
+ *
+ * If hdr_lens is not NULL, this function also fills the rte_mbuf_hdr_lens
+ * structure it points to with the lengths of the parsed network
+ * headers. Each length field is valid only if the associated packet
+ * type is set. For instance, hdr_lens->l2_len is valid only if
+ * (retval & RTE_PTYPE_L2_MASK) != RTE_PTYPE_UNKNOWN.
+ *
+ * Supported packet types are:
+ *   L2: Ether
+ *   L3: IPv4, IPv6
+ *   L4: TCP, UDP, SCTP
+ *
+ * @param m
+ *   The packet mbuf to be parsed.
+ * @param hdr_lens
+ *   A pointer to a structure where the header lengths will be returned,
+ *   or NULL.
+ * @return
+ *   The packet type of the packet.
+ */
+uint32_t rte_pktmbuf_get_ptype(const struct rte_mbuf *m,
+	struct rte_mbuf_hdr_lens *hdr_lens);
+
 #ifdef __cplusplus
 }
 #endif
diff --git a/lib/librte_mbuf/rte_mbuf_version.map b/lib/librte_mbuf/rte_mbuf_version.map
index 79e4dd8..416af8e 100644
--- a/lib/librte_mbuf/rte_mbuf_version.map
+++ b/lib/librte_mbuf/rte_mbuf_version.map
@@ -23,5 +23,6 @@  DPDK_16.11 {
 	global:
 
 	__rte_pktmbuf_read;
+	rte_pktmbuf_get_ptype;
 
 } DPDK_2.1;