diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index 44eeb5d61ba9f69dc5c65c9a389621b3102c4fad..3531558c7b518ca5a8c391f02ac45a859af0f41e 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -191,6 +191,13 @@ config MACVTAP To compile this driver as a module, choose M here: the module will be called macvtap. +config IPVLAN_L2E + bool "IP-VLAN L2E mode support" + depends on IPVLAN + default n + help + Support IP-VLAN L2E mode. + config IPVLAN_L3S depends on NETFILTER depends on IPVLAN diff --git a/drivers/net/ipvlan/ipvlan.h b/drivers/net/ipvlan/ipvlan.h index 025e0c19ec255295fb904d1bbba9b15e0ad8924e..4cf12fbe8f6244f2acb8e6620cfcaf90cf068c14 100644 --- a/drivers/net/ipvlan/ipvlan.h +++ b/drivers/net/ipvlan/ipvlan.h @@ -39,6 +39,11 @@ #define IPVLAN_QBACKLOG_LIMIT 1000 +#if IS_ENABLED(CONFIG_IPVLAN_L2E) +extern int sysctl_ipvlan_loop_qlen; +extern int sysctl_ipvlan_loop_delay; +#endif + typedef enum { IPVL_IPV6 = 0, IPVL_ICMPV6, @@ -70,6 +75,12 @@ struct ipvl_dev { netdev_features_t sfeatures; u32 msg_enable; spinlock_t addrs_lock; +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + int local_packets_cached; + unsigned long local_timeout; + struct timer_list local_free_timer; + struct sk_buff_head local_xmit_queue; +#endif }; struct ipvl_addr { diff --git a/drivers/net/ipvlan/ipvlan_core.c b/drivers/net/ipvlan/ipvlan_core.c index 2d5b021b4ea6053eeb055a76fa4c7d9380cd2a53..268a95642e37607c62acfaa8af5ddbba16d8281a 100644 --- a/drivers/net/ipvlan/ipvlan_core.c +++ b/drivers/net/ipvlan/ipvlan_core.c @@ -547,6 +547,124 @@ static int ipvlan_process_outbound(struct sk_buff *skb) return ret; } +#if IS_ENABLED(CONFIG_IPVLAN_L2E) +static int ipvlan_process_v4_forward(struct sk_buff *skb) +{ + const struct iphdr *ip4h = ip_hdr(skb); + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct rtable *rt; + int err, ret = NET_XMIT_DROP; + struct flowi4 fl4 = { + .flowi4_tos = RT_TOS(ip4h->tos), + .flowi4_flags = FLOWI_FLAG_ANYSRC, + .flowi4_mark = skb->mark, + .daddr = ip4h->daddr, + .saddr = ip4h->saddr, + }; + + rt = ip_route_output_flow(net, &fl4, NULL); + if (IS_ERR(rt)) + goto err; + + if (rt->rt_type != RTN_UNICAST && rt->rt_type != RTN_LOCAL) { + ip_rt_put(rt); + goto err; + } + skb_dst_set(skb, &rt->dst); + err = ip_local_out(net, skb->sk, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} + +#if IS_ENABLED(CONFIG_IPV6) +static int ipvlan_process_v6_forward(struct sk_buff *skb) +{ + const struct ipv6hdr *ip6h = ipv6_hdr(skb); + struct net_device *dev = skb->dev; + struct net *net = dev_net(dev); + struct dst_entry *dst; + int err, ret = NET_XMIT_DROP; + struct flowi6 fl6 = { + .daddr = ip6h->daddr, + .saddr = ip6h->saddr, + .flowi6_flags = FLOWI_FLAG_ANYSRC, + .flowlabel = ip6_flowinfo(ip6h), + .flowi6_mark = skb->mark, + .flowi6_proto = ip6h->nexthdr, + }; + + dst = ip6_route_output(net, NULL, &fl6); + if (dst->error) { + ret = dst->error; + dst_release(dst); + goto err; + } + skb_dst_set(skb, dst); + err = ip6_local_out(net, skb->sk, skb); + if (unlikely(net_xmit_eval(err))) + dev->stats.tx_errors++; + else + ret = NET_XMIT_SUCCESS; + goto out; +err: + dev->stats.tx_errors++; + kfree_skb(skb); +out: + return ret; +} +#else +static int ipvlan_process_v6_forward(struct sk_buff *skb) +{ + return NET_XMIT_DROP; +} +#endif + +static int ipvlan_process_forward(struct sk_buff *skb) +{ + struct ethhdr *ethh = eth_hdr(skb); + int ret = NET_XMIT_DROP; + + /* In this mode we dont care about multicast and broadcast traffic */ + if (is_multicast_ether_addr(ethh->h_dest)) { + pr_debug_ratelimited("Dropped {multi|broad}cast of type=[%x]\n", + ntohs(skb->protocol)); + kfree_skb(skb); + goto out; + } + + /* The ipvlan is a pseudo-L2 device, so the packets that we receive + * will have L2; which need to discarded and processed further + * in the net-ns of the main-device. + */ + if (skb_mac_header_was_set(skb)) { + skb_pull(skb, sizeof(*ethh)); + skb->mac_header = (typeof(skb->mac_header))~0U; + skb_reset_network_header(skb); + } + + if (skb->protocol == htons(ETH_P_IPV6)) { + ret = ipvlan_process_v6_forward(skb); + } else if (skb->protocol == htons(ETH_P_IP)) { + ret = ipvlan_process_v4_forward(skb); + } else { + pr_warn_ratelimited("Dropped outbound packet type=%x\n", + ntohs(skb->protocol)); + kfree_skb(skb); + } +out: + return ret; +} +#endif + static void ipvlan_multicast_enqueue(struct ipvl_port *port, struct sk_buff *skb, bool tx_pkt) { @@ -647,6 +765,80 @@ static int ipvlan_xmit_mode_l2(struct sk_buff *skb, struct net_device *dev) return dev_queue_xmit(skb); } +#if IS_ENABLED(CONFIG_IPVLAN_L2E) +static int ipvlan_l2e_local_xmit_event(struct ipvl_dev *ipvlan, + struct sk_buff **pskb) +{ + struct sk_buff *nskb, *tskb; + + while ((ipvlan->local_packets_cached >= sysctl_ipvlan_loop_qlen) && + (tskb = skb_dequeue(&ipvlan->local_xmit_queue))) { + ipvlan->local_packets_cached -= tskb->truesize; + if (ipvlan->local_packets_cached < 0 || + skb_queue_empty(&ipvlan->local_xmit_queue)) + ipvlan->local_packets_cached = 0; + kfree_skb(tskb); + } + + nskb = skb_clone(*pskb, GFP_ATOMIC); + if (!nskb) + return NET_XMIT_DROP; + + ipvlan->local_timeout = jiffies + + (sysctl_ipvlan_loop_delay * HZ) / 1000; + mod_timer(&ipvlan->local_free_timer, ipvlan->local_timeout); + skb_queue_tail(&ipvlan->local_xmit_queue, *pskb); + ipvlan->local_packets_cached += (*pskb)->truesize; + *pskb = nskb; + + return 0; +} + +static int ipvlan_xmit_mode_l2e(struct sk_buff *skb, struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + struct ethhdr *eth = eth_hdr(skb); + struct ipvl_addr *addr; + void *lyr3h; + int addr_type; + + if (!ipvlan_is_vepa(ipvlan->port) && + ether_addr_equal(eth->h_dest, eth->h_source)) { + lyr3h = ipvlan_get_L3_hdr(ipvlan->port, skb, &addr_type); + if (lyr3h) { + addr = ipvlan_addr_lookup(ipvlan->port, lyr3h, + addr_type, true); + if (addr) { + if (ipvlan_is_private(ipvlan->port)) { + consume_skb(skb); + return NET_XMIT_DROP; + } + + if (unlikely(ipvlan_l2e_local_xmit_event(ipvlan, + &skb))) + return NET_XMIT_DROP; + return ipvlan_rcv_frame(addr, &skb, true); + } + } + skb = skb_share_check(skb, GFP_ATOMIC); + if (!skb) + return NET_XMIT_DROP; + + /* maybe the packet need been forward */ + skb->dev = ipvlan->phy_dev; + ipvlan_skb_crossing_ns(skb, ipvlan->phy_dev); + return ipvlan_process_forward(skb); + } else if (is_multicast_ether_addr(eth->h_dest)) { + ipvlan_skb_crossing_ns(skb, NULL); + ipvlan_multicast_enqueue(ipvlan->port, skb, true); + return NET_XMIT_SUCCESS; + } + + skb->dev = ipvlan->phy_dev; + return dev_queue_xmit(skb); +} +#endif + int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); @@ -666,6 +858,10 @@ int ipvlan_queue_xmit(struct sk_buff *skb, struct net_device *dev) case IPVLAN_MODE_L3S: #endif return ipvlan_xmit_mode_l3(skb, dev); +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + case IPVLAN_MODE_L2E: + return ipvlan_xmit_mode_l2e(skb, dev); +#endif } /* Should not reach here */ @@ -746,6 +942,38 @@ static rx_handler_result_t ipvlan_handle_mode_l2(struct sk_buff **pskb, return ret; } +#if IS_ENABLED(CONFIG_IPVLAN_L2E) +static rx_handler_result_t ipvlan_handle_mode_l2e(struct sk_buff **pskb, + struct ipvl_port *port) +{ + struct sk_buff *skb = *pskb; + struct ethhdr *eth = eth_hdr(skb); + rx_handler_result_t ret = RX_HANDLER_PASS; + + if (is_multicast_ether_addr(eth->h_dest)) { + if (ipvlan_external_frame(skb, port)) { + struct sk_buff *nskb = skb_clone(skb, GFP_ATOMIC); + + /* External frames are queued for device local + * distribution, but a copy is given to master + * straight away to avoid sending duplicates later + * when work-queue processes this frame. This is + * achieved by returning RX_HANDLER_PASS. + */ + if (nskb) { + ipvlan_skb_crossing_ns(nskb, NULL); + ipvlan_multicast_enqueue(port, nskb, false); + } + } + } else { + /* Perform like l3 mode for non-multicast packet */ + ret = ipvlan_handle_mode_l3(pskb, port); + } + + return ret; +} +#endif + rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) { struct sk_buff *skb = *pskb; @@ -762,6 +990,10 @@ rx_handler_result_t ipvlan_handle_frame(struct sk_buff **pskb) #ifdef CONFIG_IPVLAN_L3S case IPVLAN_MODE_L3S: return RX_HANDLER_PASS; +#endif +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + case IPVLAN_MODE_L2E: + return ipvlan_handle_mode_l2e(pskb, port); #endif } diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c index 57c79f5f29916b66154da2c8ff10238ef2c02584..c273b0ff1f9d9d84477d6b63a5638ad72d0d1c25 100644 --- a/drivers/net/ipvlan/ipvlan_main.c +++ b/drivers/net/ipvlan/ipvlan_main.c @@ -6,6 +6,55 @@ #include "ipvlan.h" +#if IS_ENABLED(CONFIG_IPVLAN_L2E) +static int one = 1; +static int delay_max = 100; +/* set loop queue length from 0 to 10 big packets(65536) */ +static int qlen_min; +static int qlen_max = 655360; + +int sysctl_ipvlan_loop_qlen = 131072; +int sysctl_ipvlan_loop_delay = 10; +static int ipvlan_default_mode = IPVLAN_MODE_L3; +module_param(ipvlan_default_mode, int, 0400); +MODULE_PARM_DESC(ipvlan_default_mode, "set ipvlan default mode: 0 for l2, 1 for l3, 2 for l2e, 3 for l3s, others invalid now"); + +static struct ctl_table_header *ipvlan_table_hrd; +static struct ctl_table ipvlan_table[] = { + { + .procname = "loop_delay", + .data = &sysctl_ipvlan_loop_delay, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &one, + .extra2 = &delay_max, + }, + { + .procname = "loop_qlen", + .data = &sysctl_ipvlan_loop_qlen, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = &qlen_min, + .extra2 = &qlen_max, + }, + { } +}; + +static int ipvlan_sysctl_init(void) +{ + ipvlan_table_hrd = register_net_sysctl(&init_net, + "net/ipvlan", ipvlan_table); + return !ipvlan_table_hrd ? -ENOMEM : 0; +} + +static void ipvlan_sysctl_exit(void) +{ + unregister_net_sysctl_table(ipvlan_table_hrd); +} +#endif + static int ipvlan_set_port_mode(struct ipvl_port *port, u16 nval, struct netlink_ext_ack *extack) { @@ -162,6 +211,34 @@ static int ipvlan_init(struct net_device *dev) return 0; } +#if IS_ENABLED(CONFIG_IPVLAN_L2E) +static void ipvlan_local_free_handler(struct timer_list *t) +{ + struct ipvl_dev *ipvlan = from_timer(ipvlan, t, local_free_timer); + + skb_queue_purge(&ipvlan->local_xmit_queue); + ipvlan->local_packets_cached = 0; +} + +static inline void ipvlan_local_init(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + ipvlan->local_packets_cached = 0; + skb_queue_head_init(&ipvlan->local_xmit_queue); + timer_setup(&ipvlan->local_free_timer, + ipvlan_local_free_handler, 0); +} + +static inline void ipvlan_local_uninit(struct net_device *dev) +{ + struct ipvl_dev *ipvlan = netdev_priv(dev); + + del_timer(&ipvlan->local_free_timer); + skb_queue_purge(&ipvlan->local_xmit_queue); +} +#endif + static void ipvlan_uninit(struct net_device *dev) { struct ipvl_dev *ipvlan = netdev_priv(dev); @@ -187,6 +264,9 @@ static int ipvlan_open(struct net_device *dev) else dev->flags &= ~IFF_NOARP; +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + ipvlan_local_init(dev); +#endif rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_add(ipvlan, addr); @@ -203,7 +283,9 @@ static int ipvlan_stop(struct net_device *dev) dev_uc_unsync(phy_dev, dev); dev_mc_unsync(phy_dev, dev); - +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + ipvlan_local_uninit(dev); +#endif rcu_read_lock(); list_for_each_entry_rcu(addr, &ipvlan->addrs, anode) ipvlan_ht_addr_del(addr); @@ -538,7 +620,11 @@ int ipvlan_link_new(struct net *src_net, struct net_device *dev, struct ipvl_port *port; struct net_device *phy_dev; int err; +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + u16 mode = ipvlan_default_mode; +#else u16 mode = IPVLAN_MODE_L3; +#endif if (!tb[IFLA_LINK]) return -EINVAL; @@ -1026,6 +1112,12 @@ static int __init ipvlan_init_module(void) { int err; +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + if (ipvlan_default_mode >= IPVLAN_MODE_MAX || + ipvlan_default_mode < IPVLAN_MODE_L2) + return -EINVAL; +#endif + ipvlan_init_secret(); register_netdevice_notifier(&ipvlan_notifier_block); #if IS_ENABLED(CONFIG_IPV6) @@ -1045,7 +1137,11 @@ static int __init ipvlan_init_module(void) ipvlan_l3s_cleanup(); goto error; } - +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + err = ipvlan_sysctl_init(); + if (err < 0) + pr_err("ipvlan proc init failed, continue\n"); +#endif return 0; error: unregister_inetaddr_notifier(&ipvlan_addr4_notifier_block); @@ -1073,6 +1169,9 @@ static void __exit ipvlan_cleanup_module(void) unregister_inet6addr_validator_notifier( &ipvlan_addr6_vtor_notifier_block); #endif +#if IS_ENABLED(CONFIG_IPVLAN_L2E) + ipvlan_sysctl_exit(); +#endif } module_init(ipvlan_init_module); diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h index ce3117df9cec2948203319c64f62c0af99d58c01..96ecfe1709e0b532bd454c934de7108a0ea63210 100644 --- a/include/uapi/linux/if_link.h +++ b/include/uapi/linux/if_link.h @@ -741,6 +741,7 @@ enum { enum ipvlan_mode { IPVLAN_MODE_L2 = 0, IPVLAN_MODE_L3, + IPVLAN_MODE_L2E, IPVLAN_MODE_L3S, IPVLAN_MODE_MAX };