diff --git a/Documentation/networking/ethtool-netlink.rst b/Documentation/networking/ethtool-netlink.rst index 30b98245979fdc848842e125cace2d687aeeecc8..c0a431860410409e27ac2af08e6663fa6419eee7 100644 --- a/Documentation/networking/ethtool-netlink.rst +++ b/Documentation/networking/ethtool-netlink.rst @@ -928,6 +928,8 @@ Kernel response contents: ``ETHTOOL_A_COALESCE_TX_USECS_HIGH`` u32 delay (us), high Tx ``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH`` u32 max packets, high Tx ``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL`` u32 rate sampling interval + ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx + ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx =========================================== ====== ======================= Attributes are only included in reply if their value is not zero or the @@ -966,6 +968,8 @@ Request contents: ``ETHTOOL_A_COALESCE_TX_USECS_HIGH`` u32 delay (us), high Tx ``ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH`` u32 max packets, high Tx ``ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL`` u32 rate sampling interval + ``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx + ``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx =========================================== ====== ======================= Request is rejected if it attributes declared as unsupported by driver (i.e. diff --git a/Documentation/networking/net_dim.rst b/Documentation/networking/net_dim.rst index 3bed9fd953360d3ceb92f0a74a17fbad2ddd256a..8908fd7b0a8d23e7891b5dba5382e498f8854bb3 100644 --- a/Documentation/networking/net_dim.rst +++ b/Documentation/networking/net_dim.rst @@ -169,6 +169,48 @@ usage is not complete but it should make the outline of the usage clear. ... } + +Tuning DIM +========== + +Net DIM serves a range of network devices and delivers excellent acceleration +benefits. 
Yet, it has been observed that some preset configurations of DIM may +not align seamlessly with the varying specifications of network devices, and +this discrepancy has been identified as a factor in the suboptimal performance +outcomes of DIM-enabled network devices, related to a mismatch in profiles. + +To address this issue, Net DIM introduces a per-device control to modify and +access a device's ``rx-profile`` and ``tx-profile`` parameters: +Assume that the target network device is named ethx, and ethx only declares +support for RX profile setting and supports modification of the ``usec`` field +and the ``pkts`` field (See the data structure: +:c:type:`struct dim_cq_moder <dim_cq_moder>`). + +You can use ethtool to modify the current RX DIM profile where all +values are 64:: + + $ ethtool -C ethx rx-profile 1,1,n_2,2,n_3,n,n_n,4,n_n,n,n + +``n`` means do not modify this field, and ``_`` separates structure +elements of the profile array. + +Query the current profiles using:: + + $ ethtool -c ethx + ... + rx-profile: + {.usec = 1, .pkts = 1, .comps = n/a,}, + {.usec = 2, .pkts = 2, .comps = n/a,}, + {.usec = 3, .pkts = 64, .comps = n/a,}, + {.usec = 64, .pkts = 4, .comps = n/a,}, + {.usec = 64, .pkts = 64, .comps = n/a,} + tx-profile: n/a + +If the network device does not support specific fields of DIM profiles, +``n/a`` is displayed for the corresponding field. If an ``n/a`` field is +modified, an error is reported. + + Dynamic Interrupt Moderation (DIM) library API ============================================== diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 2ebc36adecffc33fccab01f10d751df0f7b412b8..c716037e55c76dcaf4d1ad3669debe0755c3e549 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -52,6 +52,9 @@ module_param(napi_tx, bool, 0644); module_param(force_xdp, bool, 0644); module_param(lro, bool, 0644); +#define VIRTNET_DIM_TUNE_TRAFFIC 1 +#define VIRTNET_DIM_NEVENTS 128 + /* FIXME: MTU in config. 
*/ #define GOOD_PACKET_LEN (ETH_HLEN + VLAN_HLEN + ETH_DATA_LEN) #define GOOD_COPY_LEN 128 @@ -257,6 +260,14 @@ struct virtnet_interrupt_coalesce { u32 max_usecs; }; +struct virtnet_coal_node { + struct virtio_net_ctrl_hdr hdr; + virtio_net_ctrl_ack status; + struct virtio_net_ctrl_coal_vq coal_vqs; + bool is_wait; + struct list_head list; +}; + /* Internal representation of a send virtqueue */ struct send_queue { /* Virtqueue associated with this send _queue */ @@ -324,9 +335,6 @@ struct receive_queue { /* Is dynamic interrupt moderation enabled? */ bool dim_enabled; - /* Used to protect dim_enabled and inter_coal */ - struct mutex dim_lock; - /* Dynamic Interrupt Moderation */ struct dim dim; @@ -464,6 +472,18 @@ struct virtnet_info { struct virtnet_interrupt_coalesce intr_coal_tx; struct virtnet_interrupt_coalesce intr_coal_rx; + /* Used by dim cmds for concurrent delivery */ + int dim_cmd_nums; + struct delayed_work get_cvq; + + /* Free nodes for dim filled by rx_dim_work. */ + struct mutex coal_free_lock; + struct list_head coal_free_list; + + /* Filled when there are no free nodes for dim. 
*/ + struct mutex coal_wait_lock; + struct list_head coal_wait_list; + unsigned long guest_offloads; unsigned long guest_offloads_capable; @@ -1705,6 +1725,143 @@ static bool try_fill_recv(struct virtnet_info *vi, struct receive_queue *rq, return !oom; } +static void __virtnet_add_dim_command(struct virtnet_info *vi, + struct virtnet_coal_node *ctrl) +{ + struct scatterlist *sgs[4], hdr, stat, out; + unsigned int out_num = 0; + int ret; + + BUG_ON(!virtio_has_feature(vi->vdev, VIRTIO_NET_F_CTRL_VQ)); + + ctrl->hdr.class = VIRTIO_NET_CTRL_NOTF_COAL; + ctrl->hdr.cmd = VIRTIO_NET_CTRL_NOTF_COAL_VQ_SET; + + sg_init_one(&hdr, &ctrl->hdr, sizeof(ctrl->hdr)); + sgs[out_num++] = &hdr; + + sg_init_one(&out, &ctrl->coal_vqs, sizeof(ctrl->coal_vqs)); + sgs[out_num++] = &out; + + ctrl->status = VIRTIO_NET_OK; + sg_init_one(&stat, &ctrl->status, sizeof(ctrl->status)); + sgs[out_num] = &stat; + + BUG_ON(out_num + 1 > ARRAY_SIZE(sgs)); + ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, 1, ctrl, GFP_ATOMIC); + if (ret < 0) { + dev_warn(&vi->vdev->dev, "Failed to add sgs for command vq: %d\n.", ret); + return; + } + + virtqueue_kick(vi->cvq); + vi->dim_cmd_nums++; +} + +static void virtnet_add_dim_command(struct virtnet_info *vi, + struct virtnet_coal_node *ctrl) +{ + mutex_lock(&vi->cvq_lock); + __virtnet_add_dim_command(vi, ctrl); + mutex_unlock(&vi->cvq_lock); +} + +static void virtnet_process_dim_cmd(struct virtnet_info *vi, void *res) +{ + struct virtnet_coal_node *node; + u16 qnum; + + node = (struct virtnet_coal_node *)res; + qnum = le16_to_cpu(node->coal_vqs.vqn) / 2; + + vi->rq[qnum].intr_coal.max_usecs = + le32_to_cpu(node->coal_vqs.coal.max_usecs); + vi->rq[qnum].intr_coal.max_packets = + le32_to_cpu(node->coal_vqs.coal.max_packets); + vi->rq[qnum].dim.state = DIM_START_MEASURE; + + if (!node->is_wait) { + mutex_lock(&vi->coal_free_lock); + list_add(&node->list, &vi->coal_free_list); + mutex_unlock(&vi->coal_free_lock); + } else { + kfree(node); + } + + vi->dim_cmd_nums--; 
+} + +/** + * virtnet_cvq_response - get the response for filled ctrlq requests + * @poll: keep polling ctrlq when a NULL buffer is obtained. + * @dim_oneshot: process a dim cmd then exit, excluding user commands. + * + * Note that user commands must be processed synchronously + * (poll = true, dim_oneshot = false). + */ +static void virtnet_cvq_response(struct virtnet_info *vi, + bool poll, + bool dim_oneshot) +{ + unsigned tmp; + void *res; + + while (true) { + res = virtqueue_get_buf(vi->cvq, &tmp); + if (virtqueue_is_broken(vi->cvq)) { + dev_warn(&vi->dev->dev, "Control vq is broken.\n"); + return; + } + + if (!res) { + if (!poll) + return; + + cpu_relax(); + continue; + } + + /* this does not occur inside the process of waiting dim */ + if (res == ((void *)vi)) + return; + + virtnet_process_dim_cmd(vi, res); + /* When it is a user command, we must wait until the + * processing result is processed synchronously. + */ + if (dim_oneshot) + return; + } +} + +static void virtnet_get_cvq_work(struct work_struct *work) +{ + struct virtnet_info *vi = + container_of(work, struct virtnet_info, get_cvq.work); + struct virtnet_coal_node *wait_coal; + + mutex_lock(&vi->cvq_lock); + + virtnet_cvq_response(vi, false, false); + + if (vi->cvq->num_free >= 3) { + mutex_lock(&vi->coal_wait_lock); + while (!list_empty(&vi->coal_wait_list)) { + wait_coal = list_first_entry(&vi->coal_wait_list, + struct virtnet_coal_node, + list); + list_del(&wait_coal->list); + __virtnet_add_dim_command(vi, wait_coal); + } + mutex_unlock(&vi->coal_wait_lock); + } + + if (vi->dim_cmd_nums) + schedule_delayed_work(&vi->get_cvq, 1); + + mutex_unlock(&vi->cvq_lock); +} + static void skb_recv_done(struct virtqueue *rvq) { struct virtnet_info *vi = rvq->vdev->priv; @@ -1979,7 +2136,8 @@ static void virtnet_rx_dim_update(struct virtnet_info *vi, struct receive_queue &cur_sample); u64_stats_update_end(&rq->stats.syncp); - net_dim(&rq->dim, cur_sample); + net_dim_tune(&rq->dim, cur_sample, 
VIRTNET_DIM_NEVENTS, + VIRTNET_DIM_TUNE_TRAFFIC); rq->packets_in_napi = 0; } @@ -2001,10 +2159,6 @@ static int virtnet_poll(struct napi_struct *napi, int budget) /* Out of packets? */ if (received < budget) { napi_complete = virtqueue_napi_complete(napi, rq->vq, received); - /* Intentionally not taking dim_lock here. This may result in a - * spurious net_dim call. But if that happens virtnet_rx_dim_work - * will not act on the scheduled work. - */ if (napi_complete && rq->dim_enabled) virtnet_rx_dim_update(vi, rq); } @@ -2232,7 +2386,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi, if (running) { napi_disable(&rq->napi); - cancel_work_sync(&rq->dim.work); + net_dim_work_cancel(&rq->dim); } err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_free_unused_buf); @@ -2298,7 +2452,7 @@ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd struct scatterlist *in) { struct scatterlist *sgs[5], hdr, stat; - u32 out_num = 0, tmp, in_num = 0; + u32 out_num = 0, in_num = 0; int ret; /* Caller should know better */ @@ -2324,6 +2478,10 @@ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd BUG_ON(out_num + in_num > ARRAY_SIZE(sgs)); ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC); + if (ret == -ENOSPC) { + virtnet_cvq_response(vi, true, true); + ret = virtqueue_add_sgs(vi->cvq, sgs, out_num, in_num, vi, GFP_ATOMIC); + } if (ret < 0) { dev_warn(&vi->vdev->dev, "Failed to add sgs for command vq: %d\n.", ret); @@ -2334,14 +2492,7 @@ static bool virtnet_send_command_reply(struct virtnet_info *vi, u8 class, u8 cmd if (unlikely(!virtqueue_kick(vi->cvq))) goto unlock; - /* Spin for a response, the kick causes an ioport write, trapping - * into the hypervisor, so the request should be handled immediately. 
- */ - while (!virtqueue_get_buf(vi->cvq, &tmp) && - !virtqueue_is_broken(vi->cvq)) { - cond_resched(); - cpu_relax(); - } + virtnet_cvq_response(vi, true, false); unlock: mutex_unlock(&vi->cvq_lock); @@ -2497,7 +2648,7 @@ static int virtnet_close(struct net_device *dev) xdp_rxq_info_unreg(&vi->rq[i].xdp_rxq); napi_disable(&vi->rq[i].napi); virtnet_napi_tx_disable(&vi->sq[i].napi); - cancel_work_sync(&vi->rq[i].dim.work); + net_dim_work_cancel(&vi->rq[i].dim); } return 0; @@ -2875,11 +3026,9 @@ static int virtnet_set_ringparam(struct net_device *dev, return err; /* The reason is same as the transmit virtqueue reset */ - mutex_lock(&vi->rq[i].dim_lock); err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, i, vi->intr_coal_rx.max_usecs, vi->intr_coal_rx.max_packets); - mutex_unlock(&vi->rq[i].dim_lock); if (err) return err; } @@ -3765,22 +3914,16 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, ec->rx_max_coalesced_frames != vi->intr_coal_rx.max_packets)) return -EINVAL; - /* Acquire all queues dim_locks */ - for (i = 0; i < vi->max_queue_pairs; i++) - mutex_lock(&vi->rq[i].dim_lock); - if (rx_ctrl_dim_on && !vi->rx_dim_enabled) { vi->rx_dim_enabled = true; for (i = 0; i < vi->max_queue_pairs; i++) vi->rq[i].dim_enabled = true; - goto unlock; + return 0; } coal_rx = kzalloc(sizeof(*coal_rx), GFP_KERNEL); - if (!coal_rx) { - ret = -ENOMEM; - goto unlock; - } + if (!coal_rx) + return -ENOMEM; if (!rx_ctrl_dim_on && vi->rx_dim_enabled) { vi->rx_dim_enabled = false; @@ -3800,7 +3943,7 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, VIRTIO_NET_CTRL_NOTF_COAL_RX_SET, &sgs_rx)) { ret = -EINVAL; - goto unlock; + goto out; } vi->intr_coal_rx.max_usecs = ec->rx_coalesce_usecs; @@ -3810,9 +3953,7 @@ static int virtnet_send_rx_notf_coal_cmds(struct virtnet_info *vi, vi->rq[i].intr_coal.max_packets = ec->rx_max_coalesced_frames; } -unlock: - for (i = vi->max_queue_pairs - 1; i >= 0; i--) - mutex_unlock(&vi->rq[i].dim_lock); +out: kfree(coal_rx); 
return ret; } @@ -3842,20 +3983,16 @@ static int virtnet_send_rx_notf_coal_vq_cmds(struct virtnet_info *vi, bool cur_rx_dim; int err; - mutex_lock(&vi->rq[queue].dim_lock); cur_rx_dim = vi->rq[queue].dim_enabled; max_usecs = vi->rq[queue].intr_coal.max_usecs; max_packets = vi->rq[queue].intr_coal.max_packets; if (rx_ctrl_dim_on && (ec->rx_coalesce_usecs != max_usecs || - ec->rx_max_coalesced_frames != max_packets)) { - mutex_unlock(&vi->rq[queue].dim_lock); + ec->rx_max_coalesced_frames != max_packets)) return -EINVAL; - } if (rx_ctrl_dim_on && !cur_rx_dim) { vi->rq[queue].dim_enabled = true; - mutex_unlock(&vi->rq[queue].dim_lock); return 0; } @@ -3868,7 +4005,6 @@ static int virtnet_send_rx_notf_coal_vq_cmds(struct virtnet_info *vi, err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, queue, ec->rx_coalesce_usecs, ec->rx_max_coalesced_frames); - mutex_unlock(&vi->rq[queue].dim_lock); return err; } @@ -3891,35 +4027,75 @@ static int virtnet_send_notf_coal_vq_cmds(struct virtnet_info *vi, return 0; } +static void virtnet_put_wait_coal(struct virtnet_info *vi, + struct receive_queue *rq, + struct dim_cq_moder moder) +{ + struct virtnet_coal_node *wait_node; + + wait_node = kzalloc(sizeof(*wait_node), GFP_KERNEL); + if (!wait_node) { + rq->dim.state = DIM_START_MEASURE; + return; + } + + wait_node->is_wait = true; + wait_node->coal_vqs.vqn = cpu_to_le16(rxq2vq(rq - vi->rq)); + wait_node->coal_vqs.coal.max_usecs = cpu_to_le32(moder.usec); + wait_node->coal_vqs.coal.max_packets = cpu_to_le32(moder.pkts); + mutex_lock(&vi->coal_wait_lock); + list_add_tail(&wait_node->list, &vi->coal_wait_list); + mutex_unlock(&vi->coal_wait_lock); +} + static void virtnet_rx_dim_work(struct work_struct *work) { struct dim *dim = container_of(work, struct dim, work); struct receive_queue *rq = container_of(dim, struct receive_queue, dim); struct virtnet_info *vi = rq->vq->vdev->priv; - struct net_device *dev = vi->dev; + struct virtnet_coal_node *avail_coal; struct dim_cq_moder update_moder; - int 
qnum, err; - qnum = rq - vi->rq; + update_moder = net_dim_get_rx_irq_moder(vi->dev, dim); - mutex_lock(&rq->dim_lock); - if (!rq->dim_enabled) - goto out; + if (!rq->dim_enabled || + (update_moder.usec == rq->intr_coal.max_usecs && + update_moder.pkts == rq->intr_coal.max_packets)) { + rq->dim.state = DIM_START_MEASURE; + return; + } - update_moder = net_dim_get_rx_moderation(dim->mode, dim->profile_ix); - if (update_moder.usec != rq->intr_coal.max_usecs || - update_moder.pkts != rq->intr_coal.max_packets) { - err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum, - update_moder.usec, - update_moder.pkts); - if (err) - pr_debug("%s: Failed to send dim parameters on rxq%d\n", - dev->name, qnum); + mutex_lock(&vi->cvq_lock); + if (vi->cvq->num_free < 3) { + virtnet_put_wait_coal(vi, rq, update_moder); + mutex_unlock(&vi->cvq_lock); + return; } -out: - dim->state = DIM_START_MEASURE; - mutex_unlock(&rq->dim_lock); + mutex_unlock(&vi->cvq_lock); + + mutex_lock(&vi->coal_free_lock); + if (list_empty(&vi->coal_free_list)) { + virtnet_put_wait_coal(vi, rq, update_moder); + mutex_unlock(&vi->coal_free_lock); + return; + } + + avail_coal = list_first_entry(&vi->coal_free_list, + struct virtnet_coal_node, list); + avail_coal->coal_vqs.vqn = cpu_to_le16(rxq2vq(rq - vi->rq)); + avail_coal->coal_vqs.coal.max_usecs = cpu_to_le32(update_moder.usec); + avail_coal->coal_vqs.coal.max_packets = cpu_to_le32(update_moder.pkts); + + list_del(&avail_coal->list); + mutex_unlock(&vi->coal_free_lock); + + virtnet_add_dim_command(vi, avail_coal); + + mutex_lock(&vi->cvq_lock); + if (vi->dim_cmd_nums) + schedule_delayed_work(&vi->get_cvq, 1); + mutex_unlock(&vi->cvq_lock); } static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) @@ -3937,6 +4113,48 @@ static int virtnet_coal_params_supported(struct ethtool_coalesce *ec) return 0; } +static void virtnet_del_coal_free_list(struct virtnet_info *vi) +{ + struct virtnet_coal_node *coal_node, *tmp; + + list_for_each_entry_safe(coal_node, 
tmp, &vi->coal_free_list, list) { + list_del(&coal_node->list); + kfree(coal_node); + } +} + +static int virtnet_init_coal_list(struct virtnet_info *vi) +{ + struct virtnet_coal_node *coal_node; + int batch_dim_nums; + int i; + + INIT_LIST_HEAD(&vi->coal_free_list); + mutex_init(&vi->coal_free_lock); + + INIT_LIST_HEAD(&vi->coal_wait_list); + mutex_init(&vi->coal_wait_lock); + + INIT_DELAYED_WORK(&vi->get_cvq, virtnet_get_cvq_work); + + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return 0; + + vi->dim_cmd_nums = 0; + batch_dim_nums = min((unsigned int)vi->max_queue_pairs, + virtqueue_get_vring_size(vi->cvq) / 3); + for (i = 0; i < batch_dim_nums; i++) { + coal_node = kzalloc(sizeof(*coal_node), GFP_KERNEL); + if (!coal_node) { + virtnet_del_coal_free_list(vi); + return -ENOMEM; + } + list_add(&coal_node->list, &vi->coal_free_list); + } + + return 0; +} + static int virtnet_should_update_vq_weight(int dev_flags, int weight, int vq_weight, bool *should_update) { @@ -4053,13 +4271,11 @@ static int virtnet_get_per_queue_coalesce(struct net_device *dev, return -EINVAL; if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) { - mutex_lock(&vi->rq[queue].dim_lock); ec->rx_coalesce_usecs = vi->rq[queue].intr_coal.max_usecs; ec->tx_coalesce_usecs = vi->sq[queue].intr_coal.max_usecs; ec->tx_max_coalesced_frames = vi->sq[queue].intr_coal.max_packets; ec->rx_max_coalesced_frames = vi->rq[queue].intr_coal.max_packets; ec->use_adaptive_rx_coalesce = vi->rq[queue].dim_enabled; - mutex_unlock(&vi->rq[queue].dim_lock); } else { ec->rx_max_coalesced_frames = 1; @@ -4860,6 +5076,36 @@ static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue) } } +static int virtnet_init_irq_moder(struct virtnet_info *vi) +{ + u8 profile_flags = 0, coal_flags = 0; + int ret, i; + + profile_flags |= DIM_PROFILE_RX; + coal_flags |= DIM_COALESCE_USEC | DIM_COALESCE_PKTS; + ret = net_dim_init_irq_moder(vi->dev, profile_flags, coal_flags, + 
DIM_CQ_PERIOD_MODE_START_FROM_EQE, + 0, virtnet_rx_dim_work, NULL); + + if (ret) + return ret; + + for (i = 0; i < vi->max_queue_pairs; i++) + net_dim_setting(vi->dev, &vi->rq[i].dim, false); + + return 0; +} + +static void virtnet_free_irq_moder(struct virtnet_info *vi) +{ + if (!virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) + return; + + rtnl_lock(); + net_dim_free_irq_moder(vi->dev); + rtnl_unlock(); +} + static const struct net_device_ops virtnet_netdev = { .ndo_open = virtnet_open, .ndo_stop = virtnet_close, @@ -5158,16 +5404,12 @@ static int virtnet_alloc_queues(struct virtnet_info *vi) netif_tx_napi_add(vi->dev, &vi->sq[i].napi, virtnet_poll_tx, napi_tx ? napi_weight : 0); - INIT_WORK(&vi->rq[i].dim.work, virtnet_rx_dim_work); - vi->rq[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE; - sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg)); ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len); sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg)); u64_stats_init(&vi->rq[i].stats.syncp); u64_stats_init(&vi->sq[i].stats.syncp); - mutex_init(&vi->rq[i].dim_lock); } return 0; @@ -5494,6 +5736,9 @@ static int virtnet_probe(struct virtio_device *vdev) if (err) goto free; + if (virtnet_init_coal_list(vi)) + goto free; + if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_NOTF_COAL)) { vi->intr_coal_rx.max_usecs = 0; vi->intr_coal_tx.max_usecs = 0; @@ -5513,6 +5758,10 @@ static int virtnet_probe(struct virtio_device *vdev) for (i = 0; i < vi->max_queue_pairs; i++) if (vi->sq[i].napi.weight) vi->sq[i].intr_coal.max_packets = 1; + + err = virtnet_init_irq_moder(vi); + if (err) + goto free; } #ifdef CONFIG_SYSFS @@ -5648,10 +5897,14 @@ static void virtnet_remove(struct virtio_device *vdev) disable_rx_mode_work(vi); flush_work(&vi->rx_mode_work); + virtnet_free_irq_moder(vi); + unregister_netdev(vi->dev); net_failover_destroy(vi->failover); + virtnet_del_coal_free_list(vi); + remove_vq_common(vi); free_netdev(vi->dev); diff --git a/drivers/soc/fsl/Kconfig 
b/drivers/soc/fsl/Kconfig index 4df32bc4c7a6ec38e262d059f0ad25c94864bc2b..66d8ef596c6c9d84626416a19785f870d57a3695 100644 --- a/drivers/soc/fsl/Kconfig +++ b/drivers/soc/fsl/Kconfig @@ -22,7 +22,7 @@ config FSL_GUTS config FSL_MC_DPIO tristate "QorIQ DPAA2 DPIO driver" - depends on FSL_MC_BUS + depends on FSL_MC_BUS && NET select SOC_BUS help Driver for the DPAA2 DPIO object. A DPIO provides queue and diff --git a/include/linux/dim.h b/include/linux/dim.h index 6c5733981563eadf5f06c59c5dc97df961692b02..020f0c20116ea1167ea2329d578ccb7eddbb6baa 100644 --- a/include/linux/dim.h +++ b/include/linux/dim.h @@ -10,12 +10,23 @@ #include #include +struct net_device; + +/* Number of DIM profiles and period mode. */ +#define NET_DIM_PARAMS_NUM_PROFILES 5 +#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 +#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 +#define NET_DIM_DEF_PROFILE_CQE 1 +#define NET_DIM_DEF_PROFILE_EQE 1 + /* * Number of events between DIM iterations. * Causes a moderation of the algorithm run. */ #define DIM_NEVENTS 64 +#define DIM_RATIO 5 + /* * Is a difference between values justifies taking an action. * We consider 10% difference as significant. @@ -23,6 +34,10 @@ #define IS_SIGNIFICANT_DIFF(val, ref) \ ((ref) && (((100UL * abs((val) - (ref))) / (ref)) > 10)) +/* Consider 1% difference as traffic is stable. */ +#define IS_SIGNIFICANT_DIFF_1(val, ref) \ + ((ref) && (((100UL * abs((val) - (ref))) / (ref)) <= 1)) + /* * Calculate the gap between two values. * Take wrap-around and variable size into consideration. 
@@ -38,12 +53,45 @@ * @pkts: CQ packet counter suggestion (by DIM) * @comps: Completion counter * @cq_period_mode: CQ period count mode (from CQE/EQE) + * @rcu: for asynchronous kfree_rcu */ struct dim_cq_moder { u16 usec; u16 pkts; u16 comps; u8 cq_period_mode; + struct rcu_head rcu; +}; + +#define DIM_PROFILE_RX BIT(0) /* support rx profile modification */ +#define DIM_PROFILE_TX BIT(1) /* support tx profile modification */ + +#define DIM_COALESCE_USEC BIT(0) /* support usec field modification */ +#define DIM_COALESCE_PKTS BIT(1) /* support pkts field modification */ +#define DIM_COALESCE_COMPS BIT(2) /* support comps field modification */ + +/** + * struct dim_irq_moder - Structure for irq moderation information. + * Used to collect irq moderation related information. + * + * @profile_flags: DIM_PROFILE_* + * @coal_flags: DIM_COALESCE_* for Rx and Tx + * @dim_rx_mode: Rx DIM period count mode: CQE or EQE + * @dim_tx_mode: Tx DIM period count mode: CQE or EQE + * @rx_profile: DIM profile list for Rx + * @tx_profile: DIM profile list for Tx + * @rx_dim_work: Rx DIM worker scheduled by net_dim() + * @tx_dim_work: Tx DIM worker scheduled by net_dim() + */ +struct dim_irq_moder { + u8 profile_flags; + u8 coal_flags; + u8 dim_rx_mode; + u8 dim_tx_mode; + struct dim_cq_moder __rcu *rx_profile; + struct dim_cq_moder __rcu *tx_profile; + void (*rx_dim_work)(struct work_struct *work); + void (*tx_dim_work)(struct work_struct *work); }; /** @@ -191,6 +239,77 @@ enum dim_step_result { DIM_ON_EDGE, }; +/** + * net_dim_init_irq_moder - collect information to initialize irq moderation + * @dev: target network device + * @profile_flags: Rx or Tx profile modification capability + * @coal_flags: irq moderation params flags + * @rx_mode: CQ period mode for Rx + * @tx_mode: CQ period mode for Tx + * @rx_dim_work: Rx worker called after dim decision + * @tx_dim_work: Tx worker called after dim decision + * + * Return: 0 on success or a negative error code. 
+ */ +int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags, + u8 coal_flags, u8 rx_mode, u8 tx_mode, + void (*rx_dim_work)(struct work_struct *work), + void (*tx_dim_work)(struct work_struct *work)); + +/** + * net_dim_free_irq_moder - free fields for irq moderation + * @dev: target network device + */ +void net_dim_free_irq_moder(struct net_device *dev); + +/** + * net_dim_setting - initialize DIM's cq mode and schedule worker + * @dev: target network device + * @dim: DIM context + * @is_tx: true indicates the tx direction, false indicates the rx direction + */ +void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx); + +/** + * net_dim_work_cancel - synchronously cancel dim's worker + * @dim: DIM context + */ +void net_dim_work_cancel(struct dim *dim); + +/** + * net_dim_get_rx_irq_moder - get DIM rx results based on profile_ix + * @dev: target network device + * @dim: DIM context + * + * Return: DIM irq moderation + */ +struct dim_cq_moder +net_dim_get_rx_irq_moder(struct net_device *dev, struct dim *dim); + +/** + * net_dim_get_tx_irq_moder - get DIM tx results based on profile_ix + * @dev: target network device + * @dim: DIM context + * + * Return: DIM irq moderation + */ +struct dim_cq_moder +net_dim_get_tx_irq_moder(struct net_device *dev, struct dim *dim); + +/** + * net_dim_set_rx_mode - set DIM rx cq mode + * @dev: target network device + * @rx_mode: target rx cq mode + */ +void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode); + +/** + * net_dim_set_tx_mode - set DIM tx cq mode + * @dev: target network device + * @tx_mode: target tx cq mode + */ +void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode); + /** * dim_on_top - check if current state is a good place to stop (top location) * @dim: DIM context @@ -312,6 +431,18 @@ struct dim_cq_moder net_dim_get_def_tx_moderation(u8 cq_period_mode); */ void net_dim(struct dim *dim, struct dim_sample end_sample); +/** + * net_dim_tune - DIM algorithm entry point with 
tunable params + * @dim: DIM instance information + * @end_sample: Current data measurement + * @sample_events: Sampling event interval + * @tune_traffic: non-high load traffic decision optimization + * + * This provides more tuning parameter settings than the net_dim interface. + */ +void net_dim_tune(struct dim *dim, struct dim_sample end_sample, + u16 sample_events, bool tune_traffic); + /* RDMA DIM */ /* diff --git a/include/linux/ethtool.h b/include/linux/ethtool.h index a5592a2decd20fe485a09d7ceb2386b4eb6a61b1..028a66710909409c46622dc496ec4f103e9f035d 100644 --- a/include/linux/ethtool.h +++ b/include/linux/ethtool.h @@ -214,7 +214,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, #define ETHTOOL_COALESCE_TX_USECS_HIGH BIT(19) #define ETHTOOL_COALESCE_TX_MAX_FRAMES_HIGH BIT(20) #define ETHTOOL_COALESCE_RATE_SAMPLE_INTERVAL BIT(21) -#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(21, 0) +#define ETHTOOL_COALESCE_RX_PROFILE BIT(22) +#define ETHTOOL_COALESCE_TX_PROFILE BIT(23) +#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(23, 0) #define ETHTOOL_COALESCE_USECS \ (ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS) diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index ce53951e9754d94f082d2d112c7ca1631a54ec5c..7e972b4f7ca1d120a9e15d56d84323ec3ae58cd1 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -2235,6 +2235,8 @@ struct net_device { struct bpf_xdp_entity xdp_state[__MAX_XDP_MODE]; CK_KABI_USE_SPLIT(1, enum netdev_stat_type pcpu_stat_type:8) + /** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). 
*/ + CK_KABI_USE(2, struct dim_irq_moder *irq_moder) CK_KABI_RESERVE(2) CK_KABI_RESERVE(3) CK_KABI_RESERVE(4) diff --git a/include/uapi/linux/ethtool_netlink.h b/include/uapi/linux/ethtool_netlink.h index c94fa2941502123856eb5564049628ca668eb88a..f8848aeac67fa2f4652276ef44e5462a5e6748c8 100644 --- a/include/uapi/linux/ethtool_netlink.h +++ b/include/uapi/linux/ethtool_netlink.h @@ -366,12 +366,32 @@ enum { ETHTOOL_A_COALESCE_TX_USECS_HIGH, /* u32 */ ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH, /* u32 */ ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL, /* u32 */ + ETHTOOL_A_COALESCE_RX_PROFILE, /* nest - _A_PROFILE_IRQ_MODERATION */ + ETHTOOL_A_COALESCE_TX_PROFILE, /* nest - _A_PROFILE_IRQ_MODERATION */ /* add new constants above here */ __ETHTOOL_A_COALESCE_CNT, ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1) }; +enum { + ETHTOOL_A_PROFILE_UNSPEC, + ETHTOOL_A_PROFILE_IRQ_MODERATION, /* nest, _A_IRQ_MODERATION_* */ + + __ETHTOOL_A_PROFILE_CNT, + ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1) +}; + +enum { + ETHTOOL_A_IRQ_MODERATION_UNSPEC, + ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */ + ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */ + + __ETHTOOL_A_IRQ_MODERATION_CNT, + ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1) +}; + /* PAUSE */ enum { diff --git a/lib/Kconfig b/lib/Kconfig index 3a762e856b9db0ad810f298a791d90098721b628..0246c78afba6a343a3b2855209e525b7e5787aab 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -576,6 +576,7 @@ config SIGNATURE config DIMLIB tristate + depends on NET help Dynamic Interrupt Moderation library. 
Implements an algorithm for dynamically changing CQ moderation values diff --git a/lib/dim/dim.c b/lib/dim/dim.c index 62c33de1d0692d7dfb32551f8b87f2216ebc4766..d87ea860f4d850efb265afefa55cf33e6e21cccb 100644 --- a/lib/dim/dim.c +++ b/lib/dim/dim.c @@ -64,13 +64,15 @@ void dim_calc_stats(struct dim_sample *start, struct dim_sample *end, start->byte_ctr); u32 ncomps = BIT_GAP(BITS_PER_TYPE(u32), end->comp_ctr, start->comp_ctr); + u16 nevents = BIT_GAP(BITS_PER_TYPE(u16), end->event_ctr, + start->event_ctr); if (!delta_us) return; curr_stats->ppms = DIV_ROUND_UP(npkts * USEC_PER_MSEC, delta_us); curr_stats->bpms = DIV_ROUND_UP(nbytes * USEC_PER_MSEC, delta_us); - curr_stats->epms = DIV_ROUND_UP(DIM_NEVENTS * USEC_PER_MSEC, + curr_stats->epms = DIV_ROUND_UP(nevents * USEC_PER_MSEC, delta_us); curr_stats->cpms = DIV_ROUND_UP(ncomps * USEC_PER_MSEC, delta_us); if (curr_stats->epms != 0) diff --git a/lib/dim/net_dim.c b/lib/dim/net_dim.c index dae3b51ac3d9befb2cb57a8ce77fc6d2a8e1fde1..9ff65f2240fad55c0c3a752dbbd382f81bedbb48 100644 --- a/lib/dim/net_dim.c +++ b/lib/dim/net_dim.c @@ -4,6 +4,7 @@ */ #include +#include /* * Net DIM profiles: @@ -11,12 +12,6 @@ * There are different set of profiles for RX/TX CQs. 
* Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES */ -#define NET_DIM_PARAMS_NUM_PROFILES 5 -#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256 -#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128 -#define NET_DIM_DEF_PROFILE_CQE 1 -#define NET_DIM_DEF_PROFILE_EQE 1 - #define NET_DIM_RX_EQE_PROFILES { \ {.usec = 1, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ {.usec = 8, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \ @@ -101,6 +96,143 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode) } EXPORT_SYMBOL(net_dim_get_def_tx_moderation); +int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags, + u8 coal_flags, u8 rx_mode, u8 tx_mode, + void (*rx_dim_work)(struct work_struct *work), + void (*tx_dim_work)(struct work_struct *work)) +{ + struct dim_cq_moder *rxp = NULL, *txp; + struct dim_irq_moder *moder; + int len; + + dev->irq_moder = kzalloc(sizeof(*dev->irq_moder), GFP_KERNEL); + if (!dev->irq_moder) + return -ENOMEM; + + moder = dev->irq_moder; + len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*moder->rx_profile); + + moder->coal_flags = coal_flags; + moder->profile_flags = profile_flags; + + if (profile_flags & DIM_PROFILE_RX) { + moder->rx_dim_work = rx_dim_work; + moder->dim_rx_mode = rx_mode; + rxp = kmemdup(rx_profile[rx_mode], len, GFP_KERNEL); + if (!rxp) + goto free_moder; + + rcu_assign_pointer(moder->rx_profile, rxp); + } + + if (profile_flags & DIM_PROFILE_TX) { + moder->tx_dim_work = tx_dim_work; + moder->dim_tx_mode = tx_mode; + txp = kmemdup(tx_profile[tx_mode], len, GFP_KERNEL); + if (!txp) + goto free_rxp; + + rcu_assign_pointer(moder->tx_profile, txp); + } + + return 0; + +free_rxp: + kfree(rxp); +free_moder: + kfree(moder); + return -ENOMEM; +} +EXPORT_SYMBOL(net_dim_init_irq_moder); + +/* RTNL lock is held. 
*/
+void net_dim_free_irq_moder(struct net_device *dev)
+{
+	struct dim_cq_moder *rxp, *txp;
+
+	if (!dev->irq_moder)
+		return;
+
+	rxp = rtnl_dereference(dev->irq_moder->rx_profile);
+	txp = rtnl_dereference(dev->irq_moder->tx_profile);
+	rcu_assign_pointer(dev->irq_moder->rx_profile, NULL);
+	rcu_assign_pointer(dev->irq_moder->tx_profile, NULL);
+	kfree_rcu(rxp, rcu);
+	kfree_rcu(txp, rcu);
+
+	kfree(dev->irq_moder);
+	dev->irq_moder = NULL;	/* a repeated call must not double free */
+}
+EXPORT_SYMBOL(net_dim_free_irq_moder);
+
+void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx)
+{
+	struct dim_irq_moder *irq_moder = dev->irq_moder;
+
+	if (!irq_moder)
+		return;
+
+	if (is_tx) {
+		INIT_WORK(&dim->work, irq_moder->tx_dim_work);
+		dim->mode = READ_ONCE(irq_moder->dim_tx_mode);
+		return;
+	}
+
+	INIT_WORK(&dim->work, irq_moder->rx_dim_work);
+	dim->mode = READ_ONCE(irq_moder->dim_rx_mode);
+}
+EXPORT_SYMBOL(net_dim_setting);
+
+void net_dim_work_cancel(struct dim *dim)
+{
+	cancel_work_sync(&dim->work);
+}
+EXPORT_SYMBOL(net_dim_work_cancel);
+
+struct dim_cq_moder net_dim_get_rx_irq_moder(struct net_device *dev,
+					     struct dim *dim)
+{
+	struct dim_cq_moder res, *profile;
+
+	rcu_read_lock();
+	profile = rcu_dereference(dev->irq_moder->rx_profile);
+	res = profile[dim->profile_ix];
+	rcu_read_unlock();
+
+	res.cq_period_mode = dim->mode;
+
+	return res;
+}
+EXPORT_SYMBOL(net_dim_get_rx_irq_moder);
+
+struct dim_cq_moder net_dim_get_tx_irq_moder(struct net_device *dev,
+					     struct dim *dim)
+{
+	struct dim_cq_moder res, *profile;
+
+	rcu_read_lock();
+	profile = rcu_dereference(dev->irq_moder->tx_profile);
+	res = profile[dim->profile_ix];
+	rcu_read_unlock();
+
+	res.cq_period_mode = dim->mode;
+
+	return res;
+}
+EXPORT_SYMBOL(net_dim_get_tx_irq_moder);
+
+void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode)
+{
+	WRITE_ONCE(dev->irq_moder->dim_rx_mode, rx_mode);
+}
+EXPORT_SYMBOL(net_dim_set_rx_mode);
+
+void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode)
+{
+	
WRITE_ONCE(dev->irq_moder->dim_tx_mode, tx_mode); +} +EXPORT_SYMBOL(net_dim_set_tx_mode); + static int net_dim_step(struct dim *dim) { if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2)) @@ -162,7 +294,8 @@ static int net_dim_stats_compare(struct dim_stats *curr, return DIM_STATS_SAME; } -static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) +static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim, + bool tune_traffic) { int prev_state = dim->tune_state; int prev_ix = dim->profile_ix; @@ -185,6 +318,16 @@ static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) case DIM_GOING_RIGHT: case DIM_GOING_LEFT: + /* A new local optimum for tune. Other states are cleared.*/ + if (tune_traffic && dim->prev_stats.ppms && curr_stats->epms && + IS_SIGNIFICANT_DIFF_1(curr_stats->ppms, dim->prev_stats.ppms) && + (curr_stats->ppms / curr_stats->epms) <= DIM_RATIO) { + dim_park_on_top(dim); + dim->profile_ix = 1; + dim->prev_stats = *curr_stats; + return true; + } + stats_res = net_dim_stats_compare(curr_stats, &dim->prev_stats); if (stats_res != DIM_STATS_BETTER) @@ -215,7 +358,8 @@ static bool net_dim_decision(struct dim_stats *curr_stats, struct dim *dim) return dim->profile_ix != prev_ix; } -void net_dim(struct dim *dim, struct dim_sample end_sample) +static void net_dim_impl(struct dim *dim, struct dim_sample end_sample, + u16 sample_events, bool tune_traffic) { struct dim_stats curr_stats; u16 nevents; @@ -225,10 +369,15 @@ void net_dim(struct dim *dim, struct dim_sample end_sample) nevents = BIT_GAP(BITS_PER_TYPE(u16), end_sample.event_ctr, dim->start_sample.event_ctr); - if (nevents < DIM_NEVENTS) + /* Compared with the timer method, judging the minimum + * interval of an iteration with a larger sample_nevents + * has a better results. 
+ */ + if (nevents < sample_events) break; + dim_calc_stats(&dim->start_sample, &end_sample, &curr_stats); - if (net_dim_decision(&curr_stats, dim)) { + if (net_dim_decision(&curr_stats, dim, tune_traffic)) { dim->state = DIM_APPLY_NEW_PROFILE; schedule_work(&dim->work); break; @@ -243,4 +392,16 @@ void net_dim(struct dim *dim, struct dim_sample end_sample) break; } } + +void net_dim(struct dim *dim, struct dim_sample end_sample) +{ + net_dim_impl(dim, end_sample, DIM_NEVENTS, false); +} EXPORT_SYMBOL(net_dim); + +void net_dim_tune(struct dim *dim, struct dim_sample end_sample, + u16 sample_events, bool tune_traffic) +{ + net_dim_impl(dim, end_sample, sample_events, tune_traffic); +} +EXPORT_SYMBOL(net_dim_tune); diff --git a/net/Kconfig b/net/Kconfig index 2611318081d312bc9dc24060d76a2607cf5d6634..0eb1f7c1f6c33c81dc1e1bac40e91298e5351edc 100644 --- a/net/Kconfig +++ b/net/Kconfig @@ -455,6 +455,7 @@ config FAILOVER config ETHTOOL_NETLINK bool "Netlink interface for ethtool" + select DIMLIB default y help An alternative userspace interface for ethtool based on generic diff --git a/net/ethtool/coalesce.c b/net/ethtool/coalesce.c index 1d6bc132aa4d0707f89576589d624ae31bb4cf7a..30e1d50abf0cdde54f40f7aeb6b4ef31ea7eb228 100644 --- a/net/ethtool/coalesce.c +++ b/net/ethtool/coalesce.c @@ -1,5 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only +#include #include "netlink.h" #include "common.h" @@ -79,6 +80,14 @@ static int coalesce_prepare_data(const struct ethnl_req_info *req_base, static int coalesce_reply_size(const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { + int modersz = nla_total_size(0) + /* _PROFILE_IRQ_MODERATION, nest */ + nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_USEC */ + nla_total_size(sizeof(u32)) + /* _IRQ_MODERATION_PKTS */ + nla_total_size(sizeof(u32)); /* _IRQ_MODERATION_COMPS */ + + int total_modersz = nla_total_size(0) + /* _{R,T}X_PROFILE, nest */ + modersz * NET_DIM_PARAMS_NUM_PROFILES; + return 
nla_total_size(sizeof(u32)) + /* _RX_USECS */ nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES */ nla_total_size(sizeof(u32)) + /* _RX_USECS_IRQ */ @@ -100,7 +109,8 @@ static int coalesce_reply_size(const struct ethnl_req_info *req_base, nla_total_size(sizeof(u32)) + /* _RX_MAX_FRAMES_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_USECS_HIGH */ nla_total_size(sizeof(u32)) + /* _TX_MAX_FRAMES_HIGH */ - nla_total_size(sizeof(u32)); /* _RATE_SAMPLE_INTERVAL */ + nla_total_size(sizeof(u32)) + /* _RATE_SAMPLE_INTERVAL */ + total_modersz * 2; /* _{R,T}X_PROFILE */ } static bool coalesce_put_u32(struct sk_buff *skb, u16 attr_type, u32 val, @@ -119,13 +129,83 @@ static bool coalesce_put_bool(struct sk_buff *skb, u16 attr_type, u32 val, return nla_put_u8(skb, attr_type, !!val); } +/** + * coalesce_put_profile - put a profile nest with one child nest per entry. + * @skb: socket buffer the message is stored in + * @attr_type: nest attr type ETHTOOL_A_COALESCE_*X_PROFILE + * @profile: data passed to userspace + * @coal_flags: modifiable parameters supported by the driver + * + * Put a dim profile nest attribute. Refer to ETHTOOL_A_PROFILE_IRQ_MODERATION. + * + * Return: 0 on success or a negative error code. 
+ */ +static int coalesce_put_profile(struct sk_buff *skb, u16 attr_type, + const struct dim_cq_moder *profile, + u8 coal_flags) +{ + struct nlattr *profile_attr, *moder_attr; + int i, ret; + + if (!profile || !coal_flags) + return 0; + + profile_attr = nla_nest_start(skb, attr_type); + if (!profile_attr) + return -EMSGSIZE; + + for (i = 0; i < NET_DIM_PARAMS_NUM_PROFILES; i++) { + moder_attr = nla_nest_start(skb, + ETHTOOL_A_PROFILE_IRQ_MODERATION); + if (!moder_attr) { + ret = -EMSGSIZE; + goto cancel_profile; + } + + if (coal_flags & DIM_COALESCE_USEC) { + ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_USEC, + profile[i].usec); + if (ret) + goto cancel_moder; + } + + if (coal_flags & DIM_COALESCE_PKTS) { + ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_PKTS, + profile[i].pkts); + if (ret) + goto cancel_moder; + } + + if (coal_flags & DIM_COALESCE_COMPS) { + ret = nla_put_u32(skb, ETHTOOL_A_IRQ_MODERATION_COMPS, + profile[i].comps); + if (ret) + goto cancel_moder; + } + + nla_nest_end(skb, moder_attr); + } + + nla_nest_end(skb, profile_attr); + + return 0; + +cancel_moder: + nla_nest_cancel(skb, moder_attr); +cancel_profile: + nla_nest_cancel(skb, profile_attr); + return ret; +} + static int coalesce_fill_reply(struct sk_buff *skb, const struct ethnl_req_info *req_base, const struct ethnl_reply_data *reply_base) { const struct coalesce_reply_data *data = COALESCE_REPDATA(reply_base); + struct dim_irq_moder *moder = req_base->dev->irq_moder; const struct ethtool_coalesce *coal = &data->coalesce; u32 supported = data->supported_params; + int ret = 0; if (coalesce_put_u32(skb, ETHTOOL_A_COALESCE_RX_USECS, coal->rx_coalesce_usecs, supported) || @@ -173,7 +253,26 @@ static int coalesce_fill_reply(struct sk_buff *skb, coal->rate_sample_interval, supported)) return -EMSGSIZE; - return 0; + if (!moder) + return 0; + + rcu_read_lock(); + if (moder->profile_flags & DIM_PROFILE_RX) { + ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_RX_PROFILE, + 
rcu_dereference(moder->rx_profile), + moder->coal_flags); + if (ret) + goto out; + } + + if (moder->profile_flags & DIM_PROFILE_TX) + ret = coalesce_put_profile(skb, ETHTOOL_A_COALESCE_TX_PROFILE, + rcu_dereference(moder->tx_profile), + moder->coal_flags); + +out: + rcu_read_unlock(); + return ret; } const struct ethnl_request_ops ethnl_coalesce_request_ops = { @@ -190,6 +289,17 @@ const struct ethnl_request_ops ethnl_coalesce_request_ops = { /* COALESCE_SET */ +static const struct nla_policy coalesce_irq_moderation_policy[] = { + [ETHTOOL_A_IRQ_MODERATION_USEC] = { .type = NLA_U32 }, + [ETHTOOL_A_IRQ_MODERATION_PKTS] = { .type = NLA_U32 }, + [ETHTOOL_A_IRQ_MODERATION_COMPS] = { .type = NLA_U32 }, +}; + +static const struct nla_policy coalesce_profile_policy[] = { + [ETHTOOL_A_PROFILE_IRQ_MODERATION] = + NLA_POLICY_NESTED(coalesce_irq_moderation_policy), +}; + const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_HEADER] = NLA_POLICY_NESTED(ethnl_header_policy), @@ -215,13 +325,144 @@ const struct nla_policy ethnl_coalesce_set_policy[] = { [ETHTOOL_A_COALESCE_TX_USECS_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH] = { .type = NLA_U32 }, [ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL] = { .type = NLA_U32 }, + [ETHTOOL_A_COALESCE_RX_PROFILE] = + NLA_POLICY_NESTED(coalesce_profile_policy), + [ETHTOOL_A_COALESCE_TX_PROFILE] = + NLA_POLICY_NESTED(coalesce_profile_policy), }; +/** + * ethnl_update_irq_moder - update a specific field in the given profile + * @irq_moder: place that collects dim related information + * @irq_field: field in profile to modify + * @attr_type: attr type ETHTOOL_A_IRQ_MODERATION_* + * @tb: netlink attribute with new values or null + * @coal_bit: DIM_COALESCE_* bit from coal_flags + * @extack: netlink extended ack + * + * Return: 0 on success or a negative error code. 
+ */
+static int ethnl_update_irq_moder(struct dim_irq_moder *irq_moder,
+				  u16 *irq_field, u16 attr_type,
+				  struct nlattr **tb, u8 coal_bit,
+				  struct netlink_ext_ack *extack)
+{
+	bool supported = irq_moder->coal_flags & coal_bit;
+
+	if (!tb[attr_type])
+		return 0;
+
+	/* Fields are u16: reject oversized values instead of truncating. */
+	if (!supported || nla_get_u32(tb[attr_type]) > U16_MAX) {
+		NL_SET_BAD_ATTR(extack, tb[attr_type]);
+		return supported ? -ERANGE : -EOPNOTSUPP;
+	}
+
+	*irq_field = nla_get_u32(tb[attr_type]);
+	return 0;
+}
+
+/**
+ * ethnl_update_profile - get a profile nest with child nests from userspace.
+ * @dev: netdevice to update the profile
+ * @dst: profile get from the driver and modified by ethnl_update_profile.
+ * @nests: nest attr ETHTOOL_A_COALESCE_*X_PROFILE to set profile.
+ * @extack: Netlink extended ack
+ *
+ * Layout of nests:
+ *	Nested ETHTOOL_A_COALESCE_*X_PROFILE attr
+ *		Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr
+ *			ETHTOOL_A_IRQ_MODERATION_USEC attr
+ *			ETHTOOL_A_IRQ_MODERATION_PKTS attr
+ *			ETHTOOL_A_IRQ_MODERATION_COMPS attr
+ *		...
+ *		Nested ETHTOOL_A_PROFILE_IRQ_MODERATION attr
+ *			ETHTOOL_A_IRQ_MODERATION_USEC attr
+ *			ETHTOOL_A_IRQ_MODERATION_PKTS attr
+ *			ETHTOOL_A_IRQ_MODERATION_COMPS attr
+ *
+ * Return: 0 on success or a negative error code.
+ */
+static int ethnl_update_profile(struct net_device *dev,
+				struct dim_cq_moder __rcu **dst,
+				const struct nlattr *nests,
+				struct netlink_ext_ack *extack)
+{
+	int len_irq_moder = ARRAY_SIZE(coalesce_irq_moderation_policy);
+	struct nlattr *tb[ARRAY_SIZE(coalesce_irq_moderation_policy)];
+	struct dim_irq_moder *irq_moder = dev->irq_moder;
+	struct dim_cq_moder *new_profile, *old_profile;
+	int ret, rem, i = 0, len;
+	struct nlattr *nest;
+
+	if (!nests)
+		return 0;
+
+	old_profile = rtnl_dereference(*dst);
+	if (!old_profile)
+		return -EOPNOTSUPP;
+
+	len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*old_profile);
+	new_profile = kmemdup(old_profile, len, GFP_KERNEL);
+	if (!new_profile)
+		return -ENOMEM;
+
+	nla_for_each_nested(nest, nests, rem) {
+		/* new_profile has exactly NET_DIM_PARAMS_NUM_PROFILES slots */
+		if (i >= NET_DIM_PARAMS_NUM_PROFILES ||
+		    nla_type(nest) != ETHTOOL_A_PROFILE_IRQ_MODERATION) {
+			NL_SET_BAD_ATTR(extack, nest);
+			ret = -EINVAL;
+			goto err_out;
+		}
+
+		ret = nla_parse_nested(tb, len_irq_moder - 1, nest,
+				       coalesce_irq_moderation_policy,
+				       extack);
+		if (ret)
+			goto err_out;
+
+		ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].usec,
+					     ETHTOOL_A_IRQ_MODERATION_USEC,
+					     tb, DIM_COALESCE_USEC, extack);
+		if (ret)
+			goto err_out;
+
+		ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].pkts,
+					     ETHTOOL_A_IRQ_MODERATION_PKTS,
+					     tb, DIM_COALESCE_PKTS, extack);
+		if (ret)
+			goto err_out;
+
+		ret = ethnl_update_irq_moder(irq_moder, &new_profile[i].comps,
+					     ETHTOOL_A_IRQ_MODERATION_COMPS,
+					     tb, DIM_COALESCE_COMPS, extack);
+		if (ret)
+			goto err_out;
+
+		i++;
+	}
+
+	/* After the profile is modified, dim itself is a dynamic
+	 * mechanism and will quickly fit to the appropriate
+	 * coalescing parameters according to the new profile.
+ */ + rcu_assign_pointer(*dst, new_profile); + kfree_rcu(old_profile, rcu); + + return 0; + +err_out: + kfree(new_profile); + return ret; +} + int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) { struct ethtool_coalesce coalesce = {}; struct ethnl_req_info req_info = {}; struct nlattr **tb = info->attrs; + struct dim_irq_moder *irq_moder; const struct ethtool_ops *ops; struct net_device *dev; u32 supported_params; @@ -242,7 +483,14 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) goto out_dev; /* make sure that only supported parameters are present */ + irq_moder = dev->irq_moder; supported_params = ops->supported_coalesce_params; + if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX) + supported_params |= ETHTOOL_COALESCE_RX_PROFILE; + + if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX) + supported_params |= ETHTOOL_COALESCE_TX_PROFILE; + for (a = ETHTOOL_A_COALESCE_RX_USECS; a < __ETHTOOL_A_COALESCE_CNT; a++) if (tb[a] && !(supported_params & attr_to_mask(a))) { ret = -EINVAL; @@ -303,6 +551,23 @@ int ethnl_set_coalesce(struct sk_buff *skb, struct genl_info *info) tb[ETHTOOL_A_COALESCE_TX_MAX_FRAMES_HIGH], &mod); ethnl_update_u32(&coalesce.rate_sample_interval, tb[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL], &mod); + + if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_RX) { + ret = ethnl_update_profile(dev, &irq_moder->rx_profile, + tb[ETHTOOL_A_COALESCE_RX_PROFILE], + info->extack); + if (ret < 0) + goto out_ops; + } + + if (irq_moder && irq_moder->profile_flags & DIM_PROFILE_TX) { + ret = ethnl_update_profile(dev, &irq_moder->tx_profile, + tb[ETHTOOL_A_COALESCE_TX_PROFILE], + info->extack); + if (ret < 0) + goto out_ops; + } + ret = 0; if (!mod) goto out_ops; diff --git a/net/ethtool/netlink.h b/net/ethtool/netlink.h index 979dee6bb88c5767f5749145ad0931c2cd421691..9434b726a42dc849b49a7a5d57ad1295d8420d6b 100644 --- a/net/ethtool/netlink.h +++ b/net/ethtool/netlink.h @@ -369,7 +369,7 @@ extern const 
struct nla_policy ethnl_rings_set_policy[ETHTOOL_A_RINGS_TX + 1]; extern const struct nla_policy ethnl_channels_get_policy[ETHTOOL_A_CHANNELS_HEADER + 1]; extern const struct nla_policy ethnl_channels_set_policy[ETHTOOL_A_CHANNELS_COMBINED_COUNT + 1]; extern const struct nla_policy ethnl_coalesce_get_policy[ETHTOOL_A_COALESCE_HEADER + 1]; -extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_RATE_SAMPLE_INTERVAL + 1]; +extern const struct nla_policy ethnl_coalesce_set_policy[ETHTOOL_A_COALESCE_MAX + 1]; extern const struct nla_policy ethnl_pause_get_policy[ETHTOOL_A_PAUSE_HEADER + 1]; extern const struct nla_policy ethnl_pause_set_policy[ETHTOOL_A_PAUSE_TX + 1]; extern const struct nla_policy ethnl_eee_get_policy[ETHTOOL_A_EEE_HEADER + 1];