diff --git a/Documentation/ABI/testing/sysfs-class-bdi b/Documentation/ABI/testing/sysfs-class-bdi index 5402bd74ba43b4303f95d026bcde1df86bbd737a..f1119b36f341ee5e4a2313503b7ea9de3ea4d15b 100644 --- a/Documentation/ABI/testing/sysfs-class-bdi +++ b/Documentation/ABI/testing/sysfs-class-bdi @@ -39,6 +39,21 @@ min_ratio (read-write) percentage of the write-back cache to a particular device. For example, this is useful for providing a minimum QoS. +What: /sys/class/bdi//min_ratio_fine +Date: November 2022 +Contact: Stefan Roesch +Description: + Under normal circumstances each device is given a part of the + total write-back cache that relates to its current average + writeout speed in relation to the other devices. + + The 'min_ratio_fine' parameter allows assigning a minimum reserve + of the write-back cache to a particular device. The value is + expressed as part of 1 million. For example, this is useful for + providing a minimum QoS. + + (read-write) + max_ratio (read-write) Allows limiting a particular device to use not more than the @@ -48,6 +63,59 @@ max_ratio (read-write) mount that is prone to get stuck, or a FUSE mount which cannot be trusted to play fair. + (read-write) + +What: /sys/class/bdi//max_ratio_fine +Date: November 2022 +Contact: Stefan Roesch +Description: + Allows limiting a particular device to use not more than the + given value of the write-back cache. The value is given as part + of 1 million. This is useful in situations where we want to avoid + one device taking all or most of the write-back cache. For example + in case of an NFS mount that is prone to get stuck, or a FUSE mount + which cannot be trusted to play fair. + + (read-write) + +What: /sys/class/bdi//min_bytes +Date: October 2022 +Contact: Stefan Roesch +Description: + Under normal circumstances each device is given a part of the + total write-back cache that relates to its current average + writeout speed in relation to the other devices. + + The 'min_bytes' parameter allows assigning a minimum + percentage of the write-back cache to a particular device + expressed in bytes. + For example, this is useful for providing a minimum QoS. + + (read-write) + +What: /sys/class/bdi//max_bytes +Date: October 2022 +Contact: Stefan Roesch +Description: + Allows limiting a particular device to use not more than the + given 'max_bytes' of the write-back cache. This is useful in + situations where we want to avoid one device taking all or + most of the write-back cache. For example in case of an NFS + mount that is prone to get stuck, a FUSE mount which cannot be + trusted to play fair, or a nbd device. + + (read-write) + +What: /sys/class/bdi//strict_limit +Date: October 2022 +Contact: Stefan Roesch +Description: + Forces per-BDI checks for the share of given device in the write-back + cache even before the global background dirty limit is reached. This + is useful in situations where the global limit is much higher than + affordable for given relatively slow (or untrusted) device. Turning + strictlimit on has no visible effect if max_ratio is equal to 100%. + stable_pages_required (read-only) If set, the backing device requires that all pages comprising a write diff --git a/include/linux/backing-dev.h b/include/linux/backing-dev.h index a689a21abe1002cb4dc314cf16ef7fab72ec0bbf..b10df87d6c348dc3054c14b570f702270fc3829a 100644 --- a/include/linux/backing-dev.h +++ b/include/linux/backing-dev.h @@ -104,8 +104,18 @@ static inline unsigned long wb_stat_error(void) #endif } +/* BDI ratio is expressed as part per 1000000 for finer granularity. */ +#define BDI_RATIO_SCALE 10000 + +u64 bdi_get_min_bytes(struct backing_dev_info *bdi); +u64 bdi_get_max_bytes(struct backing_dev_info *bdi); int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio); int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio); +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio); +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes); +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes); +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit); /* * Flags in backing_dev_info::capability diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 5f4f16dbff9dfcc069d0f455e48a907a9a4f728f..6e9162004a916f5ab87f41a35e05391e741e42a6 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -176,7 +176,26 @@ static ssize_t min_ratio_store(struct device *dev, return ret; } -BDI_SHOW(min_ratio, bdi->min_ratio) +BDI_SHOW(min_ratio, bdi->min_ratio / BDI_RATIO_SCALE) + +static ssize_t min_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_min_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(min_ratio_fine, bdi->min_ratio) static ssize_t max_ratio_store(struct device *dev, struct device_attribute *attr, const char *buf, size_t count) @@ -195,7 +214,82 @@ static ssize_t max_ratio_store(struct device *dev, return ret; } -BDI_SHOW(max_ratio, bdi->max_ratio) +BDI_SHOW(max_ratio, bdi->max_ratio / BDI_RATIO_SCALE) + +static ssize_t max_ratio_fine_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int ratio; + ssize_t ret; + + ret = kstrtouint(buf, 10, &ratio); + if (ret < 0) + return ret; + + ret = bdi_set_max_ratio_no_scale(bdi, ratio); + if (!ret) + ret = count; + + return ret; +} +BDI_SHOW(max_ratio_fine, bdi->max_ratio) + +static ssize_t min_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_min_bytes(bdi)); +} + +static ssize_t min_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_min_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(min_bytes); + +static ssize_t max_bytes_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%llu\n", bdi_get_max_bytes(bdi)); +} + +static ssize_t max_bytes_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + u64 bytes; + ssize_t ret; + + ret = kstrtoull(buf, 10, &bytes); + if (ret < 0) + return ret; + + ret = bdi_set_max_bytes(bdi, bytes); + if (!ret) + ret = count; + + return ret; +} +DEVICE_ATTR_RW(max_bytes); static ssize_t stable_pages_required_show(struct device *dev, struct device_attribute *attr, @@ -207,11 +301,44 @@ static ssize_t stable_pages_required_show(struct device *dev, } static DEVICE_ATTR_RO(stable_pages_required); +static ssize_t strict_limit_store(struct device *dev, + struct device_attribute *attr, const char *buf, size_t count) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + unsigned int strict_limit; + ssize_t ret; + + ret = kstrtouint(buf, 10, &strict_limit); + if (ret < 0) + return ret; + + ret = bdi_set_strict_limit(bdi, strict_limit); + if (!ret) + ret = count; + + return ret; +} + +static ssize_t strict_limit_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct backing_dev_info *bdi = dev_get_drvdata(dev); + + return sysfs_emit(buf, "%d\n", + !!(bdi->capabilities & BDI_CAP_STRICTLIMIT)); +} +static DEVICE_ATTR_RW(strict_limit); + static struct attribute *bdi_dev_attrs[] = { &dev_attr_read_ahead_kb.attr, &dev_attr_min_ratio.attr, + &dev_attr_min_ratio_fine.attr, &dev_attr_max_ratio.attr, + &dev_attr_max_ratio_fine.attr, + &dev_attr_min_bytes.attr, + &dev_attr_max_bytes.attr, &dev_attr_stable_pages_required.attr, + &dev_attr_strict_limit.attr, NULL, }; ATTRIBUTE_GROUPS(bdi_dev); @@ -780,7 +907,7 @@ static int bdi_init(struct backing_dev_info *bdi) kref_init(&bdi->refcnt); bdi->min_ratio = 0; - bdi->max_ratio = 100; + bdi->max_ratio = 100 * BDI_RATIO_SCALE; bdi->max_prop_frac = FPROP_FRAC_BASE; INIT_LIST_HEAD(&bdi->bdi_list); INIT_LIST_HEAD(&bdi->wb_list); diff --git a/mm/page-writeback.c b/mm/page-writeback.c index 0d7cc65c636708099b8925ecf51f781f1a89ff09..0062f1146a30194f230b32c2920991250044ee4a 100644 --- a/mm/page-writeback.c +++ b/mm/page-writeback.c @@ -13,6 +13,7 @@ */ #include +#include #include #include #include @@ -198,7 +199,7 @@ static void wb_min_max_ratio(struct bdi_writeback *wb, min *= this_bw; min = div64_ul(min, tot_bw); } - if (max < 100) { + if (max < 100 * BDI_RATIO_SCALE) { max *= this_bw; max = div64_ul(max, tot_bw); } @@ -683,20 +684,64 @@ void wb_domain_exit(struct wb_domain *dom) */ static unsigned int bdi_min_ratio; -int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +static int bdi_check_pages_limit(unsigned long pages) +{ + unsigned long max_dirty_pages = global_dirtyable_memory(); + + if (pages > max_dirty_pages) + return -EINVAL; + + return 0; +} + +static unsigned long bdi_ratio_from_pages(unsigned long pages) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + unsigned long ratio; + + global_dirty_limits(&background_thresh, &dirty_thresh); + ratio = div64_u64(pages * 100ULL * BDI_RATIO_SCALE, dirty_thresh); + + return ratio; +} + +static u64 bdi_get_bytes(unsigned int ratio) +{ + unsigned long background_thresh; + unsigned long dirty_thresh; + u64 bytes; + + global_dirty_limits(&background_thresh, &dirty_thresh); + bytes = (dirty_thresh * PAGE_SIZE * ratio) / BDI_RATIO_SCALE / 100; + + return bytes; +} + +static int __bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) { + unsigned int delta; int ret = 0; + if (min_ratio > 100 * BDI_RATIO_SCALE) + return -EINVAL; + spin_lock_bh(&bdi_lock); if (min_ratio > bdi->max_ratio) { ret = -EINVAL; } else { - min_ratio -= bdi->min_ratio; - if (bdi_min_ratio + min_ratio < 100) { - bdi_min_ratio += min_ratio; - bdi->min_ratio += min_ratio; + if (min_ratio < bdi->min_ratio) { + delta = bdi->min_ratio - min_ratio; + bdi_min_ratio -= delta; + bdi->min_ratio = min_ratio; } else { - ret = -EINVAL; + delta = min_ratio - bdi->min_ratio; + if (bdi_min_ratio + delta < 100 * BDI_RATIO_SCALE) { + bdi_min_ratio += delta; + bdi->min_ratio = min_ratio; + } else { + ret = -EINVAL; + } } } spin_unlock_bh(&bdi_lock); @@ -704,11 +749,11 @@ int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) return ret; } -int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) +static int __bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) { int ret = 0; - if (max_ratio > 100) + if (max_ratio > 100 * BDI_RATIO_SCALE) return -EINVAL; spin_lock_bh(&bdi_lock); @@ -716,14 +761,88 @@ int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned max_ratio) ret = -EINVAL; } else { bdi->max_ratio = max_ratio; - bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / 100; + bdi->max_prop_frac = (FPROP_FRAC_BASE * max_ratio) / + (100 * BDI_RATIO_SCALE); } spin_unlock_bh(&bdi_lock); return ret; } + +int bdi_set_min_ratio_no_scale(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio); +} + +int bdi_set_max_ratio_no_scale(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_min_ratio(struct backing_dev_info *bdi, unsigned int min_ratio) +{ + return __bdi_set_min_ratio(bdi, min_ratio * BDI_RATIO_SCALE); +} + +int bdi_set_max_ratio(struct backing_dev_info *bdi, unsigned int max_ratio) +{ + return __bdi_set_max_ratio(bdi, max_ratio * BDI_RATIO_SCALE); +} EXPORT_SYMBOL(bdi_set_max_ratio); +u64 bdi_get_min_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->min_ratio); +} + +int bdi_set_min_bytes(struct backing_dev_info *bdi, u64 min_bytes) +{ + int ret; + unsigned long pages = min_bytes >> PAGE_SHIFT; + unsigned long min_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + min_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_min_ratio(bdi, min_ratio); +} + +u64 bdi_get_max_bytes(struct backing_dev_info *bdi) +{ + return bdi_get_bytes(bdi->max_ratio); +} + +int bdi_set_max_bytes(struct backing_dev_info *bdi, u64 max_bytes) +{ + int ret; + unsigned long pages = max_bytes >> PAGE_SHIFT; + unsigned long max_ratio; + + ret = bdi_check_pages_limit(pages); + if (ret) + return ret; + + max_ratio = bdi_ratio_from_pages(pages); + return __bdi_set_max_ratio(bdi, max_ratio); +} + +int bdi_set_strict_limit(struct backing_dev_info *bdi, unsigned int strict_limit) +{ + if (strict_limit > 1) + return -EINVAL; + + spin_lock_bh(&bdi_lock); + if (strict_limit) + bdi->capabilities |= BDI_CAP_STRICTLIMIT; + else + bdi->capabilities &= ~BDI_CAP_STRICTLIMIT; + spin_unlock_bh(&bdi_lock); + + return 0; +} + static unsigned long dirty_freerun_ceiling(unsigned long thresh, unsigned long bg_thresh) { @@ -786,15 +905,15 @@ static unsigned long __wb_calc_thresh(struct dirty_throttle_control *dtc) fprop_fraction_percpu(&dom->completions, dtc->wb_completions, &numerator, &denominator); - wb_thresh = (thresh * (100 - bdi_min_ratio)) / 100; + wb_thresh = (thresh * (100 * BDI_RATIO_SCALE - bdi_min_ratio)) / (100 * BDI_RATIO_SCALE); wb_thresh *= numerator; wb_thresh = div64_ul(wb_thresh, denominator); wb_min_max_ratio(dtc->wb, &wb_min_ratio, &wb_max_ratio); - wb_thresh += (thresh * wb_min_ratio) / 100; - if (wb_thresh > (thresh * wb_max_ratio) / 100) - wb_thresh = thresh * wb_max_ratio / 100; + wb_thresh += (thresh * wb_min_ratio) / (100 * BDI_RATIO_SCALE); + if (wb_thresh > (thresh * wb_max_ratio) / (100 * BDI_RATIO_SCALE)) + wb_thresh = thresh * wb_max_ratio / (100 * BDI_RATIO_SCALE); return wb_thresh; }