Sign in
Sign up
Explore
Enterprise
Education
Search
Help
Terms of use
About Us
Explore
Enterprise
Education
Gitee Premium
Blog
I know
View Details
Sign in
Sign up
Fetch the repository succeeded.
Open Source
>
Other
>
Operation System
&&
Donate
Please sign in before you donate.
Cancel
Sign in
Scan WeChat QR to Pay
Cancel
Complete
Prompt
Switch to Alipay.
OK
Cancel
Watch
Unwatch
Watching
Releases Only
Ignoring
55
Star
33
Fork
80
src-openEuler
/
kernel
Code
Issues
78
Pull Requests
7
Wiki
Insights
Pipelines
Service
Gitee Pages
JavaDoc
PHPDoc
Quality Analysis
Gitee Scan
Jenkins for Gitee
Baidu Efficiency Cloud
Tencent CloudBase
Tencent Cloud Serverless
Don’t show this again
Create your Gitee Account
Explore and code with more than 8 million developers,Free private repositories !:)
Sign up
Already have an account?
Sign in
This repository doesn't specify license. Please pay attention to the specific project description and its upstream code dependency when using it.
The license selected for the repository is subject to the license used by the main branch of the repository.
openEuler-22.03-LTS
Manage
Manage
Branches (17)
Tags (11)
openEuler-22.03-LTS
openEuler-22.09
openEuler-22.03-LTS-Next
openEuler-20.03-LTS-SP3
openEuler-20.03-LTS-SP1
master
openEuler-20.03-LTS-SP2
openEuler-22.03-LTS-LoongArch
openEuler-21.09
openEuler-20.03-LTS
openEuler-20.03-LTS-Next
openEuler-21.03
openEuler-20.09
openEuler-20.03-LTS-SP1-testing
build-5.10-rc
openEuler1.0
openEuler1.0-base
openEuler-22.03-LTS-20220331
openEuler-22.03-LTS-round5
openEuler-22.03-LTS-round3
openEuler-22.03-LTS-round2
openEuler-22.03-LTS-round1
openEuler-20.03-LTS-SP3-release
openEuler-20.03-LTS-SP2-20210624
openEuler-21.03-20210330
openEuler-20.09-20200929
openEuler-20.03-LTS-20200606
openEuler-20.03-LTS-tag
Clone or Download
HTTPS
SSH
SVN
SVN+SSH
Copy
Download ZIP
Login prompt
This operation requires login to the code cloud account. Please log in before operating.
Go to login
No account. Register
kernel
/
0001-add-preemptRT-patch.patch
Branches 17
Tags 11
0001-add-preemptRT-patch.patch
798.94 KB
From 78c687b1fc2f5c5acbb9541fd47690b48df4c6ba Mon Sep 17 00:00:00 2001 From: zhangyuanhang <zhangyuanhang@kylinos.cn> Date: Sat, 19 Mar 2022 09:44:46 +0800 Subject: [PATCH 1/2] Add preempt-RT patch Signed-off-by: zhangyuanhang <zhangyuanhang@kylinos.cn> --- .../Expedited-Grace-Periods.rst | 4 +- .../RCU/Design/Requirements/Requirements.rst | 26 +- Documentation/RCU/checklist.rst | 2 +- Documentation/RCU/rcubarrier.rst | 6 +- Documentation/RCU/stallwarn.rst | 4 +- Documentation/RCU/whatisRCU.rst | 10 +- .../admin-guide/kernel-parameters.txt | 11 + Documentation/driver-api/io-mapping.rst | 92 +- arch/Kconfig | 8 +- arch/alpha/include/asm/kmap_types.h | 15 - arch/alpha/include/asm/spinlock_types.h | 4 - arch/arc/Kconfig | 1 + arch/arc/include/asm/highmem.h | 26 +- arch/arc/include/asm/kmap_types.h | 14 - arch/arc/mm/highmem.c | 54 +- arch/arm/Kconfig | 6 +- arch/arm/include/asm/fixmap.h | 4 +- arch/arm/include/asm/hardirq.h | 11 +- arch/arm/include/asm/highmem.h | 34 +- arch/arm/include/asm/irq.h | 2 + arch/arm/include/asm/kmap_types.h | 10 - arch/arm/include/asm/spinlock_types.h | 4 - arch/arm/include/asm/thread_info.h | 10 +- arch/arm/kernel/asm-offsets.c | 1 + arch/arm/kernel/entry-armv.S | 19 +- arch/arm/kernel/entry-common.S | 9 +- arch/arm/kernel/signal.c | 3 +- arch/arm/kernel/smp.c | 2 - arch/arm/mm/Makefile | 1 - arch/arm/mm/cache-feroceon-l2.c | 6 +- arch/arm/mm/cache-xsc3l2.c | 4 +- arch/arm/mm/fault.c | 3 + arch/arm/mm/highmem.c | 121 -- arch/arm64/Kconfig | 3 + arch/arm64/include/asm/hardirq.h | 7 +- arch/arm64/include/asm/preempt.h | 28 +- arch/arm64/include/asm/spinlock_types.h | 4 - arch/arm64/include/asm/thread_info.h | 7 +- arch/arm64/kernel/asm-offsets.c | 1 + arch/arm64/kernel/entry.S | 13 +- arch/arm64/kernel/fpsimd.c | 18 +- arch/arm64/kernel/signal.c | 2 +- arch/arm64/kvm/arm.c | 6 +- arch/csky/Kconfig | 1 + arch/csky/include/asm/fixmap.h | 4 +- arch/csky/include/asm/highmem.h | 6 +- arch/csky/mm/highmem.c | 75 +- arch/hexagon/include/asm/spinlock_types.h | 4 - arch/ia64/include/asm/kmap_types.h | 13 - arch/ia64/include/asm/spinlock_types.h | 4 - arch/ia64/kernel/time.c | 20 +- arch/microblaze/Kconfig | 1 + arch/microblaze/include/asm/fixmap.h | 4 +- arch/microblaze/include/asm/highmem.h | 6 +- arch/microblaze/mm/Makefile | 1 - arch/microblaze/mm/highmem.c | 78 - arch/microblaze/mm/init.c | 6 - arch/mips/Kconfig | 1 + arch/mips/include/asm/fixmap.h | 4 +- arch/mips/include/asm/highmem.h | 6 +- arch/mips/include/asm/kmap_types.h | 13 - arch/mips/kernel/crash_dump.c | 42 +- arch/mips/mm/highmem.c | 77 - arch/mips/mm/init.c | 4 - arch/nds32/Kconfig.cpu | 1 + arch/nds32/include/asm/fixmap.h | 4 +- arch/nds32/include/asm/highmem.h | 22 +- arch/nds32/mm/Makefile | 1 - arch/nds32/mm/highmem.c | 48 - arch/openrisc/mm/init.c | 1 - arch/openrisc/mm/ioremap.c | 1 - arch/parisc/include/asm/hardirq.h | 1 - arch/parisc/include/asm/kmap_types.h | 13 - arch/powerpc/Kconfig | 4 + arch/powerpc/include/asm/cmpxchg.h | 2 +- arch/powerpc/include/asm/fixmap.h | 4 +- arch/powerpc/include/asm/highmem.h | 7 +- arch/powerpc/include/asm/kmap_types.h | 13 - .../include/asm/simple_spinlock_types.h | 2 +- arch/powerpc/include/asm/spinlock_types.h | 4 - arch/powerpc/include/asm/stackprotector.h | 4 + arch/powerpc/include/asm/thread_info.h | 17 +- arch/powerpc/kernel/asm-offsets.c | 1 + arch/powerpc/kernel/entry_32.S | 23 +- arch/powerpc/kernel/exceptions-64e.S | 16 +- arch/powerpc/kernel/irq.c | 2 + arch/powerpc/kernel/misc_32.S | 2 + arch/powerpc/kernel/misc_64.S | 2 + arch/powerpc/kernel/nvram_64.c | 12 +- arch/powerpc/kernel/syscall_64.c | 10 +- arch/powerpc/kernel/time.c | 56 +- arch/powerpc/kernel/traps.c | 8 +- arch/powerpc/kernel/watchdog.c | 5 - arch/powerpc/kexec/crash.c | 3 - arch/powerpc/kvm/Kconfig | 1 + arch/powerpc/mm/Makefile | 1 - arch/powerpc/mm/highmem.c | 67 - arch/powerpc/mm/mem.c | 7 - arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- arch/powerpc/platforms/pseries/iommu.c | 31 +- arch/powerpc/xmon/xmon.c | 6 +- arch/s390/Kconfig | 1 + arch/s390/include/asm/spinlock_types.h | 4 - arch/s390/include/asm/vtime.h | 1 - arch/s390/kernel/vtime.c | 51 +- arch/sh/include/asm/fixmap.h | 8 - arch/sh/include/asm/hardirq.h | 14 +- arch/sh/include/asm/kmap_types.h | 15 - arch/sh/include/asm/spinlock_types.h | 4 - arch/sh/kernel/irq.c | 4 +- arch/sh/kernel/traps.c | 2 +- arch/sh/mm/init.c | 8 - arch/sparc/Kconfig | 1 + arch/sparc/include/asm/highmem.h | 8 +- arch/sparc/include/asm/kmap_types.h | 11 - arch/sparc/include/asm/vaddrs.h | 4 +- arch/sparc/kernel/irq_64.c | 2 + arch/sparc/mm/Makefile | 3 - arch/sparc/mm/highmem.c | 115 -- arch/sparc/mm/srmmu.c | 2 - arch/um/include/asm/fixmap.h | 1 - arch/um/include/asm/hardirq.h | 17 +- arch/um/include/asm/kmap_types.h | 13 - arch/um/kernel/kmsg_dump.c | 13 +- arch/x86/Kconfig | 3 + arch/x86/crypto/aesni-intel_glue.c | 22 +- arch/x86/crypto/cast5_avx_glue.c | 21 +- arch/x86/crypto/glue_helper.c | 26 +- arch/x86/include/asm/fixmap.h | 5 +- arch/x86/include/asm/fpu/api.h | 24 +- arch/x86/include/asm/highmem.h | 13 +- arch/x86/include/asm/iomap.h | 13 +- arch/x86/include/asm/kmap_types.h | 13 - arch/x86/include/asm/paravirt_types.h | 1 - arch/x86/include/asm/preempt.h | 37 +- arch/x86/include/asm/signal.h | 13 + arch/x86/include/asm/stackprotector.h | 8 +- arch/x86/include/asm/thread_info.h | 11 + arch/x86/kernel/cpu/mshyperv.c | 3 +- arch/x86/kernel/crash_dump_32.c | 48 +- arch/x86/kernel/fpu/core.c | 12 + arch/x86/kernel/irq_32.c | 2 + arch/x86/kernel/irq_64.c | 2 + arch/x86/kvm/x86.c | 8 + arch/x86/mm/highmem_32.c | 59 - arch/x86/mm/init_32.c | 15 - arch/x86/mm/iomap_32.c | 57 +- arch/xtensa/Kconfig | 1 + arch/xtensa/include/asm/fixmap.h | 4 +- arch/xtensa/include/asm/highmem.h | 12 +- arch/xtensa/include/asm/spinlock_types.h | 4 - arch/xtensa/mm/highmem.c | 46 +- block/blk-mq.c | 124 +- crypto/cryptd.c | 19 +- drivers/atm/eni.c | 2 +- drivers/block/zram/zram_drv.c | 36 + drivers/block/zram/zram_drv.h | 1 + drivers/char/random.c | 11 +- drivers/char/tpm/tpm-dev-common.c | 1 - drivers/char/tpm/tpm_tis.c | 29 +- drivers/firewire/ohci.c | 4 +- drivers/firmware/efi/efi.c | 5 +- drivers/gpu/drm/i915/display/intel_sprite.c | 15 +- .../gpu/drm/i915/gem/i915_gem_execbuffer.c | 7 +- drivers/gpu/drm/i915/gt/intel_breadcrumbs.c | 5 +- drivers/gpu/drm/i915/gt/intel_engine_pm.c | 8 +- drivers/gpu/drm/i915/i915_gem.c | 40 +- drivers/gpu/drm/i915/i915_irq.c | 2 + drivers/gpu/drm/i915/i915_trace.h | 6 +- drivers/gpu/drm/i915/selftests/i915_gem.c | 4 +- drivers/gpu/drm/i915/selftests/i915_gem_gtt.c | 8 +- .../drm/nouveau/nvkm/subdev/devinit/fbmem.h | 8 +- drivers/gpu/drm/qxl/qxl_image.c | 18 +- drivers/gpu/drm/qxl/qxl_ioctl.c | 27 +- drivers/gpu/drm/qxl/qxl_object.c | 12 +- drivers/gpu/drm/qxl/qxl_object.h | 4 +- drivers/gpu/drm/qxl/qxl_release.c | 4 +- drivers/gpu/drm/radeon/radeon_display.c | 2 + drivers/gpu/drm/ttm/ttm_bo_util.c | 20 +- drivers/gpu/drm/vmwgfx/vmwgfx_blit.c | 30 +- drivers/hv/hyperv_vmbus.h | 1 + drivers/hv/vmbus_drv.c | 10 +- drivers/leds/trigger/Kconfig | 1 + drivers/md/raid5.c | 7 +- drivers/md/raid5.h | 1 + drivers/mtd/mtdoops.c | 5 +- drivers/net/arcnet/arc-rimi.c | 4 +- drivers/net/arcnet/arcdevice.h | 6 + drivers/net/arcnet/arcnet.c | 66 +- drivers/net/arcnet/com20020-isa.c | 4 +- drivers/net/arcnet/com20020-pci.c | 2 +- drivers/net/arcnet/com20020_cs.c | 2 +- drivers/net/arcnet/com90io.c | 4 +- drivers/net/arcnet/com90xx.c | 4 +- drivers/net/ethernet/chelsio/cxgb/common.h | 6 +- drivers/net/ethernet/chelsio/cxgb/cxgb2.c | 54 +- drivers/net/ethernet/chelsio/cxgb/sge.c | 53 +- drivers/net/ethernet/chelsio/cxgb/sge.h | 3 +- drivers/net/ethernet/chelsio/cxgb/subr.c | 64 +- drivers/net/ethernet/dlink/sundance.c | 2 +- drivers/net/ethernet/jme.c | 10 +- drivers/net/ethernet/jme.h | 2 +- drivers/net/wireless/ath/ath9k/beacon.c | 2 +- drivers/pci/controller/pci-hyperv.c | 2 +- drivers/scsi/fcoe/fcoe.c | 16 +- drivers/scsi/fcoe/fcoe_ctlr.c | 4 +- drivers/scsi/libfc/fc_exch.c | 4 +- drivers/tty/serial/8250/8250.h | 47 +- drivers/tty/serial/8250/8250_core.c | 17 +- drivers/tty/serial/8250/8250_fsl.c | 9 + drivers/tty/serial/8250/8250_ingenic.c | 7 + drivers/tty/serial/8250/8250_mtk.c | 29 +- drivers/tty/serial/8250/8250_port.c | 92 +- drivers/tty/serial/amba-pl011.c | 17 +- drivers/tty/serial/omap-serial.c | 12 +- drivers/tty/tty_buffer.c | 2 - fs/afs/dir_silly.c | 2 +- fs/aio.c | 3 +- fs/btrfs/ctree.h | 1 - fs/cifs/readdir.c | 2 +- fs/dcache.c | 39 +- fs/eventfd.c | 12 +- fs/fscache/internal.h | 1 - fs/fscache/main.c | 6 - fs/fscache/object.c | 13 +- fs/fuse/readdir.c | 2 +- fs/inode.c | 2 +- fs/namei.c | 4 +- fs/namespace.c | 8 +- fs/nfs/dir.c | 4 +- fs/nfs/unlink.c | 4 +- fs/proc/array.c | 4 +- fs/proc/base.c | 3 +- fs/proc/proc_sysctl.c | 2 +- fs/pstore/platform.c | 5 +- include/asm-generic/Kbuild | 2 +- include/asm-generic/hardirq.h | 6 +- include/asm-generic/kmap_size.h | 12 + include/asm-generic/kmap_types.h | 11 - include/asm-generic/preempt.h | 3 + include/linux/blkdev.h | 2 +- include/linux/bottom_half.h | 8 +- include/linux/console.h | 11 + include/linux/cpuhotplug.h | 1 + include/linux/cpumask.h | 6 + include/linux/dcache.h | 4 +- include/linux/debug_locks.h | 3 +- include/linux/delay.h | 6 + include/linux/entry-common.h | 2 +- include/linux/eventfd.h | 11 +- include/linux/fs.h | 2 +- include/linux/hardirq.h | 7 +- include/linux/highmem-internal.h | 222 +++ include/linux/highmem.h | 294 ++- include/linux/interrupt.h | 34 +- include/linux/io-mapping.h | 28 +- include/linux/irq_cpustat.h | 28 - include/linux/irq_work.h | 13 + include/linux/irqdesc.h | 1 + include/linux/irqflags.h | 23 +- include/linux/kernel.h | 26 +- include/linux/kmsg_dump.h | 52 +- include/linux/local_lock_internal.h | 111 +- include/linux/mm_types.h | 4 + include/linux/mutex.h | 34 +- include/linux/mutex_rt.h | 130 ++ include/linux/nfs_xdr.h | 2 +- include/linux/notifier.h | 6 +- include/linux/pid.h | 1 + include/linux/preempt.h | 190 +- include/linux/printk.h | 30 +- include/linux/random.h | 2 +- include/linux/rbtree.h | 27 +- include/linux/rbtree_type.h | 31 + include/linux/rcupdate.h | 10 +- include/linux/rtmutex.h | 46 +- include/linux/rwlock_rt.h | 109 ++ include/linux/rwlock_types.h | 4 + include/linux/rwlock_types_rt.h | 56 + include/linux/rwsem-rt.h | 70 + include/linux/rwsem.h | 12 + include/linux/sched.h | 123 +- include/linux/sched/hotplug.h | 2 + include/linux/sched/mm.h | 11 + include/linux/sched/rt.h | 8 - include/linux/sched/wake_q.h | 13 +- include/linux/serial_8250.h | 5 + include/linux/shmem_fs.h | 2 +- include/linux/signal.h | 1 + include/linux/skbuff.h | 7 + include/linux/smp.h | 3 + include/linux/spinlock.h | 12 +- include/linux/spinlock_api_smp.h | 4 +- include/linux/spinlock_rt.h | 155 ++ include/linux/spinlock_types.h | 92 +- include/linux/spinlock_types_nort.h | 39 + include/linux/spinlock_types_raw.h | 65 + include/linux/spinlock_types_rt.h | 38 + include/linux/spinlock_types_up.h | 2 +- include/linux/stop_machine.h | 5 + include/linux/thread_info.h | 12 +- include/linux/trace_events.h | 65 +- include/linux/u64_stats_sync.h | 42 +- include/linux/vmstat.h | 4 + include/linux/vtime.h | 42 +- include/linux/wait.h | 1 + include/linux/ww_mutex.h | 8 + include/net/gen_stats.h | 11 +- include/net/net_seq_lock.h | 15 + include/net/netns/xfrm.h | 2 +- include/net/sch_generic.h | 27 +- include/trace/events/sched.h | 12 + init/Kconfig | 7 +- kernel/Kconfig.locks | 2 +- kernel/Kconfig.preempt | 7 + kernel/cgroup/cpuset.c | 82 +- kernel/cgroup/rstat.c | 5 +- kernel/cpu.c | 9 +- kernel/debug/kdb/kdb_main.c | 10 +- kernel/entry/common.c | 14 +- kernel/exit.c | 2 +- kernel/fork.c | 28 +- kernel/futex.c | 87 +- kernel/irq/handle.c | 8 +- kernel/irq/manage.c | 12 +- kernel/irq/spurious.c | 8 + kernel/irq_work.c | 136 +- kernel/kexec_core.c | 1 - kernel/ksysfs.c | 12 + kernel/kthread.c | 16 +- kernel/locking/Makefile | 10 +- kernel/locking/lockdep.c | 2 + kernel/locking/mutex-rt.c | 224 +++ kernel/locking/rtmutex-debug.c | 102 - kernel/locking/rtmutex-debug.h | 11 - kernel/locking/rtmutex.c | 939 +++++++-- kernel/locking/rtmutex.h | 7 - kernel/locking/rtmutex_common.h | 36 +- kernel/locking/rwlock-rt.c | 334 ++++ kernel/locking/rwsem-rt.c | 317 ++++ kernel/locking/rwsem.c | 6 + kernel/locking/spinlock.c | 7 + kernel/locking/spinlock_debug.c | 5 + kernel/notifier.c | 12 +- kernel/panic.c | 33 +- kernel/printk/Makefile | 1 - kernel/printk/internal.h | 37 - kernel/printk/printk.c | 1680 +++++++++-------- kernel/printk/printk_safe.c | 425 ----- kernel/ptrace.c | 32 +- kernel/rcu/Kconfig | 4 +- kernel/rcu/tree.c | 4 +- kernel/rcu/update.c | 4 +- kernel/sched/core.c | 1270 ++++++++++--- kernel/sched/cpudeadline.c | 4 +- kernel/sched/cpupri.c | 4 +- kernel/sched/cputime.c | 36 +- kernel/sched/deadline.c | 47 +- kernel/sched/fair.c | 16 +- kernel/sched/features.h | 8 + kernel/sched/rt.c | 81 +- kernel/sched/sched.h | 80 +- kernel/sched/swait.c | 1 + kernel/sched/topology.c | 1 + kernel/signal.c | 105 +- kernel/smp.c | 14 +- kernel/softirq.c | 428 ++++- kernel/stop_machine.c | 27 +- kernel/time/hrtimer.c | 30 + kernel/time/tick-sched.c | 2 +- kernel/time/timer.c | 9 +- kernel/trace/trace.c | 93 +- kernel/trace/trace.h | 19 - kernel/trace/trace_events.c | 2 + kernel/trace/trace_output.c | 19 +- kernel/workqueue.c | 4 + lib/Kconfig.debug | 2 +- lib/bug.c | 1 + lib/cpumask.c | 18 + lib/debugobjects.c | 5 +- lib/dump_stack.c | 2 + lib/irq_poll.c | 5 + lib/locking-selftest.c | 51 + lib/nmi_backtrace.c | 6 - lib/scatterlist.c | 2 +- lib/smp_processor_id.c | 5 + lib/test_lockup.c | 16 + mm/Kconfig | 5 +- mm/highmem.c | 262 ++- mm/memcontrol.c | 66 +- mm/page_alloc.c | 184 +- mm/shmem.c | 31 +- mm/slab.c | 90 +- mm/slab.h | 2 +- mm/slub.c | 148 +- mm/vmalloc.c | 13 +- mm/vmstat.c | 12 + mm/workingset.c | 5 +- mm/z3fold.c | 17 +- mm/zsmalloc.c | 85 +- mm/zswap.c | 1 + net/Kconfig | 2 +- net/core/dev.c | 33 +- net/core/gen_estimator.c | 6 +- net/core/gen_stats.c | 12 +- net/core/sock.c | 6 +- net/ipv4/inet_hashtables.c | 19 +- net/ipv6/inet6_hashtables.c | 5 +- net/sched/sch_api.c | 2 +- net/sched/sch_generic.c | 10 + net/sunrpc/svc_xprt.c | 4 +- net/xfrm/xfrm_state.c | 3 +- 414 files changed, 9029 insertions(+), 4925 deletions(-) delete mode 100644 arch/alpha/include/asm/kmap_types.h delete mode 100644 arch/arc/include/asm/kmap_types.h delete mode 100644 arch/arm/include/asm/kmap_types.h delete mode 100644 arch/arm/mm/highmem.c delete mode 100644 arch/ia64/include/asm/kmap_types.h delete mode 100644 arch/microblaze/mm/highmem.c delete mode 100644 arch/mips/include/asm/kmap_types.h delete mode 100644 arch/nds32/mm/highmem.c delete mode 100644 arch/parisc/include/asm/kmap_types.h delete mode 100644 arch/powerpc/include/asm/kmap_types.h delete mode 100644 arch/powerpc/mm/highmem.c delete mode 100644 arch/sh/include/asm/kmap_types.h delete mode 100644 arch/sparc/include/asm/kmap_types.h delete mode 100644 arch/sparc/mm/highmem.c delete mode 100644 arch/um/include/asm/kmap_types.h delete mode 100644 arch/x86/include/asm/kmap_types.h create mode 100644 include/asm-generic/kmap_size.h delete mode 100644 include/asm-generic/kmap_types.h create mode 100644 include/linux/highmem-internal.h delete mode 100644 include/linux/irq_cpustat.h create mode 100644 include/linux/mutex_rt.h create mode 100644 include/linux/rbtree_type.h create mode 100644 include/linux/rwlock_rt.h create mode 100644 include/linux/rwlock_types_rt.h create mode 100644 include/linux/rwsem-rt.h create mode 100644 include/linux/spinlock_rt.h create mode 100644 include/linux/spinlock_types_nort.h create mode 100644 include/linux/spinlock_types_raw.h create mode 100644 include/linux/spinlock_types_rt.h create mode 100644 include/net/net_seq_lock.h create mode 100644 kernel/locking/mutex-rt.c create mode 100644 kernel/locking/rwlock-rt.c create mode 100644 kernel/locking/rwsem-rt.c @@ -156,3 +156,5 @@ x509.genkey # Documentation toolchain sphinx_*/ + +*.dtbo diff --git a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst index 72f0f6fbd..6f89cf1e5 100644 --- a/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst +++ b/Documentation/RCU/Design/Expedited-Grace-Periods/Expedited-Grace-Periods.rst @@ -38,7 +38,7 @@ sections. RCU-preempt Expedited Grace Periods =================================== -``CONFIG_PREEMPT=y`` kernels implement RCU-preempt. +``CONFIG_PREEMPTION=y`` kernels implement RCU-preempt. The overall flow of the handling of a given CPU by an RCU-preempt expedited grace period is shown in the following diagram: @@ -112,7 +112,7 @@ things. RCU-sched Expedited Grace Periods --------------------------------- -``CONFIG_PREEMPT=n`` kernels implement RCU-sched. The overall flow of +``CONFIG_PREEMPTION=n`` kernels implement RCU-sched. The overall flow of the handling of a given CPU by an RCU-sched expedited grace period is shown in the following diagram: diff --git a/Documentation/RCU/Design/Requirements/Requirements.rst b/Documentation/RCU/Design/Requirements/Requirements.rst index 1ae79a10a..17d38480e 100644 --- a/Documentation/RCU/Design/Requirements/Requirements.rst +++ b/Documentation/RCU/Design/Requirements/Requirements.rst @@ -78,7 +78,7 @@ RCU treats a nested set as one big RCU read-side critical section. Production-quality implementations of ``rcu_read_lock()`` and ``rcu_read_unlock()`` are extremely lightweight, and in fact have exactly zero overhead in Linux kernels built for production use with -``CONFIG_PREEMPT=n``. +``CONFIG_PREEMPTION=n``. This guarantee allows ordering to be enforced with extremely low overhead to readers, for example: @@ -1182,7 +1182,7 @@ and has become decreasingly so as memory sizes have expanded and memory costs have plummeted. However, as I learned from Matt Mackall's `bloatwatch <http://elinux.org/Linux_Tiny-FAQ>`__ efforts, memory footprint is critically important on single-CPU systems with -non-preemptible (``CONFIG_PREEMPT=n``) kernels, and thus `tiny +non-preemptible (``CONFIG_PREEMPTION=n``) kernels, and thus `tiny RCU <https://lkml.kernel.org/g/20090113221724.GA15307@linux.vnet.ibm.com>`__ was born. Josh Triplett has since taken over the small-memory banner with his `Linux kernel tinification <https://tiny.wiki.kernel.org/>`__ @@ -1498,7 +1498,7 @@ limitations. Implementations of RCU for which ``rcu_read_lock()`` and ``rcu_read_unlock()`` generate no code, such as Linux-kernel RCU when -``CONFIG_PREEMPT=n``, can be nested arbitrarily deeply. After all, there +``CONFIG_PREEMPTION=n``, can be nested arbitrarily deeply. After all, there is no overhead. Except that if all these instances of ``rcu_read_lock()`` and ``rcu_read_unlock()`` are visible to the compiler, compilation will eventually fail due to exhausting memory, @@ -1771,7 +1771,7 @@ implementation can be a no-op. However, once the scheduler has spawned its first kthread, this early boot trick fails for ``synchronize_rcu()`` (as well as for -``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPT=y`` kernels. The +``synchronize_rcu_expedited()``) in ``CONFIG_PREEMPTION=y`` kernels. The reason is that an RCU read-side critical section might be preempted, which means that a subsequent ``synchronize_rcu()`` really does have to wait for something, as opposed to simply returning immediately. @@ -2010,7 +2010,7 @@ the following: 5 rcu_read_unlock(); 6 do_something_with(v, user_v); -If the compiler did make this transformation in a ``CONFIG_PREEMPT=n`` kernel +If the compiler did make this transformation in a ``CONFIG_PREEMPTION=n`` kernel build, and if ``get_user()`` did page fault, the result would be a quiescent state in the middle of an RCU read-side critical section. This misplaced quiescent state could result in line 4 being a use-after-free access, @@ -2289,10 +2289,10 @@ decides to throw at it. The Linux kernel is used for real-time workloads, especially in conjunction with the `-rt -patchset <https://rt.wiki.kernel.org/index.php/Main_Page>`__. The +patchset <https://wiki.linuxfoundation.org/realtime/>`__. The real-time-latency response requirements are such that the traditional approach of disabling preemption across RCU read-side critical sections -is inappropriate. Kernels built with ``CONFIG_PREEMPT=y`` therefore use +is inappropriate. Kernels built with ``CONFIG_PREEMPTION=y`` therefore use an RCU implementation that allows RCU read-side critical sections to be preempted. This requirement made its presence known after users made it clear that an earlier `real-time @@ -2414,7 +2414,7 @@ includes ``rcu_read_lock_bh()``, ``rcu_read_unlock_bh()``, ``call_rcu_bh()``, ``rcu_barrier_bh()``, and ``rcu_read_lock_bh_held()``. However, the update-side APIs are now simple wrappers for other RCU flavors, namely RCU-sched in -CONFIG_PREEMPT=n kernels and RCU-preempt otherwise. +CONFIG_PREEMPTION=n kernels and RCU-preempt otherwise. Sched Flavor (Historical) ~~~~~~~~~~~~~~~~~~~~~~~~~ @@ -2432,11 +2432,11 @@ not have this property, given that any point in the code outside of an RCU read-side critical section can be a quiescent state. Therefore, *RCU-sched* was created, which follows “classic” RCU in that an RCU-sched grace period waits for pre-existing interrupt and NMI -handlers. In kernels built with ``CONFIG_PREEMPT=n``, the RCU and +handlers. In kernels built with ``CONFIG_PREEMPTION=n``, the RCU and RCU-sched APIs have identical implementations, while kernels built with -``CONFIG_PREEMPT=y`` provide a separate implementation for each. +``CONFIG_PREEMPTION=y`` provide a separate implementation for each. -Note well that in ``CONFIG_PREEMPT=y`` kernels, +Note well that in ``CONFIG_PREEMPTION=y`` kernels, ``rcu_read_lock_sched()`` and ``rcu_read_unlock_sched()`` disable and re-enable preemption, respectively. This means that if there was a preemption attempt during the RCU-sched read-side critical section, @@ -2599,10 +2599,10 @@ userspace execution also delimit tasks-RCU read-side critical sections. The tasks-RCU API is quite compact, consisting only of ``call_rcu_tasks()``, ``synchronize_rcu_tasks()``, and -``rcu_barrier_tasks()``. In ``CONFIG_PREEMPT=n`` kernels, trampolines +``rcu_barrier_tasks()``. In ``CONFIG_PREEMPTION=n`` kernels, trampolines cannot be preempted, so these APIs map to ``call_rcu()``, ``synchronize_rcu()``, and ``rcu_barrier()``, respectively. In -``CONFIG_PREEMPT=y`` kernels, trampolines can be preempted, and these +``CONFIG_PREEMPTION=y`` kernels, trampolines can be preempted, and these three APIs are therefore implemented by separate functions that check for voluntary context switches. diff --git a/Documentation/RCU/checklist.rst b/Documentation/RCU/checklist.rst index 2efed9926..7ed495604 100644 --- a/Documentation/RCU/checklist.rst +++ b/Documentation/RCU/checklist.rst @@ -214,7 +214,7 @@ over a rather long period of time, but improvements are always welcome! the rest of the system. 7. As of v4.20, a given kernel implements only one RCU flavor, - which is RCU-sched for PREEMPT=n and RCU-preempt for PREEMPT=y. + which is RCU-sched for PREEMPTION=n and RCU-preempt for PREEMPTION=y. If the updater uses call_rcu() or synchronize_rcu(), then the corresponding readers my use rcu_read_lock() and rcu_read_unlock(), rcu_read_lock_bh() and rcu_read_unlock_bh(), diff --git a/Documentation/RCU/rcubarrier.rst b/Documentation/RCU/rcubarrier.rst index f64f4413a..3b4a24877 100644 --- a/Documentation/RCU/rcubarrier.rst +++ b/Documentation/RCU/rcubarrier.rst @@ -9,7 +9,7 @@ RCU (read-copy update) is a synchronization mechanism that can be thought of as a replacement for read-writer locking (among other things), but with very low-overhead readers that are immune to deadlock, priority inversion, and unbounded latency. RCU read-side critical sections are delimited -by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPT +by rcu_read_lock() and rcu_read_unlock(), which, in non-CONFIG_PREEMPTION kernels, generate no code whatsoever. This means that RCU writers are unaware of the presence of concurrent @@ -329,10 +329,10 @@ Answer: This cannot happen. The reason is that on_each_cpu() has its last to smp_call_function() and further to smp_call_function_on_cpu(), causing this latter to spin until the cross-CPU invocation of rcu_barrier_func() has completed. This by itself would prevent - a grace period from completing on non-CONFIG_PREEMPT kernels, + a grace period from completing on non-CONFIG_PREEMPTION kernels, since each CPU must undergo a context switch (or other quiescent state) before the grace period can complete. However, this is - of no use in CONFIG_PREEMPT kernels. + of no use in CONFIG_PREEMPTION kernels. Therefore, on_each_cpu() disables preemption across its call to smp_call_function() and also across the local call to diff --git a/Documentation/RCU/stallwarn.rst b/Documentation/RCU/stallwarn.rst index c9ab6af4d..e97d1b487 100644 --- a/Documentation/RCU/stallwarn.rst +++ b/Documentation/RCU/stallwarn.rst @@ -25,7 +25,7 @@ warnings: - A CPU looping with bottom halves disabled. -- For !CONFIG_PREEMPT kernels, a CPU looping anywhere in the kernel +- For !CONFIG_PREEMPTION kernels, a CPU looping anywhere in the kernel without invoking schedule(). If the looping in the kernel is really expected and desirable behavior, you might need to add some calls to cond_resched(). @@ -44,7 +44,7 @@ warnings: result in the ``rcu_.*kthread starved for`` console-log message, which will include additional debugging information. -- A CPU-bound real-time task in a CONFIG_PREEMPT kernel, which might +- A CPU-bound real-time task in a CONFIG_PREEMPTION kernel, which might happen to preempt a low-priority task in the middle of an RCU read-side critical section. This is especially damaging if that low-priority task is not permitted to run on any other CPU, diff --git a/Documentation/RCU/whatisRCU.rst b/Documentation/RCU/whatisRCU.rst index fb3ff76c3..3b2b1479f 100644 --- a/Documentation/RCU/whatisRCU.rst +++ b/Documentation/RCU/whatisRCU.rst @@ -684,7 +684,7 @@ Quick Quiz #1: ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ This section presents a "toy" RCU implementation that is based on "classic RCU". It is also short on performance (but only for updates) and -on features such as hotplug CPU and the ability to run in CONFIG_PREEMPT +on features such as hotplug CPU and the ability to run in CONFIG_PREEMPTION kernels. The definitions of rcu_dereference() and rcu_assign_pointer() are the same as those shown in the preceding section, so they are omitted. :: @@ -740,7 +740,7 @@ Quick Quiz #2: Quick Quiz #3: If it is illegal to block in an RCU read-side critical section, what the heck do you do in - PREEMPT_RT, where normal spinlocks can block??? + CONFIG_PREEMPT_RT, where normal spinlocks can block??? :ref:`Answers to Quick Quiz <8_whatisRCU>` @@ -1094,7 +1094,7 @@ Quick Quiz #2: overhead is **negative**. Answer: - Imagine a single-CPU system with a non-CONFIG_PREEMPT + Imagine a single-CPU system with a non-CONFIG_PREEMPTION kernel where a routing table is used by process-context code, but can be updated by irq-context code (for example, by an "ICMP REDIRECT" packet). The usual way of handling @@ -1121,10 +1121,10 @@ Answer: Quick Quiz #3: If it is illegal to block in an RCU read-side critical section, what the heck do you do in - PREEMPT_RT, where normal spinlocks can block??? + CONFIG_PREEMPT_RT, where normal spinlocks can block??? Answer: - Just as PREEMPT_RT permits preemption of spinlock + Just as CONFIG_PREEMPT_RT permits preemption of spinlock critical sections, it permits preemption of RCU read-side critical sections. It also permits spinlocks blocking while in RCU read-side critical diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index b5524464f..82f2c77b6 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -4250,6 +4250,10 @@ value, meaning that RCU_SOFTIRQ is used by default. Specify rcutree.use_softirq=0 to use rcuc kthreads. + But note that CONFIG_PREEMPT_RT=y kernels disable + this kernel boot parameter, forcibly setting it + to zero. + rcutree.rcu_fanout_exact= [KNL] Disable autobalancing of the rcu_node combining tree. This is used by rcutorture, and might @@ -4628,6 +4632,13 @@ only normal grace-period primitives. No effect on CONFIG_TINY_RCU kernels. + But note that CONFIG_PREEMPT_RT=y kernels enables + this kernel boot parameter, forcibly setting + it to the value one, that is, converting any + post-boot attempt at an expedited RCU grace + period to instead use normal non-expedited + grace-period processing. + rcupdate.rcu_task_ipi_delay= [KNL] Set time in jiffies during which RCU tasks will avoid sending IPIs, starting with the beginning diff --git a/Documentation/driver-api/io-mapping.rst b/Documentation/driver-api/io-mapping.rst index a966239f0..a7830c594 100644 --- a/Documentation/driver-api/io-mapping.rst +++ b/Documentation/driver-api/io-mapping.rst @@ -20,78 +20,64 @@ A mapping object is created during driver initialization using:: mappable, while 'size' indicates how large a mapping region to enable. Both are in bytes. -This _wc variant provides a mapping which may only be used -with the io_mapping_map_atomic_wc or io_mapping_map_wc. +This _wc variant provides a mapping which may only be used with +io_mapping_map_local_wc() or io_mapping_map_wc(). -With this mapping object, individual pages can be mapped either atomically -or not, depending on the necessary scheduling environment. Of course, atomic -maps are more efficient:: +With this mapping object, individual pages can be mapped either temporarily +or long term, depending on the requirements. Of course, temporary maps are +more efficient. - void *io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset) + void *io_mapping_map_local_wc(struct io_mapping *mapping, + unsigned long offset) -'offset' is the offset within the defined mapping region. -Accessing addresses beyond the region specified in the -creation function yields undefined results. Using an offset -which is not page aligned yields an undefined result. The -return value points to a single page in CPU address space. +'offset' is the offset within the defined mapping region. Accessing +addresses beyond the region specified in the creation function yields +undefined results. Using an offset which is not page aligned yields an +undefined result. The return value points to a single page in CPU address +space. -This _wc variant returns a write-combining map to the -page and may only be used with mappings created by -io_mapping_create_wc +This _wc variant returns a write-combining map to the page and may only be +used with mappings created by io_mapping_create_wc() -Note that the task may not sleep while holding this page -mapped. +Temporary mappings are only valid in the context of the caller. The mapping +is not guaranteed to be globaly visible. -:: +io_mapping_map_local_wc() has a side effect on X86 32bit as it disables +migration to make the mapping code work. No caller can rely on this side +effect. - void io_mapping_unmap_atomic(void *vaddr) +Nested mappings need to be undone in reverse order because the mapping +code uses a stack for keeping track of them:: -'vaddr' must be the value returned by the last -io_mapping_map_atomic_wc call. This unmaps the specified -page and allows the task to sleep once again. + addr1 = io_mapping_map_local_wc(map1, offset1); + addr2 = io_mapping_map_local_wc(map2, offset2); + ... + io_mapping_unmap_local(addr2); + io_mapping_unmap_local(addr1); -If you need to sleep while holding the lock, you can use the non-atomic -variant, although they may be significantly slower. +The mappings are released with:: -:: + void io_mapping_unmap_local(void *vaddr) + +'vaddr' must be the value returned by the last io_mapping_map_local_wc() +call. This unmaps the specified mapping and undoes eventual side effects of +the mapping function. + +If you need to sleep while holding a mapping, you can use the regular +variant, although this may be significantly slower:: void *io_mapping_map_wc(struct io_mapping *mapping, unsigned long offset) -This works like io_mapping_map_atomic_wc except it allows -the task to sleep while holding the page mapped. +This works like io_mapping_map_local_wc() except it has no side effects and +the pointer is globaly visible. - -:: +The mappings are released with:: void io_mapping_unmap(void *vaddr) -This works like io_mapping_unmap_atomic, except it is used -for pages mapped with io_mapping_map_wc. +Use for pages mapped with io_mapping_map_wc(). At driver close time, the io_mapping object must be freed:: void io_mapping_free(struct io_mapping *mapping) - -Current Implementation -====================== - -The initial implementation of these functions uses existing mapping -mechanisms and so provides only an abstraction layer and no new -functionality. - -On 64-bit processors, io_mapping_create_wc calls ioremap_wc for the whole -range, creating a permanent kernel-visible mapping to the resource. The -map_atomic and map functions add the requested offset to the base of the -virtual address returned by ioremap_wc. - -On 32-bit processors with HIGHMEM defined, io_mapping_map_atomic_wc uses -kmap_atomic_pfn to map the specified page in an atomic fashion; -kmap_atomic_pfn isn't really supposed to be used with device pages, but it -provides an efficient mapping for this usage. - -On 32-bit processors without HIGHMEM defined, io_mapping_map_atomic_wc and -io_mapping_map_wc both use ioremap_wc, a terribly inefficient function which -performs an IPI to inform all processors about the new mapping. This results -in a significant performance penalty. diff --git a/arch/Kconfig b/arch/Kconfig index 7a8e3d45b..4dbc4c659 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -50,6 +50,7 @@ config OPROFILE tristate "OProfile system profiling" depends on PROFILING depends on HAVE_OPROFILE + depends on !PREEMPT_RT select RING_BUFFER select RING_BUFFER_ALLOW_SWAP help @@ -673,6 +674,12 @@ config HAVE_TIF_NOHZ config HAVE_VIRT_CPU_ACCOUNTING bool +config HAVE_VIRT_CPU_ACCOUNTING_IDLE + bool + help + Architecture has its own way to account idle CPU time and therefore + doesn't implement vtime_account_idle(). + config ARCH_HAS_SCALED_CPUTIME bool @@ -687,7 +694,6 @@ config HAVE_VIRT_CPU_ACCOUNTING_GEN some 32-bit arches may require multiple accesses, so proper locking is needed to protect against concurrent accesses. - config HAVE_IRQ_TIME_ACCOUNTING bool help diff --git a/arch/alpha/include/asm/kmap_types.h b/arch/alpha/include/asm/kmap_types.h deleted file mode 100644 index 651714b45..000000000 --- a/arch/alpha/include/asm/kmap_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* Dummy header just to define km_type. */ - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/alpha/include/asm/spinlock_types.h b/arch/alpha/include/asm/spinlock_types.h index 1d5716bc0..6883bc952 100644 --- a/arch/alpha/include/asm/spinlock_types.h +++ b/arch/alpha/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef _ALPHA_SPINLOCK_TYPES_H #define _ALPHA_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 0a89cc9de..d8804001d 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -507,6 +507,7 @@ config LINUX_RAM_BASE config HIGHMEM bool "High Memory Support" select ARCH_DISCONTIGMEM_ENABLE + select KMAP_LOCAL help With ARC 2G:2G address split, only upper 2G is directly addressable by kernel. Enable this to potentially allow access to rest of 2G and PAE diff --git a/arch/arc/include/asm/highmem.h b/arch/arc/include/asm/highmem.h index 6e5eafb3a..a6b8e2c35 100644 --- a/arch/arc/include/asm/highmem.h +++ b/arch/arc/include/asm/highmem.h @@ -9,17 +9,29 @@ #ifdef CONFIG_HIGHMEM #include <uapi/asm/page.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> + +#define FIXMAP_SIZE PGDIR_SIZE +#define PKMAP_SIZE PGDIR_SIZE /* start after vmalloc area */ #define FIXMAP_BASE (PAGE_OFFSET - FIXMAP_SIZE - PKMAP_SIZE) -#define FIXMAP_SIZE PGDIR_SIZE /* only 1 PGD worth */ -#define KM_TYPE_NR ((FIXMAP_SIZE >> PAGE_SHIFT)/NR_CPUS) -#define FIXMAP_ADDR(nr) (FIXMAP_BASE + ((nr) << PAGE_SHIFT)) + +#define FIX_KMAP_SLOTS (KM_MAX_IDX * NR_CPUS) +#define FIX_KMAP_BEGIN (0UL) +#define FIX_KMAP_END ((FIX_KMAP_BEGIN + FIX_KMAP_SLOTS) - 1) + +#define FIXADDR_TOP (FIXMAP_BASE + (FIX_KMAP_END << PAGE_SHIFT)) + +/* + * This should be converted to the asm-generic version, but of course this + * is needlessly different from all other architectures. Sigh - tglx + */ +#define __fix_to_virt(x) (FIXADDR_TOP - ((x) << PAGE_SHIFT)) +#define __virt_to_fix(x) (((FIXADDR_TOP - ((x) & PAGE_MASK))) >> PAGE_SHIFT) /* start after fixmap area */ #define PKMAP_BASE (FIXMAP_BASE + FIXMAP_SIZE) -#define PKMAP_SIZE PGDIR_SIZE #define LAST_PKMAP (PKMAP_SIZE >> PAGE_SHIFT) #define LAST_PKMAP_MASK (LAST_PKMAP - 1) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) @@ -29,11 +41,13 @@ extern void kmap_init(void); +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) + static inline void flush_cache_kmaps(void) { flush_cache_all(); } - #endif #endif diff --git a/arch/arc/include/asm/kmap_types.h b/arch/arc/include/asm/kmap_types.h deleted file mode 100644 index fecf7851e..000000000 --- a/arch/arc/include/asm/kmap_types.h +++ /dev/null @@ -1,14 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2015 Synopsys, Inc. (www.synopsys.com) - */ - -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* - * We primarily need to define KM_TYPE_NR here but that in turn - * is a function of PGDIR_SIZE etc. - * To avoid circular deps issue, put everything in asm/highmem.h - */ -#endif diff --git a/arch/arc/mm/highmem.c b/arch/arc/mm/highmem.c index 1b9f473c6..c79912a6b 100644 --- a/arch/arc/mm/highmem.c +++ b/arch/arc/mm/highmem.c @@ -36,9 +36,8 @@ * This means each only has 1 PGDIR_SIZE worth of kvaddr mappings, which means * 2M of kvaddr space for typical config (8K page and 11:8:13 traversal split) * - * - fixmap anyhow needs a limited number of mappings. So 2M kvaddr == 256 PTE - * slots across NR_CPUS would be more than sufficient (generic code defines - * KM_TYPE_NR as 20). + * - The fixed KMAP slots for kmap_local/atomic() require KM_MAX_IDX slots per + * CPU. So the number of CPUs sharing a single PTE page is limited. * * - pkmap being preemptible, in theory could do with more than 256 concurrent * mappings. However, generic pkmap code: map_new_virtual(), doesn't traverse @@ -47,48 +46,6 @@ */ extern pte_t * pkmap_page_table; -static pte_t * fixmap_page_table; - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - int idx, cpu_idx; - unsigned long vaddr; - - cpu_idx = kmap_atomic_idx_push(); - idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); - vaddr = FIXMAP_ADDR(idx); - - set_pte_at(&init_mm, vaddr, fixmap_page_table + idx, - mk_pte(page, prot)); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kv) -{ - unsigned long kvaddr = (unsigned long)kv; - - if (kvaddr >= FIXMAP_BASE && kvaddr < (FIXMAP_BASE + FIXMAP_SIZE)) { - - /* - * Because preemption is disabled, this vaddr can be associated - * with the current allocated index. - * But in case of multiple live kmap_atomic(), it still relies on - * callers to unmap in right order. - */ - int cpu_idx = kmap_atomic_idx(); - int idx = cpu_idx + KM_TYPE_NR * smp_processor_id(); - - WARN_ON(kvaddr != FIXMAP_ADDR(idx)); - - pte_clear(&init_mm, kvaddr, fixmap_page_table + idx); - local_flush_tlb_kernel_range(kvaddr, kvaddr + PAGE_SIZE); - - kmap_atomic_idx_pop(); - } -} -EXPORT_SYMBOL(kunmap_atomic_high); static noinline pte_t * __init alloc_kmap_pgtable(unsigned long kvaddr) { @@ -108,10 +65,9 @@ void __init kmap_init(void) { /* Due to recursive include hell, we can't do this in processor.h */ BUILD_BUG_ON(PAGE_OFFSET < (VMALLOC_END + FIXMAP_SIZE + PKMAP_SIZE)); + BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); + BUILD_BUG_ON(FIX_KMAP_SLOTS > PTRS_PER_PTE); - BUILD_BUG_ON(KM_TYPE_NR > PTRS_PER_PTE); pkmap_page_table = alloc_kmap_pgtable(PKMAP_BASE); - - BUILD_BUG_ON(LAST_PKMAP > PTRS_PER_PTE); - fixmap_page_table = alloc_kmap_pgtable(FIXMAP_BASE); + alloc_kmap_pgtable(FIXMAP_BASE); } diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 29a634b7d..77c96bea6 100644 --- a/arch/arm/Kconfig +++ b/arch/arm/Kconfig @@ -31,6 +31,7 @@ config ARM select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_OPTIONAL_KERNEL_RWX_DEFAULT if CPU_V7 select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT if MMU @@ -66,7 +67,7 @@ config ARM select HARDIRQS_SW_RESEND select HAVE_ARCH_AUDITSYSCALL if AEABI && !OABI_COMPAT select HAVE_ARCH_BITREVERSE if (CPU_32v7M || CPU_32v7) && !CPU_32v6 - select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU + select HAVE_ARCH_JUMP_LABEL if !XIP_KERNEL && !CPU_ENDIAN_BE32 && MMU && !PREEMPT_RT select HAVE_ARCH_KFENCE if MMU select HAVE_ARCH_KGDB if !CPU_ENDIAN_BE32 && MMU select HAVE_ARCH_KASAN if MMU && !XIP_KERNEL @@ -108,6 +109,7 @@ config ARM select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if SMP && ARM_LPAE select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_RSEQ @@ -123,6 +125,7 @@ config ARM select OLD_SIGSUSPEND3 select PCI_SYSCALL if PCI select PERF_USE_VMALLOC + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select RTC_LIB select SET_FS select SYS_SUPPORTS_APM_EMULATION @@ -1512,6 +1515,7 @@ config HAVE_ARCH_PFN_VALID config HIGHMEM bool "High Memory Support" depends on MMU + select KMAP_LOCAL help The address space of ARM processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/arm/include/asm/fixmap.h b/arch/arm/include/asm/fixmap.h index 9575b4040..707068f85 100644 --- a/arch/arm/include/asm/fixmap.h +++ b/arch/arm/include/asm/fixmap.h @@ -7,14 +7,14 @@ #define FIXADDR_TOP (FIXADDR_END - PAGE_SIZE) #include <linux/pgtable.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> enum fixed_addresses { FIX_EARLYCON_MEM_BASE, __end_of_permanent_fixed_addresses, FIX_KMAP_BEGIN = __end_of_permanent_fixed_addresses, - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, /* Support writing RO kernel text via kprobes, jump labels, etc. */ FIX_TEXT_POKE0, diff --git a/arch/arm/include/asm/hardirq.h b/arch/arm/include/asm/hardirq.h index b95848ed2..706efafbf 100644 --- a/arch/arm/include/asm/hardirq.h +++ b/arch/arm/include/asm/hardirq.h @@ -2,16 +2,11 @@ #ifndef __ASM_HARDIRQ_H #define __ASM_HARDIRQ_H -#include <linux/cache.h> -#include <linux/threads.h> #include <asm/irq.h> -typedef struct { - unsigned int __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ - #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 +#define ack_bad_irq ack_bad_irq + +#include <asm-generic/hardirq.h> #endif /* __ASM_HARDIRQ_H */ diff --git a/arch/arm/include/asm/highmem.h b/arch/arm/include/asm/highmem.h index 31811be38..b22dffa8c 100644 --- a/arch/arm/include/asm/highmem.h +++ b/arch/arm/include/asm/highmem.h @@ -2,7 +2,8 @@ #ifndef _ASM_HIGHMEM_H #define _ASM_HIGHMEM_H -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> +#include <asm/fixmap.h> #define PKMAP_BASE (PAGE_OFFSET - PMD_SIZE) #define LAST_PKMAP PTRS_PER_PTE @@ -46,19 +47,32 @@ extern pte_t *pkmap_page_table; #ifdef ARCH_NEEDS_KMAP_HIGH_GET extern void *kmap_high_get(struct page *page); -#else + +static inline void *arch_kmap_local_high_get(struct page *page) +{ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !cache_is_vivt()) + return NULL; + return kmap_high_get(page); +} +#define arch_kmap_local_high_get arch_kmap_local_high_get + +#else /* ARCH_NEEDS_KMAP_HIGH_GET */ static inline void *kmap_high_get(struct page *page) { return NULL; } -#endif +#endif /* !ARCH_NEEDS_KMAP_HIGH_GET */ -/* - * The following functions are already defined by <linux/highmem.h> - * when CONFIG_HIGHMEM is not set. - */ -#ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_pfn(unsigned long pfn); -#endif +#define arch_kmap_local_post_map(vaddr, pteval) \ + local_flush_tlb_kernel_page(vaddr) + +#define arch_kmap_local_pre_unmap(vaddr) \ +do { \ + if (cache_is_vivt()) \ + __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); \ +} while (0) + +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_kernel_page(vaddr) #endif diff --git a/arch/arm/include/asm/irq.h b/arch/arm/include/asm/irq.h index 54b0180c8..36d040c68 100644 --- a/arch/arm/include/asm/irq.h +++ b/arch/arm/include/asm/irq.h @@ -31,6 +31,8 @@ void handle_IRQ(unsigned int, struct pt_regs *); void init_IRQ(void); #ifdef CONFIG_SMP +#include <linux/cpumask.h> + extern bool arch_trigger_cpumask_backtrace(const cpumask_t *mask, bool exclude_self); #define arch_trigger_cpumask_backtrace arch_trigger_cpumask_backtrace diff --git a/arch/arm/include/asm/kmap_types.h b/arch/arm/include/asm/kmap_types.h deleted file mode 100644 index 5590940ee..000000000 --- a/arch/arm/include/asm/kmap_types.h +++ /dev/null @@ -1,10 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __ARM_KMAP_TYPES_H -#define __ARM_KMAP_TYPES_H - -/* - * This is the "bare minimum". AIO seems to require this. - */ -#define KM_TYPE_NR 16 - -#endif diff --git a/arch/arm/include/asm/spinlock_types.h b/arch/arm/include/asm/spinlock_types.h index 597695864..a37c08039 100644 --- a/arch/arm/include/asm/spinlock_types.h +++ b/arch/arm/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #define TICKET_SHIFT 16 typedef struct { diff --git a/arch/arm/include/asm/thread_info.h b/arch/arm/include/asm/thread_info.h index 9f7ca79cc..0bf67b7f0 100644 --- a/arch/arm/include/asm/thread_info.h +++ b/arch/arm/include/asm/thread_info.h @@ -55,6 +55,7 @@ struct cpu_context_save { struct thread_info { unsigned long flags; /* low level flags */ int preempt_count; /* 0 => preemptable, <0 => bug */ + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ mm_segment_t addr_limit; /* address limit */ struct task_struct *task; /* main task structure */ __u32 cpu; /* cpu */ @@ -145,8 +146,9 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define TIF_SYSCALL_TRACE 4 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 5 /* syscall auditing active */ #define TIF_SYSCALL_TRACEPOINT 6 /* syscall tracepoint instrumentation */ -#define TIF_SECCOMP 7 /* seccomp syscall filtering active */ -#define TIF_PATCH_PENDING 8 /* pending live patching update */ +#define TIF_NEED_RESCHED_LAZY 7 +#define TIF_SECCOMP 8 /* seccomp syscall filtering active */ +#define TIF_PATCH_PENDING 9 /* pending live patching update */ #define TIF_USING_IWMMXT 17 #define TIF_MEMDIE 18 /* is terminating due to OOM killer */ @@ -155,6 +157,7 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_UPROBE (1 << TIF_UPROBE) #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_SYSCALL_AUDIT (1 << TIF_SYSCALL_AUDIT) @@ -171,7 +174,8 @@ extern int vfp_restore_user_hwstate(struct user_vfp *, * Change these and you break ASM code in entry-common.S */ #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ - _TIF_NOTIFY_RESUME | _TIF_UPROBE) + _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ + _TIF_NEED_RESCHED_LAZY) #endif /* __KERNEL__ */ #endif /* __ASM_ARM_THREAD_INFO_H */ diff --git a/arch/arm/kernel/asm-offsets.c b/arch/arm/kernel/asm-offsets.c index 70993af22..024c65c3a 100644 --- a/arch/arm/kernel/asm-offsets.c +++ b/arch/arm/kernel/asm-offsets.c @@ -43,6 +43,7 @@ int main(void) BLANK(); DEFINE(TI_FLAGS, offsetof(struct thread_info, flags)); DEFINE(TI_PREEMPT, offsetof(struct thread_info, preempt_count)); + DEFINE(TI_PREEMPT_LAZY, offsetof(struct thread_info, preempt_lazy_count)); DEFINE(TI_ADDR_LIMIT, offsetof(struct thread_info, addr_limit)); DEFINE(TI_TASK, offsetof(struct thread_info, task)); DEFINE(TI_CPU, offsetof(struct thread_info, cpu)); diff --git a/arch/arm/kernel/entry-armv.S b/arch/arm/kernel/entry-armv.S index 4c43295e1..ce2482591 100644 --- a/arch/arm/kernel/entry-armv.S +++ b/arch/arm/kernel/entry-armv.S @@ -207,11 +207,18 @@ __irq_svc: #ifdef CONFIG_PREEMPTION ldr r8, [tsk, #TI_PREEMPT] @ get preempt count - ldr r0, [tsk, #TI_FLAGS] @ get flags teq r8, #0 @ if preempt count != 0 + bne 1f @ return from exeption + ldr r0, [tsk, #TI_FLAGS] @ get flags + tst r0, #_TIF_NEED_RESCHED @ if NEED_RESCHED is set + blne svc_preempt @ preempt! + + ldr r8, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r8, #0 @ if preempt lazy count != 0 movne r0, #0 @ force flags to 0 - tst r0, #_TIF_NEED_RESCHED + tst r0, #_TIF_NEED_RESCHED_LAZY blne svc_preempt +1: #endif svc_exit r5, irq = 1 @ return from exception @@ -226,8 +233,14 @@ svc_preempt: 1: bl preempt_schedule_irq @ irq en/disable is done inside ldr r0, [tsk, #TI_FLAGS] @ get new tasks TI_FLAGS tst r0, #_TIF_NEED_RESCHED + bne 1b + tst r0, #_TIF_NEED_RESCHED_LAZY reteq r8 @ go again - b 1b + ldr r0, [tsk, #TI_PREEMPT_LAZY] @ get preempt lazy count + teq r0, #0 @ if preempt lazy count != 0 + beq 1b + ret r8 @ go again + #endif __und_fault: diff --git a/arch/arm/kernel/entry-common.S b/arch/arm/kernel/entry-common.S index fee279e28..0f5566da6 100644 --- a/arch/arm/kernel/entry-common.S +++ b/arch/arm/kernel/entry-common.S @@ -54,7 +54,9 @@ __ret_fast_syscall: cmp r2, r1 blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) + bne fast_work_pending + tst r1, #_TIF_SECCOMP bne fast_work_pending @@ -92,8 +94,11 @@ __ret_fast_syscall: cmp r2, r1 blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing - tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK + tst r1, #((_TIF_SYSCALL_WORK | _TIF_WORK_MASK) & ~_TIF_SECCOMP) + bne do_slower_path + tst r1, #_TIF_SECCOMP beq no_work_pending +do_slower_path: UNWIND(.fnend ) ENDPROC(ret_fast_syscall) diff --git a/arch/arm/kernel/signal.c b/arch/arm/kernel/signal.c index 2f81d3af5..6e69f7b3d 100644 --- a/arch/arm/kernel/signal.c +++ b/arch/arm/kernel/signal.c @@ -649,7 +649,8 @@ do_work_pending(struct pt_regs *regs, unsigned int thread_flags, int syscall) */ trace_hardirqs_off(); do { - if (likely(thread_flags & _TIF_NEED_RESCHED)) { + if (likely(thread_flags & (_TIF_NEED_RESCHED | + _TIF_NEED_RESCHED_LAZY))) { schedule(); } else { if (unlikely(!user_mode(regs))) diff --git a/arch/arm/kernel/smp.c b/arch/arm/kernel/smp.c index d94e39a21..44383bfbf 100644 --- a/arch/arm/kernel/smp.c +++ b/arch/arm/kernel/smp.c @@ -671,9 +671,7 @@ static void do_handle_IPI(int ipinr) break; case IPI_CPU_BACKTRACE: - printk_nmi_enter(); nmi_cpu_backtrace(get_irq_regs()); - printk_nmi_exit(); break; default: diff --git a/arch/arm/mm/Makefile b/arch/arm/mm/Makefile index 4536159bc..3510503bc 100644 --- a/arch/arm/mm/Makefile +++ b/arch/arm/mm/Makefile @@ -21,7 +21,6 @@ KASAN_SANITIZE_physaddr.o := n obj-$(CONFIG_DEBUG_VIRTUAL) += physaddr.o obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o -obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_ARM_PV_FIXUP) += pv-fixup-asm.o diff --git a/arch/arm/mm/cache-feroceon-l2.c b/arch/arm/mm/cache-feroceon-l2.c index 10f909744..fd9e3e740 100644 --- a/arch/arm/mm/cache-feroceon-l2.c +++ b/arch/arm/mm/cache-feroceon-l2.c @@ -49,9 +49,9 @@ static inline unsigned long l2_get_va(unsigned long paddr) * we simply install a virtual mapping for it only for the * TLB lookup to occur, hence no need to flush the untouched * memory mapping afterwards (note: a cache flush may happen - * in some circumstances depending on the path taken in kunmap_atomic). + * in some circumstances depending on the path taken in kunmap_local). */ - void *vaddr = kmap_atomic_pfn(paddr >> PAGE_SHIFT); + void *vaddr = kmap_local_pfn(paddr >> PAGE_SHIFT); return (unsigned long)vaddr + (paddr & ~PAGE_MASK); #else return __phys_to_virt(paddr); @@ -61,7 +61,7 @@ static inline unsigned long l2_get_va(unsigned long paddr) static inline void l2_put_va(unsigned long vaddr) { #ifdef CONFIG_HIGHMEM - kunmap_atomic((void *)vaddr); + kunmap_local((void *)vaddr); #endif } diff --git a/arch/arm/mm/cache-xsc3l2.c b/arch/arm/mm/cache-xsc3l2.c index 581473165..f34845da3 100644 --- a/arch/arm/mm/cache-xsc3l2.c +++ b/arch/arm/mm/cache-xsc3l2.c @@ -59,7 +59,7 @@ static inline void l2_unmap_va(unsigned long va) { #ifdef CONFIG_HIGHMEM if (va != -1) - kunmap_atomic((void *)va); + kunmap_local((void *)va); #endif } @@ -75,7 +75,7 @@ static inline unsigned long l2_map_va(unsigned long pa, unsigned long prev_va) * in place for it. */ l2_unmap_va(prev_va); - va = (unsigned long)kmap_atomic_pfn(pa >> PAGE_SHIFT); + va = (unsigned long)kmap_local_pfn(pa >> PAGE_SHIFT); } return va + (pa_offset >> (32 - PAGE_SHIFT)); #else diff --git a/arch/arm/mm/fault.c b/arch/arm/mm/fault.c index 91965fb04..d34166682 100644 --- a/arch/arm/mm/fault.c +++ b/arch/arm/mm/fault.c @@ -430,6 +430,9 @@ do_translation_fault(unsigned long addr, unsigned int fsr, if (addr < TASK_SIZE) return do_page_fault(addr, fsr, regs); + if (interrupts_enabled(regs)) + local_irq_enable(); + if (user_mode(regs)) goto bad_area; diff --git a/arch/arm/mm/highmem.c b/arch/arm/mm/highmem.c deleted file mode 100644 index 187fab227..000000000 --- a/arch/arm/mm/highmem.c +++ /dev/null @@ -1,121 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * arch/arm/mm/highmem.c -- ARM highmem support - * - * Author: Nicolas Pitre - * Created: september 8, 2008 - * Copyright: Marvell Semiconductors Inc. - */ - -#include <linux/module.h> -#include <linux/highmem.h> -#include <linux/interrupt.h> -#include <asm/fixmap.h> -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include "mm.h" - -static inline void set_fixmap_pte(int idx, pte_t pte) -{ - unsigned long vaddr = __fix_to_virt(idx); - pte_t *ptep = virt_to_kpte(vaddr); - - set_pte_ext(ptep, pte, 0); - local_flush_tlb_kernel_page(vaddr); -} - -static inline pte_t get_fixmap_pte(unsigned long vaddr) -{ - pte_t *ptep = virt_to_kpte(vaddr); - - return *ptep; -} - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned int idx; - unsigned long vaddr; - void *kmap; - int type; - -#ifdef CONFIG_DEBUG_HIGHMEM - /* - * There is no cache coherency issue when non VIVT, so force the - * dedicated kmap usage for better debugging purposes in that case. - */ - if (!cache_is_vivt()) - kmap = NULL; - else -#endif - kmap = kmap_high_get(page); - if (kmap) - return kmap; - - type = kmap_atomic_idx_push(); - - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(idx); -#ifdef CONFIG_DEBUG_HIGHMEM - /* - * With debugging enabled, kunmap_atomic forces that entry to 0. - * Make sure it was indeed properly unmapped. - */ - BUG_ON(!pte_none(get_fixmap_pte(vaddr))); -#endif - /* - * When debugging is off, kunmap_atomic leaves the previous mapping - * in place, so the contained TLB flush ensures the TLB is updated - * with the new mapping. - */ - set_fixmap_pte(idx, mk_pte(page, prot)); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int idx, type; - - if (kvaddr >= (void *)FIXADDR_START) { - type = kmap_atomic_idx(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); - - if (cache_is_vivt()) - __cpuc_flush_dcache_area((void *)vaddr, PAGE_SIZE); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr != __fix_to_virt(idx)); - set_fixmap_pte(idx, __pte(0)); -#else - (void) idx; /* to kill a warning */ -#endif - kmap_atomic_idx_pop(); - } else if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { - /* this address was obtained through kmap_high_get() */ - kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); - } -} -EXPORT_SYMBOL(kunmap_atomic_high); - -void *kmap_atomic_pfn(unsigned long pfn) -{ - unsigned long vaddr; - int idx, type; - struct page *page = pfn_to_page(pfn); - - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - - type = kmap_atomic_idx_push(); - idx = FIX_KMAP_BEGIN + type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(get_fixmap_pte(vaddr))); -#endif - set_fixmap_pte(idx, pfn_pte(pfn, kmap_prot)); - - return (void *)vaddr; -} diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index 259e4a183..85f711f2a 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -76,6 +76,7 @@ config ARM64 select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_INT128 if CC_HAS_INT128 && (GCC_VERSION >= 50000 || CC_IS_CLANG) select ARCH_SUPPORTS_NUMA_BALANCING + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_WANT_COMPAT_IPC_PARSE_VERSION if COMPAT select ARCH_WANT_DEFAULT_BPF_JIT select ARCH_WANT_DEFAULT_TOPDOWN_MMAP_LAYOUT @@ -177,6 +178,7 @@ config ARM64 select HAVE_PERF_EVENTS select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select HAVE_REGS_AND_STACK_ACCESS_API select HAVE_FUNCTION_ARG_ACCESS_API select HAVE_FUTEX_CMPXCHG if FUTEX @@ -199,6 +201,7 @@ config ARM64 select PCI_DOMAINS_GENERIC if PCI select PCI_ECAM if (ACPI && PCI) select PCI_SYSCALL if PCI + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select POWER_RESET select POWER_SUPPLY select SPARSE_IRQ diff --git a/arch/arm64/include/asm/hardirq.h b/arch/arm64/include/asm/hardirq.h index 5ffa4bacd..cbfa7b6f2 100644 --- a/arch/arm64/include/asm/hardirq.h +++ b/arch/arm64/include/asm/hardirq.h @@ -13,11 +13,8 @@ #include <asm/kvm_arm.h> #include <asm/sysreg.h> -typedef struct { - unsigned int __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +#define ack_bad_irq ack_bad_irq +#include <asm-generic/hardirq.h> #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 diff --git a/arch/arm64/include/asm/preempt.h b/arch/arm64/include/asm/preempt.h index e83f0982b..7a5770d82 100644 --- a/arch/arm64/include/asm/preempt.h +++ b/arch/arm64/include/asm/preempt.h @@ -70,17 +70,43 @@ static inline bool __preempt_count_dec_and_test(void) * interrupt occurring between the non-atomic READ_ONCE/WRITE_ONCE * pair. */ - return !pc || !READ_ONCE(ti->preempt_count); + if (!pc || !READ_ONCE(ti->preempt_count)) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if ((pc & ~PREEMPT_NEED_RESCHED)) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif } static inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u64 pc = READ_ONCE(current_thread_info()->preempt_count); + if (pc == preempt_offset) + return true; + + if ((pc & ~PREEMPT_NEED_RESCHED) != preempt_offset) + return false; + + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else u64 pc = READ_ONCE(current_thread_info()->preempt_count); return pc == preempt_offset; +#endif } #ifdef CONFIG_PREEMPTION void preempt_schedule(void); +#ifdef CONFIG_PREEMPT_RT +void preempt_schedule_lock(void); +#endif #define __preempt_schedule() preempt_schedule() void preempt_schedule_notrace(void); #define __preempt_schedule_notrace() preempt_schedule_notrace() diff --git a/arch/arm64/include/asm/spinlock_types.h b/arch/arm64/include/asm/spinlock_types.h index 18782f0c4..6672b0535 100644 --- a/arch/arm64/include/asm/spinlock_types.h +++ b/arch/arm64/include/asm/spinlock_types.h @@ -5,10 +5,6 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) -# error "please don't include this file directly" -#endif - #include <asm-generic/qspinlock_types.h> #include <asm-generic/qrwlock_types.h> diff --git a/arch/arm64/include/asm/thread_info.h b/arch/arm64/include/asm/thread_info.h index af49b6190..f8ba7a6ec 100644 --- a/arch/arm64/include/asm/thread_info.h +++ b/arch/arm64/include/asm/thread_info.h @@ -27,6 +27,7 @@ struct thread_info { #ifdef CONFIG_ARM64_SW_TTBR0_PAN u64 ttbr0; /* saved TTBR0_EL1 */ #endif + int preempt_lazy_count; /* 0 => preemptable, <0 => bug */ union { u64 preempt_count; /* 0 => preemptible, <0 => bug */ struct { @@ -69,6 +70,7 @@ void arch_release_task_struct(struct task_struct *tsk); #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ #define TIF_MTE_ASYNC_FAULT 5 /* MTE Asynchronous Tag Check Fault */ +#define TIF_NEED_RESCHED_LAZY 6 #define TIF_SYSCALL_TRACE 8 /* syscall trace active */ #define TIF_SYSCALL_AUDIT 9 /* syscall auditing */ #define TIF_SYSCALL_TRACEPOINT 10 /* syscall tracepoint for ftrace */ @@ -101,14 +103,17 @@ void arch_release_task_struct(struct task_struct *tsk); #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_SVE (1 << TIF_SVE) #define _TIF_MTE_ASYNC_FAULT (1 << TIF_MTE_ASYNC_FAULT) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_32BIT_AARCH64 (1 << TIF_32BIT_AARCH64) #define _TIF_PATCH_PENDING (1 << TIF_PATCH_PENDING) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT) + _TIF_UPROBE | _TIF_MTE_ASYNC_FAULT | \ + _TIF_NEED_RESCHED_LAZY) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ _TIF_SYSCALL_EMU) diff --git a/arch/arm64/kernel/asm-offsets.c b/arch/arm64/kernel/asm-offsets.c index 5f59e24c9..d8d41b5f9 100644 --- a/arch/arm64/kernel/asm-offsets.c +++ b/arch/arm64/kernel/asm-offsets.c @@ -31,6 +31,7 @@ int main(void) DEFINE(TSK_TI_CPU, offsetof(struct task_struct, thread_info.cpu)); DEFINE(TSK_TI_FLAGS, offsetof(struct task_struct, thread_info.flags)); DEFINE(TSK_TI_PREEMPT, offsetof(struct task_struct, thread_info.preempt_count)); + DEFINE(TSK_TI_PREEMPT_LAZY, offsetof(struct task_struct, thread_info.preempt_lazy_count)); #ifdef CONFIG_ARM64_SW_TTBR0_PAN DEFINE(TSK_TI_TTBR0, offsetof(struct task_struct, thread_info.ttbr0)); #endif diff --git a/arch/arm64/kernel/entry.S b/arch/arm64/kernel/entry.S index 21bcf1e79..aa6593f0b 100644 --- a/arch/arm64/kernel/entry.S +++ b/arch/arm64/kernel/entry.S @@ -503,9 +503,18 @@ alternative_if ARM64_HAS_IRQ_PRIO_MASKING mrs x0, daif orr x24, x24, x0 alternative_else_nop_endif - cbnz x24, 1f // preempt count != 0 || NMI return path - bl arm64_preempt_schedule_irq // irq en/disable is done inside + + cbz x24, 1f // (need_resched + count) == 0 + cbnz w24, 2f // count != 0 + + ldr w24, [tsk, #TSK_TI_PREEMPT_LAZY] // get preempt lazy count + cbnz w24, 2f // preempt lazy count != 0 + + ldr x0, [tsk, #TSK_TI_FLAGS] // get flags + tbz x0, #TIF_NEED_RESCHED_LAZY, 2f // needs rescheduling? 1: + bl arm64_preempt_schedule_irq // irq en/disable is done inside +2: #endif mov x0, sp diff --git a/arch/arm64/kernel/fpsimd.c b/arch/arm64/kernel/fpsimd.c index 5335a6bd1..aa631771e 100644 --- a/arch/arm64/kernel/fpsimd.c +++ b/arch/arm64/kernel/fpsimd.c @@ -180,7 +180,7 @@ static void __get_cpu_fpsimd_context(void) */ static void get_cpu_fpsimd_context(void) { - local_bh_disable(); + preempt_disable(); __get_cpu_fpsimd_context(); } @@ -201,7 +201,7 @@ static void __put_cpu_fpsimd_context(void) static void put_cpu_fpsimd_context(void) { __put_cpu_fpsimd_context(); - local_bh_enable(); + preempt_enable(); } static bool have_cpu_fpsimd_context(void) @@ -226,6 +226,16 @@ static void sve_free(struct task_struct *task) __sve_free(task); } +static void *sve_free_atomic(struct task_struct *task) +{ + void *sve_state = task->thread.sve_state; + + WARN_ON(test_tsk_thread_flag(task, TIF_SVE)); + + task->thread.sve_state = NULL; + return sve_state; +} + /* * TIF_SVE controls whether a task can use SVE without trapping while * in userspace, and also the way a task's FPSIMD/SVE state is stored @@ -1022,6 +1032,7 @@ void fpsimd_thread_switch(struct task_struct *next) void fpsimd_flush_thread(void) { int vl, supported_vl; + void *mem = NULL; if (!system_supports_fpsimd()) return; @@ -1034,7 +1045,7 @@ void fpsimd_flush_thread(void) if (system_supports_sve()) { clear_thread_flag(TIF_SVE); - sve_free(current); + mem = sve_free_atomic(current); /* * Reset the task vector length as required. @@ -1068,6 +1079,7 @@ void fpsimd_flush_thread(void) } put_cpu_fpsimd_context(); + kfree(mem); } /* diff --git a/arch/arm64/kernel/signal.c b/arch/arm64/kernel/signal.c index d288bb4a1..3e5b354dd 100644 --- a/arch/arm64/kernel/signal.c +++ b/arch/arm64/kernel/signal.c @@ -692,7 +692,7 @@ asmlinkage void do_notify_resume(struct pt_regs *regs, unsigned long thread_flags) { do { - if (thread_flags & _TIF_NEED_RESCHED) { + if (thread_flags & _TIF_NEED_RESCHED_MASK) { /* Unmask Debug and SError for the next task */ local_daif_restore(DAIF_PROCCTX_NOIRQ); diff --git a/arch/arm64/kvm/arm.c b/arch/arm64/kvm/arm.c index 3ae13ef0c..1fa19b717 100644 --- a/arch/arm64/kvm/arm.c +++ b/arch/arm64/kvm/arm.c @@ -779,7 +779,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) * involves poking the GIC, which must be done in a * non-preemptible context. */ - preempt_disable(); + migrate_disable(); kvm_pmu_flush_hwstate(vcpu); @@ -828,7 +828,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) kvm_timer_sync_user(vcpu); kvm_vgic_sync_hwstate(vcpu); local_irq_enable(); - preempt_enable(); + migrate_enable(); continue; } @@ -907,7 +907,7 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu) /* Exit types that need handling before we can be preempted */ handle_exit_early(vcpu, ret); - preempt_enable(); + migrate_enable(); /* * The ARMv8 architecture doesn't give the hypervisor diff --git a/arch/csky/Kconfig b/arch/csky/Kconfig index 7bf0a617e..c9f2533cc 100644 --- a/arch/csky/Kconfig +++ b/arch/csky/Kconfig @@ -286,6 +286,7 @@ config NR_CPUS config HIGHMEM bool "High Memory Support" depends on !CPU_CK610 + select KMAP_LOCAL default y config FORCE_MAX_ZONEORDER diff --git a/arch/csky/include/asm/fixmap.h b/arch/csky/include/asm/fixmap.h index 81f9477d5..4b589cc20 100644 --- a/arch/csky/include/asm/fixmap.h +++ b/arch/csky/include/asm/fixmap.h @@ -8,7 +8,7 @@ #include <asm/memory.h> #ifdef CONFIG_HIGHMEM #include <linux/threads.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #endif enum fixed_addresses { @@ -17,7 +17,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/csky/include/asm/highmem.h b/arch/csky/include/asm/highmem.h index 14645e3d5..1f4ed3f4c 100644 --- a/arch/csky/include/asm/highmem.h +++ b/arch/csky/include/asm/highmem.h @@ -9,7 +9,7 @@ #include <linux/init.h> #include <linux/interrupt.h> #include <linux/uaccess.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #include <asm/cache.h> /* undef for production */ @@ -32,10 +32,12 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() do {} while (0) +#define arch_kmap_local_post_map(vaddr, pteval) kmap_flush_tlb(vaddr) +#define arch_kmap_local_post_unmap(vaddr) kmap_flush_tlb(vaddr) + extern void kmap_init(void); #endif /* __KERNEL__ */ diff --git a/arch/csky/mm/highmem.c b/arch/csky/mm/highmem.c index 89c10800a..4161df3c6 100644 --- a/arch/csky/mm/highmem.c +++ b/arch/csky/mm/highmem.c @@ -9,8 +9,6 @@ #include <asm/tlbflush.h> #include <asm/cacheflush.h> -static pte_t *kmap_pte; - unsigned long highstart_pfn, highend_pfn; void kmap_flush_tlb(unsigned long addr) @@ -19,67 +17,7 @@ void kmap_flush_tlb(unsigned long addr) } EXPORT_SYMBOL(kmap_flush_tlb); -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte - idx))); -#endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); - flush_tlb_one((unsigned long)vaddr); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int idx; - - if (vaddr < FIXADDR_START) - return; - -#ifdef CONFIG_DEBUG_HIGHMEM - idx = KM_TYPE_NR*smp_processor_id() + kmap_atomic_idx(); - - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - - pte_clear(&init_mm, vaddr, kmap_pte - idx); - flush_tlb_one(vaddr); -#else - (void) idx; /* to kill a warning */ -#endif - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - unsigned long vaddr; - int idx, type; - - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); - flush_tlb_one(vaddr); - - return (void *) vaddr; -} - -static void __init kmap_pages_init(void) +void __init kmap_init(void) { unsigned long vaddr; pgd_t *pgd; @@ -96,14 +34,3 @@ static void __init kmap_pages_init(void) pte = pte_offset_kernel(pmd, vaddr); pkmap_page_table = pte; } - -void __init kmap_init(void) -{ - unsigned long vaddr; - - kmap_pages_init(); - - vaddr = __fix_to_virt(FIX_KMAP_BEGIN); - - kmap_pte = pte_offset_kernel((pmd_t *)pgd_offset_k(vaddr), vaddr); -} diff --git a/arch/hexagon/include/asm/spinlock_types.h b/arch/hexagon/include/asm/spinlock_types.h index 19d233497..de72fb230 100644 --- a/arch/hexagon/include/asm/spinlock_types.h +++ b/arch/hexagon/include/asm/spinlock_types.h @@ -8,10 +8,6 @@ #ifndef _ASM_SPINLOCK_TYPES_H #define _ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/ia64/include/asm/kmap_types.h b/arch/ia64/include/asm/kmap_types.h deleted file mode 100644 index 5c268cf7c..000000000 --- a/arch/ia64/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_IA64_KMAP_TYPES_H -#define _ASM_IA64_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif /* _ASM_IA64_KMAP_TYPES_H */ diff --git a/arch/ia64/include/asm/spinlock_types.h b/arch/ia64/include/asm/spinlock_types.h index 6e345fefc..681408d68 100644 --- a/arch/ia64/include/asm/spinlock_types.h +++ b/arch/ia64/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef _ASM_IA64_SPINLOCK_TYPES_H #define _ASM_IA64_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/ia64/kernel/time.c b/arch/ia64/kernel/time.c index 7abc5f37b..733e0e332 100644 --- a/arch/ia64/kernel/time.c +++ b/arch/ia64/kernel/time.c @@ -138,12 +138,8 @@ void vtime_account_kernel(struct task_struct *tsk) struct thread_info *ti = task_thread_info(tsk); __u64 stime = vtime_delta(tsk); - if ((tsk->flags & PF_VCPU) && !irq_count()) + if (tsk->flags & PF_VCPU) ti->gtime += stime; - else if (hardirq_count()) - ti->hardirq_time += stime; - else if (in_serving_softirq()) - ti->softirq_time += stime; else ti->stime += stime; } @@ -156,6 +152,20 @@ void vtime_account_idle(struct task_struct *tsk) ti->idle_time += vtime_delta(tsk); } +void vtime_account_softirq(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + + ti->softirq_time += vtime_delta(tsk); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct thread_info *ti = task_thread_info(tsk); + + ti->hardirq_time += vtime_delta(tsk); +} + #endif /* CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ static irqreturn_t diff --git a/arch/microblaze/Kconfig b/arch/microblaze/Kconfig index 33925ffed..7f6ca0ab4 100644 --- a/arch/microblaze/Kconfig +++ b/arch/microblaze/Kconfig @@ -155,6 +155,7 @@ config XILINX_UNCACHED_SHADOW config HIGHMEM bool "High memory support" depends on MMU + select KMAP_LOCAL help The address space of Microblaze processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/microblaze/include/asm/fixmap.h b/arch/microblaze/include/asm/fixmap.h index 0379ce522..e6e9288bf 100644 --- a/arch/microblaze/include/asm/fixmap.h +++ b/arch/microblaze/include/asm/fixmap.h @@ -20,7 +20,7 @@ #include <asm/page.h> #ifdef CONFIG_HIGHMEM #include <linux/threads.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #endif #define FIXADDR_TOP ((unsigned long)(-PAGE_SIZE)) @@ -47,7 +47,7 @@ enum fixed_addresses { FIX_HOLE, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * num_possible_cpus()) - 1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * num_possible_cpus()) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/microblaze/include/asm/highmem.h b/arch/microblaze/include/asm/highmem.h index 284ca8fb5..4418633fb 100644 --- a/arch/microblaze/include/asm/highmem.h +++ b/arch/microblaze/include/asm/highmem.h @@ -25,7 +25,6 @@ #include <linux/uaccess.h> #include <asm/fixmap.h> -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; /* @@ -52,6 +51,11 @@ extern pte_t *pkmap_page_table; #define flush_cache_kmaps() { flush_icache(); flush_dcache(); } +#define arch_kmap_local_post_map(vaddr, pteval) \ + local_flush_tlb_page(NULL, vaddr); +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_page(NULL, vaddr); + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/microblaze/mm/Makefile b/arch/microblaze/mm/Makefile index 1b16875ce..8ced71100 100644 --- a/arch/microblaze/mm/Makefile +++ b/arch/microblaze/mm/Makefile @@ -6,4 +6,3 @@ obj-y := consistent.o init.o obj-$(CONFIG_MMU) += pgtable.o mmu_context.o fault.o -obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/microblaze/mm/highmem.c b/arch/microblaze/mm/highmem.c deleted file mode 100644 index 92e089041..000000000 --- a/arch/microblaze/mm/highmem.c +++ /dev/null @@ -1,78 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * PowerPC version, stolen from the i386 version. - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@pdb.siemens.de - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terrabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. - * - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * - * Reworked for PowerPC by various contributors. Moved from - * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. - */ - -#include <linux/export.h> -#include <linux/highmem.h> - -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. - */ -#include <asm/tlbflush.h> - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif - set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot)); - local_flush_tlb_page(NULL, vaddr); - - return (void *) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type; - unsigned int idx; - - if (vaddr < __fix_to_virt(FIX_KMAP_END)) - return; - - type = kmap_atomic_idx(); - - idx = type + KM_TYPE_NR * smp_processor_id(); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_page(NULL, vaddr); - - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/microblaze/mm/init.c b/arch/microblaze/mm/init.c index 45da639bd..1f4b5b34e 100644 --- a/arch/microblaze/mm/init.c +++ b/arch/microblaze/mm/init.c @@ -49,17 +49,11 @@ unsigned long lowmem_size; EXPORT_SYMBOL(min_low_pfn); EXPORT_SYMBOL(max_low_pfn); -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -EXPORT_SYMBOL(kmap_pte); - static void __init highmem_init(void) { pr_debug("%x\n", (u32)PKMAP_BASE); map_page(PKMAP_BASE, 0, 0); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); - - kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); } static void highmem_setup(void) diff --git a/arch/mips/Kconfig b/arch/mips/Kconfig index bd4bb55ae..00398c6d4 100644 --- a/arch/mips/Kconfig +++ b/arch/mips/Kconfig @@ -2723,6 +2723,7 @@ config WAR_MIPS34K_MISSED_ITLB config HIGHMEM bool "High Memory Support" depends on 32BIT && CPU_SUPPORTS_HIGHMEM && SYS_SUPPORTS_HIGHMEM && !CPU_MIPS32_3_5_EVA + select KMAP_LOCAL config CPU_SUPPORTS_HIGHMEM bool diff --git a/arch/mips/include/asm/fixmap.h b/arch/mips/include/asm/fixmap.h index 743535be7..beea14761 100644 --- a/arch/mips/include/asm/fixmap.h +++ b/arch/mips/include/asm/fixmap.h @@ -17,7 +17,7 @@ #include <spaces.h> #ifdef CONFIG_HIGHMEM #include <linux/threads.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #endif /* @@ -52,7 +52,7 @@ enum fixed_addresses { #ifdef CONFIG_HIGHMEM /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN = FIX_CMAP_END + 1, - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/mips/include/asm/highmem.h b/arch/mips/include/asm/highmem.h index 9f021cf51..1716181ea 100644 --- a/arch/mips/include/asm/highmem.h +++ b/arch/mips/include/asm/highmem.h @@ -24,7 +24,7 @@ #include <linux/interrupt.h> #include <linux/uaccess.h> #include <asm/cpu-features.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> /* declarations for highmem.c */ extern unsigned long highstart_pfn, highend_pfn; @@ -48,11 +48,11 @@ extern pte_t *pkmap_page_table; #define ARCH_HAS_KMAP_FLUSH_TLB extern void kmap_flush_tlb(unsigned long addr); -extern void *kmap_atomic_pfn(unsigned long pfn); #define flush_cache_kmaps() BUG_ON(cpu_has_dc_aliases) -extern void kmap_init(void); +#define arch_kmap_local_post_map(vaddr, pteval) local_flush_tlb_one(vaddr) +#define arch_kmap_local_post_unmap(vaddr) local_flush_tlb_one(vaddr) #endif /* __KERNEL__ */ diff --git a/arch/mips/include/asm/kmap_types.h b/arch/mips/include/asm/kmap_types.h deleted file mode 100644 index 16665dc24..000000000 --- a/arch/mips/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/mips/kernel/crash_dump.c b/arch/mips/kernel/crash_dump.c index 01b2bd95b..9aba83e1e 100644 --- a/arch/mips/kernel/crash_dump.c +++ b/arch/mips/kernel/crash_dump.c @@ -5,8 +5,6 @@ #include <linux/uaccess.h> #include <linux/slab.h> -static void *kdump_buf_page; - /** * copy_oldmem_page - copy one page from "oldmem" * @pfn: page frame number to be copied @@ -17,51 +15,25 @@ static void *kdump_buf_page; * @userbuf: if set, @buf is in user address space, use copy_to_user(), * otherwise @buf is in kernel address space, use memcpy(). * - * Copy a page from "oldmem". For this page, there is no pte mapped + * Copy a page from "oldmem". For this page, there might be no pte mapped * in the current kernel. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) { void *vaddr; if (!csize) return 0; - vaddr = kmap_atomic_pfn(pfn); + vaddr = kmap_local_pfn(pfn); if (!userbuf) { - memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr); + memcpy(buf, vaddr + offset, csize); } else { - if (!kdump_buf_page) { - pr_warn("Kdump: Kdump buffer page not allocated\n"); - - return -EFAULT; - } - copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr); - if (copy_to_user(buf, (kdump_buf_page + offset), csize)) - return -EFAULT; + if (copy_to_user(buf, vaddr + offset, csize)) + csize = -EFAULT; } return csize; } - -static int __init kdump_buf_page_init(void) -{ - int ret = 0; - - kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kdump_buf_page) { - pr_warn("Kdump: Failed to allocate kdump buffer page\n"); - ret = -ENOMEM; - } - - return ret; -} -arch_initcall(kdump_buf_page_init); diff --git a/arch/mips/mm/highmem.c b/arch/mips/mm/highmem.c index 5fec7f45d..57e2f08f0 100644 --- a/arch/mips/mm/highmem.c +++ b/arch/mips/mm/highmem.c @@ -8,8 +8,6 @@ #include <asm/fixmap.h> #include <asm/tlbflush.h> -static pte_t *kmap_pte; - unsigned long highstart_pfn, highend_pfn; void kmap_flush_tlb(unsigned long addr) @@ -17,78 +15,3 @@ void kmap_flush_tlb(unsigned long addr) flush_tlb_one(addr); } EXPORT_SYMBOL(kmap_flush_tlb); - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte - idx))); -#endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); - local_flush_tlb_one((unsigned long)vaddr); - - return (void*) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type __maybe_unused; - - if (vaddr < FIXADDR_START) - return; - - type = kmap_atomic_idx(); -#ifdef CONFIG_DEBUG_HIGHMEM - { - int idx = type + KM_TYPE_NR * smp_processor_id(); - - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_one(vaddr); - } -#endif - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte-idx, pfn_pte(pfn, PAGE_KERNEL)); - flush_tlb_one(vaddr); - - return (void*) vaddr; -} - -void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} diff --git a/arch/mips/mm/init.c b/arch/mips/mm/init.c index 07e84a774..bc80893e5 100644 --- a/arch/mips/mm/init.c +++ b/arch/mips/mm/init.c @@ -36,7 +36,6 @@ #include <asm/cachectl.h> #include <asm/cpu.h> #include <asm/dma.h> -#include <asm/kmap_types.h> #include <asm/maar.h> #include <asm/mmu_context.h> #include <asm/sections.h> @@ -402,9 +401,6 @@ void __init paging_init(void) pagetable_init(); -#ifdef CONFIG_HIGHMEM - kmap_init(); -#endif #ifdef CONFIG_ZONE_DMA max_zone_pfns[ZONE_DMA] = MAX_DMA_PFN; #endif diff --git a/arch/nds32/Kconfig.cpu b/arch/nds32/Kconfig.cpu index f88a12fdf..c10759952 100644 --- a/arch/nds32/Kconfig.cpu +++ b/arch/nds32/Kconfig.cpu @@ -157,6 +157,7 @@ config HW_SUPPORT_UNALIGNMENT_ACCESS config HIGHMEM bool "High Memory Support" depends on MMU && !CPU_CACHE_ALIASING + select KMAP_LOCAL help The address space of Andes processors is only 4 Gigabytes large and it has to accommodate user address space, kernel address diff --git a/arch/nds32/include/asm/fixmap.h b/arch/nds32/include/asm/fixmap.h index 5a4bf11e5..2fa09a2de 100644 --- a/arch/nds32/include/asm/fixmap.h +++ b/arch/nds32/include/asm/fixmap.h @@ -6,7 +6,7 @@ #ifdef CONFIG_HIGHMEM #include <linux/threads.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #endif enum fixed_addresses { @@ -14,7 +14,7 @@ enum fixed_addresses { FIX_KMAP_RESERVED, FIX_KMAP_BEGIN, #ifdef CONFIG_HIGHMEM - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS), + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif FIX_EARLYCON_MEM_BASE, __end_of_fixed_addresses diff --git a/arch/nds32/include/asm/highmem.h b/arch/nds32/include/asm/highmem.h index fe986d0e6..16159a871 100644 --- a/arch/nds32/include/asm/highmem.h +++ b/arch/nds32/include/asm/highmem.h @@ -5,7 +5,6 @@ #define _ASM_HIGHMEM_H #include <asm/proc-fns.h> -#include <asm/kmap_types.h> #include <asm/fixmap.h> /* @@ -45,11 +44,22 @@ extern pte_t *pkmap_page_table; extern void kmap_init(void); /* - * The following functions are already defined by <linux/highmem.h> - * when CONFIG_HIGHMEM is not set. + * FIXME: The below looks broken vs. a kmap_atomic() in task context which + * is interupted and another kmap_atomic() happens in interrupt context. + * But what do I know about nds32. -- tglx */ -#ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_pfn(unsigned long pfn); -#endif +#define arch_kmap_local_post_map(vaddr, pteval) \ + do { \ + __nds32__tlbop_inv(vaddr); \ + __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); \ + __nds32__tlbop_rwr(pteval); \ + __nds32__isb(); \ + } while (0) + +#define arch_kmap_local_pre_unmap(vaddr) \ + do { \ + __nds32__tlbop_inv(vaddr); \ + __nds32__isb(); \ + } while (0) #endif diff --git a/arch/nds32/mm/Makefile b/arch/nds32/mm/Makefile index 897ecaf5c..14fb2e8eb 100644 --- a/arch/nds32/mm/Makefile +++ b/arch/nds32/mm/Makefile @@ -3,7 +3,6 @@ obj-y := extable.o tlb.o fault.o init.o mmap.o \ mm-nds32.o cacheflush.o proc.o obj-$(CONFIG_ALIGNMENT_TRAP) += alignment.o -obj-$(CONFIG_HIGHMEM) += highmem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_proc.o = $(CC_FLAGS_FTRACE) diff --git a/arch/nds32/mm/highmem.c b/arch/nds32/mm/highmem.c deleted file mode 100644 index 4284cd59e..000000000 --- a/arch/nds32/mm/highmem.c +++ /dev/null @@ -1,48 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -// Copyright (C) 2005-2017 Andes Technology Corporation - -#include <linux/export.h> -#include <linux/highmem.h> -#include <linux/sched.h> -#include <linux/smp.h> -#include <linux/interrupt.h> -#include <linux/memblock.h> -#include <asm/fixmap.h> -#include <asm/tlbflush.h> - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned int idx; - unsigned long vaddr, pte; - int type; - pte_t *ptep; - - type = kmap_atomic_idx_push(); - - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - pte = (page_to_pfn(page) << PAGE_SHIFT) | prot; - ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); - set_pte(ptep, pte); - - __nds32__tlbop_inv(vaddr); - __nds32__mtsr_dsb(vaddr, NDS32_SR_TLB_VPN); - __nds32__tlbop_rwr(pte); - __nds32__isb(); - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - if (kvaddr >= (void *)FIXADDR_START) { - unsigned long vaddr = (unsigned long)kvaddr; - pte_t *ptep; - kmap_atomic_idx_pop(); - __nds32__tlbop_inv(vaddr); - __nds32__isb(); - ptep = pte_offset_kernel(pmd_off_k(vaddr), vaddr); - set_pte(ptep, 0); - } -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/openrisc/mm/init.c b/arch/openrisc/mm/init.c index 5e88c351e..f3fa02b88 100644 --- a/arch/openrisc/mm/init.c +++ b/arch/openrisc/mm/init.c @@ -33,7 +33,6 @@ #include <asm/io.h> #include <asm/tlb.h> #include <asm/mmu_context.h> -#include <asm/kmap_types.h> #include <asm/fixmap.h> #include <asm/tlbflush.h> #include <asm/sections.h> diff --git a/arch/openrisc/mm/ioremap.c b/arch/openrisc/mm/ioremap.c index a978590d8..5aed97a18 100644 --- a/arch/openrisc/mm/ioremap.c +++ b/arch/openrisc/mm/ioremap.c @@ -15,7 +15,6 @@ #include <linux/io.h> #include <linux/pgtable.h> #include <asm/pgalloc.h> -#include <asm/kmap_types.h> #include <asm/fixmap.h> #include <asm/bug.h> #include <linux/sched.h> diff --git a/arch/parisc/include/asm/hardirq.h b/arch/parisc/include/asm/hardirq.h index 7f7039516..fad29aa6f 100644 --- a/arch/parisc/include/asm/hardirq.h +++ b/arch/parisc/include/asm/hardirq.h @@ -32,7 +32,6 @@ typedef struct { DECLARE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); #define __ARCH_IRQ_STAT -#define __IRQ_STAT(cpu, member) (irq_stat[cpu].member) #define inc_irq_stat(member) this_cpu_inc(irq_stat.member) #define __inc_irq_stat(member) __this_cpu_inc(irq_stat.member) #define ack_bad_irq(irq) WARN(1, "unexpected IRQ trap at vector %02x\n", irq) diff --git a/arch/parisc/include/asm/kmap_types.h b/arch/parisc/include/asm/kmap_types.h deleted file mode 100644 index 3e70b5cd1..000000000 --- a/arch/parisc/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 4e6f30473..42851014e 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -146,6 +146,7 @@ config PPC select ARCH_MIGHT_HAVE_PC_SERIO select ARCH_OPTIONAL_KERNEL_RWX if ARCH_HAS_STRICT_KERNEL_RWX select ARCH_SUPPORTS_ATOMIC_RMW + select ARCH_SUPPORTS_RT if HAVE_POSIX_CPU_TIMERS_TASK_WORK select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_CMPXCHG_LOCKREF if PPC64 select ARCH_USE_QUEUED_RWLOCKS if PPC_QUEUED_SPINLOCKS @@ -234,6 +235,7 @@ config PPC select HAVE_HARDLOCKUP_DETECTOR_PERF if PERF_EVENTS && HAVE_PERF_EVENTS_NMI && !HAVE_HARDLOCKUP_DETECTOR_ARCH select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE select MMU_GATHER_PAGE_SIZE select HAVE_REGS_AND_STACK_ACCESS_API @@ -241,6 +243,7 @@ config PPC select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING select HAVE_IRQ_TIME_ACCOUNTING + select HAVE_POSIX_CPU_TIMERS_TASK_WORK if !KVM select HAVE_RSEQ select IOMMU_HELPER if PPC64 select IRQ_DOMAIN @@ -414,6 +417,7 @@ menu "Kernel options" config HIGHMEM bool "High memory support" depends on PPC32 + select KMAP_LOCAL source "kernel/Kconfig.hz" diff --git a/arch/powerpc/include/asm/cmpxchg.h b/arch/powerpc/include/asm/cmpxchg.h index cf091c4c2..7371f7e23 100644 --- a/arch/powerpc/include/asm/cmpxchg.h +++ b/arch/powerpc/include/asm/cmpxchg.h @@ -5,7 +5,7 @@ #ifdef __KERNEL__ #include <linux/compiler.h> #include <asm/synch.h> -#include <linux/bug.h> +#include <linux/bits.h> #ifdef __BIG_ENDIAN #define BITOFF_CAL(size, off) ((sizeof(u32) - size - off) * BITS_PER_BYTE) diff --git a/arch/powerpc/include/asm/fixmap.h b/arch/powerpc/include/asm/fixmap.h index 591b2f4de..947b5b9c4 100644 --- a/arch/powerpc/include/asm/fixmap.h +++ b/arch/powerpc/include/asm/fixmap.h @@ -20,7 +20,7 @@ #include <asm/page.h> #ifdef CONFIG_HIGHMEM #include <linux/threads.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #endif #ifdef CONFIG_PPC64 @@ -61,7 +61,7 @@ enum fixed_addresses { FIX_EARLY_DEBUG_BASE = FIX_EARLY_DEBUG_TOP+(ALIGN(SZ_128K, PAGE_SIZE)/PAGE_SIZE)-1, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #endif #ifdef CONFIG_PPC_8xx /* For IMMR we need an aligned 512K area */ diff --git a/arch/powerpc/include/asm/highmem.h b/arch/powerpc/include/asm/highmem.h index 104026f7d..80a5ae771 100644 --- a/arch/powerpc/include/asm/highmem.h +++ b/arch/powerpc/include/asm/highmem.h @@ -24,12 +24,10 @@ #ifdef __KERNEL__ #include <linux/interrupt.h> -#include <asm/kmap_types.h> #include <asm/cacheflush.h> #include <asm/page.h> #include <asm/fixmap.h> -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; /* @@ -60,6 +58,11 @@ extern pte_t *pkmap_page_table; #define flush_cache_kmaps() flush_cache_all() +#define arch_kmap_local_post_map(vaddr, pteval) \ + local_flush_tlb_page(NULL, vaddr) +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_page(NULL, vaddr) + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/powerpc/include/asm/kmap_types.h b/arch/powerpc/include/asm/kmap_types.h deleted file mode 100644 index c8fa182d4..000000000 --- a/arch/powerpc/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -#ifndef _ASM_POWERPC_KMAP_TYPES_H -#define _ASM_POWERPC_KMAP_TYPES_H - -#ifdef __KERNEL__ - -/* - */ - -#define KM_TYPE_NR 16 - -#endif /* __KERNEL__ */ -#endif /* _ASM_POWERPC_KMAP_TYPES_H */ diff --git a/arch/powerpc/include/asm/simple_spinlock_types.h b/arch/powerpc/include/asm/simple_spinlock_types.h index 0f3cdd8fa..d45561e9e 100644 --- a/arch/powerpc/include/asm/simple_spinlock_types.h +++ b/arch/powerpc/include/asm/simple_spinlock_types.h @@ -2,7 +2,7 @@ #ifndef _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H #define _ASM_POWERPC_SIMPLE_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) # error "please don't include this file directly" #endif diff --git a/arch/powerpc/include/asm/spinlock_types.h b/arch/powerpc/include/asm/spinlock_types.h index c5d742f18..cc6922a01 100644 --- a/arch/powerpc/include/asm/spinlock_types.h +++ b/arch/powerpc/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef _ASM_POWERPC_SPINLOCK_TYPES_H #define _ASM_POWERPC_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - #ifdef CONFIG_PPC_QUEUED_SPINLOCKS #include <asm-generic/qspinlock_types.h> #include <asm-generic/qrwlock_types.h> diff --git a/arch/powerpc/include/asm/stackprotector.h b/arch/powerpc/include/asm/stackprotector.h index 1c8460e23..b1653c160 100644 --- a/arch/powerpc/include/asm/stackprotector.h +++ b/arch/powerpc/include/asm/stackprotector.h @@ -24,7 +24,11 @@ static __always_inline void boot_init_stack_canary(void) unsigned long canary; /* Try to get a semi random initial value. */ +#ifdef CONFIG_PREEMPT_RT + canary = (unsigned long)&canary; +#else canary = get_random_canary(); +#endif canary ^= mftb(); canary ^= LINUX_VERSION_CODE; canary &= CANARY_MASK; diff --git a/arch/powerpc/include/asm/thread_info.h b/arch/powerpc/include/asm/thread_info.h index 28d2908af..3a8128d2b 100644 --- a/arch/powerpc/include/asm/thread_info.h +++ b/arch/powerpc/include/asm/thread_info.h @@ -48,6 +48,8 @@ struct thread_info { int preempt_count; /* 0 => preemptable, <0 => BUG */ + int preempt_lazy_count; /* 0 => preemptable, + <0 => BUG */ #ifdef CONFIG_SMP unsigned int cpu; #endif @@ -100,11 +102,12 @@ void arch_setup_new_exec(void); #define TIF_SINGLESTEP 8 /* singlestepping active */ #define TIF_NOHZ 9 /* in adaptive nohz mode */ #define TIF_SECCOMP 10 /* secure computing */ -#define TIF_RESTOREALL 11 /* Restore all regs (implies NOERROR) */ -#define TIF_NOERROR 12 /* Force successful syscall return */ + +#define TIF_NEED_RESCHED_LAZY 11 /* lazy rescheduling necessary */ +#define TIF_SYSCALL_TRACEPOINT 12 /* syscall tracepoint instrumentation */ + #define TIF_NOTIFY_RESUME 13 /* callback before returning to user */ #define TIF_UPROBE 14 /* breakpointed or single-stepping */ -#define TIF_SYSCALL_TRACEPOINT 15 /* syscall tracepoint instrumentation */ #define TIF_EMULATE_STACK_STORE 16 /* Is an instruction emulation for stack store? */ #define TIF_MEMDIE 17 /* is terminating due to OOM killer */ @@ -113,6 +116,9 @@ void arch_setup_new_exec(void); #endif #define TIF_POLLING_NRFLAG 19 /* true if poll_idle() is polling TIF_NEED_RESCHED */ #define TIF_32BIT 20 /* 32 bit binary */ +#define TIF_RESTOREALL 21 /* Restore all regs (implies NOERROR) */ +#define TIF_NOERROR 22 /* Force successful syscall return */ + /* as above, but as bit values */ #define _TIF_SYSCALL_TRACE (1<<TIF_SYSCALL_TRACE) @@ -132,6 +138,7 @@ void arch_setup_new_exec(void); #define _TIF_SYSCALL_TRACEPOINT (1<<TIF_SYSCALL_TRACEPOINT) #define _TIF_EMULATE_STACK_STORE (1<<TIF_EMULATE_STACK_STORE) #define _TIF_NOHZ (1<<TIF_NOHZ) +#define _TIF_NEED_RESCHED_LAZY (1<<TIF_NEED_RESCHED_LAZY) #define _TIF_SYSCALL_EMU (1<<TIF_SYSCALL_EMU) #define _TIF_SYSCALL_DOTRACE (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SECCOMP | _TIF_SYSCALL_TRACEPOINT | \ @@ -139,8 +146,10 @@ void arch_setup_new_exec(void); #define _TIF_USER_WORK_MASK (_TIF_SIGPENDING | _TIF_NEED_RESCHED | \ _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_RESTORE_TM | _TIF_PATCH_PENDING) + _TIF_RESTORE_TM | _TIF_PATCH_PENDING | \ + _TIF_NEED_RESCHED_LAZY) #define _TIF_PERSYSCALL_MASK (_TIF_RESTOREALL|_TIF_NOERROR) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) /* Bits in local_flags */ /* Don't move TLF_NAPPING without adjusting the code in entry_32.S */ diff --git a/arch/powerpc/kernel/asm-offsets.c b/arch/powerpc/kernel/asm-offsets.c index 760f656ef..0a351a99d 100644 --- a/arch/powerpc/kernel/asm-offsets.c +++ b/arch/powerpc/kernel/asm-offsets.c @@ -187,6 +187,7 @@ int main(void) OFFSET(TI_FLAGS, thread_info, flags); OFFSET(TI_LOCAL_FLAGS, thread_info, local_flags); OFFSET(TI_PREEMPT, thread_info, preempt_count); + OFFSET(TI_PREEMPT_LAZY, thread_info, preempt_lazy_count); #ifdef CONFIG_PPC64 OFFSET(DCACHEL1BLOCKSIZE, ppc64_caches, l1d.block_size); diff --git a/arch/powerpc/kernel/entry_32.S b/arch/powerpc/kernel/entry_32.S index 459f5d00b..fc9517a97 100644 --- a/arch/powerpc/kernel/entry_32.S +++ b/arch/powerpc/kernel/entry_32.S @@ -414,7 +414,9 @@ ret_from_syscall: mtmsr r10 lwz r9,TI_FLAGS(r2) li r8,-MAX_ERRNO - andi. r0,r9,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK) + lis r0,(_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)@h + ori r0,r0, (_TIF_SYSCALL_DOTRACE|_TIF_SINGLESTEP|_TIF_USER_WORK_MASK|_TIF_PERSYSCALL_MASK)@l + and. r0,r9,r0 bne- syscall_exit_work cmplw 0,r3,r8 blt+ syscall_exit_cont @@ -530,13 +532,13 @@ syscall_dotrace: b syscall_dotrace_cont syscall_exit_work: - andi. r0,r9,_TIF_RESTOREALL + andis. r0,r9,_TIF_RESTOREALL@h beq+ 0f REST_NVGPRS(r1) b 2f 0: cmplw 0,r3,r8 blt+ 1f - andi. r0,r9,_TIF_NOERROR + andis. r0,r9,_TIF_NOERROR@h bne- 1f lwz r11,_CCR(r1) /* Load CR */ neg r3,r3 @@ -545,12 +547,12 @@ syscall_exit_work: 1: stw r6,RESULT(r1) /* Save result */ stw r3,GPR3(r1) /* Update return value */ -2: andi. r0,r9,(_TIF_PERSYSCALL_MASK) +2: andis. r0,r9,(_TIF_PERSYSCALL_MASK)@h beq 4f /* Clear per-syscall TIF flags if any are set. */ - li r11,_TIF_PERSYSCALL_MASK + lis r11,(_TIF_PERSYSCALL_MASK)@h addi r12,r2,TI_FLAGS 3: lwarx r8,0,r12 andc r8,r8,r11 @@ -927,7 +929,14 @@ resume_kernel: cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ bne restore_kuap andi. r8,r8,_TIF_NEED_RESCHED + bne+ 1f + lwz r0,TI_PREEMPT_LAZY(r2) + cmpwi 0,r0,0 /* if non-zero, just restore regs and return */ + bne restore_kuap + lwz r0,TI_FLAGS(r2) + andi. r0,r0,_TIF_NEED_RESCHED_LAZY beq+ restore_kuap +1: lwz r3,_MSR(r1) andi. r0,r3,MSR_EE /* interrupts off? */ beq restore_kuap /* don't schedule if so */ @@ -1248,7 +1257,7 @@ global_dbcr0: #endif /* !(CONFIG_4xx || CONFIG_BOOKE) */ do_work: /* r10 contains MSR_KERNEL here */ - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,_TIF_NEED_RESCHED_MASK beq do_user_signal do_resched: /* r10 contains MSR_KERNEL here */ @@ -1267,7 +1276,7 @@ recheck: LOAD_REG_IMMEDIATE(r10,MSR_KERNEL) mtmsr r10 /* disable interrupts */ lwz r9,TI_FLAGS(r2) - andi. r0,r9,_TIF_NEED_RESCHED + andi. r0,r9,_TIF_NEED_RESCHED_MASK bne- do_resched andi. r0,r9,_TIF_USER_WORK_MASK beq restore_user diff --git a/arch/powerpc/kernel/exceptions-64e.S b/arch/powerpc/kernel/exceptions-64e.S index e91d3ba1e..5ce64cfe6 100644 --- a/arch/powerpc/kernel/exceptions-64e.S +++ b/arch/powerpc/kernel/exceptions-64e.S @@ -1080,7 +1080,7 @@ _GLOBAL(ret_from_except_lite) li r10, -1 mtspr SPRN_DBSR,r10 b restore -1: andi. r0,r4,_TIF_NEED_RESCHED +1: andi. r0,r4,_TIF_NEED_RESCHED_MASK beq 2f bl restore_interrupts SCHEDULE_USER @@ -1132,12 +1132,20 @@ resume_kernel: bne- 0b 1: -#ifdef CONFIG_PREEMPT +#ifdef CONFIG_PREEMPTION /* Check if we need to preempt */ + lwz r8,TI_PREEMPT(r9) + cmpwi 0,r8,0 /* if non-zero, just restore regs and return */ + bne restore andi. r0,r4,_TIF_NEED_RESCHED + bne+ check_count + + andi. r0,r4,_TIF_NEED_RESCHED_LAZY beq+ restore + lwz r8,TI_PREEMPT_LAZY(r9) + /* Check that preempt_count() == 0 and interrupts are enabled */ - lwz r8,TI_PREEMPT(r9) +check_count: cmpwi cr0,r8,0 bne restore ld r0,SOFTE(r1) @@ -1158,7 +1166,7 @@ resume_kernel: * interrupted after loading SRR0/1. */ wrteei 0 -#endif /* CONFIG_PREEMPT */ +#endif /* CONFIG_PREEMPTION */ restore: /* diff --git a/arch/powerpc/kernel/irq.c b/arch/powerpc/kernel/irq.c index e8a548447..5ad4f27cb 100644 --- a/arch/powerpc/kernel/irq.c +++ b/arch/powerpc/kernel/irq.c @@ -753,10 +753,12 @@ void *mcheckirq_ctx[NR_CPUS] __read_mostly; void *softirq_ctx[NR_CPUS] __read_mostly; void *hardirq_ctx[NR_CPUS] __read_mostly; +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { call_do_softirq(softirq_ctx[smp_processor_id()]); } +#endif irq_hw_number_t virq_to_hw(unsigned int virq) { diff --git a/arch/powerpc/kernel/misc_32.S b/arch/powerpc/kernel/misc_32.S index 717e658b9..08ee95ad6 100644 --- a/arch/powerpc/kernel/misc_32.S +++ b/arch/powerpc/kernel/misc_32.S @@ -31,6 +31,7 @@ * We store the saved ksp_limit in the unused part * of the STACK_FRAME_OVERHEAD */ +#ifndef CONFIG_PREEMPT_RT _GLOBAL(call_do_softirq) mflr r0 stw r0,4(r1) @@ -46,6 +47,7 @@ _GLOBAL(call_do_softirq) stw r10,THREAD+KSP_LIMIT(r2) mtlr r0 blr +#endif /* * void call_do_irq(struct pt_regs *regs, void *sp); diff --git a/arch/powerpc/kernel/misc_64.S b/arch/powerpc/kernel/misc_64.S index 070465825..a6b33f7b3 100644 --- a/arch/powerpc/kernel/misc_64.S +++ b/arch/powerpc/kernel/misc_64.S @@ -27,6 +27,7 @@ .text +#ifndef CONFIG_PREEMPT_RT _GLOBAL(call_do_softirq) mflr r0 std r0,16(r1) @@ -37,6 +38,7 @@ _GLOBAL(call_do_softirq) ld r0,16(r1) mtlr r0 blr +#endif _GLOBAL(call_do_irq) mflr r0 diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 532f22637..1ef55f4b3 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -73,7 +73,8 @@ static const char *nvram_os_partitions[] = { }; static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason); + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter); static struct kmsg_dumper nvram_kmsg_dumper = { .dump = oops_to_nvram @@ -643,7 +644,8 @@ void __init nvram_init_oops_partition(int rtas_partition_exists) * partition. If that's too much, go back and capture uncompressed text. */ static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) { struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; static unsigned int oops_count = 0; @@ -681,13 +683,13 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, return; if (big_oops_buf) { - kmsg_dump_get_buffer(dumper, false, + kmsg_dump_get_buffer(iter, false, big_oops_buf, big_oops_buf_sz, &text_len); rc = zip_oops(text_len); } if (rc != 0) { - kmsg_dump_rewind(dumper); - kmsg_dump_get_buffer(dumper, false, + kmsg_dump_rewind(iter); + kmsg_dump_get_buffer(iter, false, oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); diff --git a/arch/powerpc/kernel/syscall_64.c b/arch/powerpc/kernel/syscall_64.c index 310bcd768..ae3212dcf 100644 --- a/arch/powerpc/kernel/syscall_64.c +++ b/arch/powerpc/kernel/syscall_64.c @@ -193,7 +193,7 @@ notrace unsigned long syscall_exit_prepare(unsigned long r3, ti_flags = READ_ONCE(*ti_flagsp); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); - if (ti_flags & _TIF_NEED_RESCHED) { + if (ti_flags & _TIF_NEED_RESCHED_MASK) { schedule(); } else { /* @@ -277,7 +277,7 @@ notrace unsigned long interrupt_exit_user_prepare(struct pt_regs *regs, unsigned ti_flags = READ_ONCE(*ti_flagsp); while (unlikely(ti_flags & (_TIF_USER_WORK_MASK & ~_TIF_RESTORE_TM))) { local_irq_enable(); /* returning to user: may enable */ - if (ti_flags & _TIF_NEED_RESCHED) { + if (ti_flags & _TIF_NEED_RESCHED_MASK) { schedule(); } else { if (ti_flags & _TIF_SIGPENDING) @@ -361,11 +361,15 @@ notrace unsigned long interrupt_exit_kernel_prepare(struct pt_regs *regs, unsign /* Returning to a kernel context with local irqs enabled. */ WARN_ON_ONCE(!(regs->msr & MSR_EE)); again: - if (IS_ENABLED(CONFIG_PREEMPT)) { + if (IS_ENABLED(CONFIG_PREEMPTION)) { /* Return to preemptible kernel context */ if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED)) { if (preempt_count() == 0) preempt_schedule_irq(); + } else if (unlikely(*ti_flagsp & _TIF_NEED_RESCHED_LAZY)) { + if ((preempt_count() == 0) && + (current_thread_info()->preempt_lazy_count == 0)) + preempt_schedule_irq(); } } diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index 1d20f0f77..7e0a497a3 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -312,12 +312,11 @@ static unsigned long vtime_delta_scaled(struct cpu_accounting_data *acct, return stime_scaled; } -static unsigned long vtime_delta(struct task_struct *tsk, +static unsigned long vtime_delta(struct cpu_accounting_data *acct, unsigned long *stime_scaled, unsigned long *steal_time) { unsigned long now, stime; - struct cpu_accounting_data *acct = get_accounting(tsk); WARN_ON_ONCE(!irqs_disabled()); @@ -332,29 +331,30 @@ static unsigned long vtime_delta(struct task_struct *tsk, return stime; } +static void vtime_delta_kernel(struct cpu_accounting_data *acct, + unsigned long *stime, unsigned long *stime_scaled) +{ + unsigned long steal_time; + + *stime = vtime_delta(acct, stime_scaled, &steal_time); + *stime -= min(*stime, steal_time); + acct->steal_time += steal_time; +} + void vtime_account_kernel(struct task_struct *tsk) { - unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); + unsigned long stime, stime_scaled; - stime = vtime_delta(tsk, &stime_scaled, &steal_time); - - stime -= min(stime, steal_time); - acct->steal_time += steal_time; + vtime_delta_kernel(acct, &stime, &stime_scaled); - if ((tsk->flags & PF_VCPU) && !irq_count()) { + if (tsk->flags & PF_VCPU) { acct->gtime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->utime_scaled += stime_scaled; #endif } else { - if (hardirq_count()) - acct->hardirq_time += stime; - else if (in_serving_softirq()) - acct->softirq_time += stime; - else - acct->stime += stime; - + acct->stime += stime; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME acct->stime_scaled += stime_scaled; #endif @@ -367,10 +367,34 @@ void vtime_account_idle(struct task_struct *tsk) unsigned long stime, stime_scaled, steal_time; struct cpu_accounting_data *acct = get_accounting(tsk); - stime = vtime_delta(tsk, &stime_scaled, &steal_time); + stime = vtime_delta(acct, &stime_scaled, &steal_time); acct->idle_time += stime + steal_time; } +static void vtime_account_irq_field(struct cpu_accounting_data *acct, + unsigned long *field) +{ + unsigned long stime, stime_scaled; + + vtime_delta_kernel(acct, &stime, &stime_scaled); + *field += stime; +#ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME + acct->stime_scaled += stime_scaled; +#endif +} + +void vtime_account_softirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->softirq_time); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + struct cpu_accounting_data *acct = get_accounting(tsk); + vtime_account_irq_field(acct, &acct->hardirq_time); +} + static void vtime_flush_scaled(struct task_struct *tsk, struct cpu_accounting_data *acct) { diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 77dffea3d..34861e39d 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -170,7 +170,6 @@ extern void panic_flush_kmsg_start(void) extern void panic_flush_kmsg_end(void) { - printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); bust_spinlocks(0); debug_locks_off(); @@ -260,12 +259,17 @@ static char *get_mmu_str(void) static int __die(const char *str, struct pt_regs *regs, long err) { + const char *pr = ""; + printk("Oops: %s, sig: %ld [#%d]\n", str, err, ++die_counter); + if (IS_ENABLED(CONFIG_PREEMPTION)) + pr = IS_ENABLED(CONFIG_PREEMPT_RT) ? " PREEMPT_RT" : " PREEMPT"; + printk("%s PAGE_SIZE=%luK%s%s%s%s%s%s %s\n", IS_ENABLED(CONFIG_CPU_LITTLE_ENDIAN) ? "LE" : "BE", PAGE_SIZE / 1024, get_mmu_str(), - IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", + pr, IS_ENABLED(CONFIG_SMP) ? " SMP" : "", IS_ENABLED(CONFIG_SMP) ? (" NR_CPUS=" __stringify(NR_CPUS)) : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", diff --git a/arch/powerpc/kernel/watchdog.c b/arch/powerpc/kernel/watchdog.c index af3c15a1d..8ae46c594 100644 --- a/arch/powerpc/kernel/watchdog.c +++ b/arch/powerpc/kernel/watchdog.c @@ -181,11 +181,6 @@ static void watchdog_smp_panic(int cpu, u64 tb) wd_smp_unlock(&flags); - printk_safe_flush(); - /* - * printk_safe_flush() seems to require another print - * before anything actually goes out to console. - */ if (sysctl_hardlockup_all_cpu_backtrace) trigger_allbutself_cpu_backtrace(); diff --git a/arch/powerpc/kexec/crash.c b/arch/powerpc/kexec/crash.c index c9a889880..d488311ef 100644 --- a/arch/powerpc/kexec/crash.c +++ b/arch/powerpc/kexec/crash.c @@ -311,9 +311,6 @@ void default_machine_crash_shutdown(struct pt_regs *regs) unsigned int i; int (*old_handler)(struct pt_regs *regs); - /* Avoid hardlocking with irresponsive CPU holding logbuf_lock */ - printk_nmi_enter(); - /* * This function is only called after the system * has panicked or is otherwise in a critical state. diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index 549591d9a..efb5bfe93 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -178,6 +178,7 @@ config KVM_E500MC config KVM_MPIC bool "KVM in-kernel MPIC emulation" depends on KVM && E500 + depends on !PREEMPT_RT select HAVE_KVM_IRQCHIP select HAVE_KVM_IRQFD select HAVE_KVM_IRQ_ROUTING diff --git a/arch/powerpc/mm/Makefile b/arch/powerpc/mm/Makefile index 55b4a8bd4..3b4e9e4e2 100644 --- a/arch/powerpc/mm/Makefile +++ b/arch/powerpc/mm/Makefile @@ -16,7 +16,6 @@ obj-$(CONFIG_NEED_MULTIPLE_NODES) += numa.o obj-$(CONFIG_PPC_MM_SLICES) += slice.o obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o obj-$(CONFIG_NOT_COHERENT_CACHE) += dma-noncoherent.o -obj-$(CONFIG_HIGHMEM) += highmem.o obj-$(CONFIG_PPC_COPRO_BASE) += copro_fault.o obj-$(CONFIG_PPC_PTDUMP) += ptdump/ obj-$(CONFIG_KASAN) += kasan/ diff --git a/arch/powerpc/mm/highmem.c b/arch/powerpc/mm/highmem.c deleted file mode 100644 index 624b4438a..000000000 --- a/arch/powerpc/mm/highmem.c +++ /dev/null @@ -1,67 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * PowerPC version, stolen from the i386 version. - * - * Used in CONFIG_HIGHMEM systems for memory pages which - * are not addressable by direct kernel virtual addresses. - * - * Copyright (C) 1999 Gerhard Wichert, Siemens AG - * Gerhard.Wichert@pdb.siemens.de - * - * - * Redesigned the x86 32-bit VM architecture to deal with - * up to 16 Terrabyte physical memory. With current x86 CPUs - * we now support up to 64 Gigabytes physical RAM. - * - * Copyright (C) 1999 Ingo Molnar <mingo@redhat.com> - * - * Reworked for PowerPC by various contributors. Moved from - * highmem.h by Benjamin Herrenschmidt (c) 2009 IBM Corp. - */ - -#include <linux/highmem.h> -#include <linux/module.h> - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - WARN_ON(IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !pte_none(*(kmap_pte - idx))); - __set_pte_at(&init_mm, vaddr, kmap_pte-idx, mk_pte(page, prot), 1); - local_flush_tlb_page(NULL, vaddr); - - return (void*) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr < __fix_to_virt(FIX_KMAP_END)) - return; - - if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM)) { - int type = kmap_atomic_idx(); - unsigned int idx; - - idx = type + KM_TYPE_NR * smp_processor_id(); - WARN_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - local_flush_tlb_page(NULL, vaddr); - } - - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/powerpc/mm/mem.c b/arch/powerpc/mm/mem.c index 22eb1c718..1b74565b3 100644 --- a/arch/powerpc/mm/mem.c +++ b/arch/powerpc/mm/mem.c @@ -62,11 +62,6 @@ unsigned long long memory_limit; bool init_mem_is_free; -#ifdef CONFIG_HIGHMEM -pte_t *kmap_pte; -EXPORT_SYMBOL(kmap_pte); -#endif - pgprot_t phys_mem_access_prot(struct file *file, unsigned long pfn, unsigned long size, pgprot_t vma_prot) { @@ -236,8 +231,6 @@ void __init paging_init(void) map_kernel_page(PKMAP_BASE, 0, __pgprot(0)); /* XXX gross */ pkmap_page_table = virt_to_kpte(PKMAP_BASE); - - kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); #endif /* CONFIG_HIGHMEM */ printk(KERN_DEBUG "Top of RAM: 0x%llx, Total RAM: 0x%llx\n", diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c index 6c3bc4b4d..ec862846b 100644 --- a/arch/powerpc/platforms/powernv/opal-kmsg.c +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c @@ -20,7 +20,8 @@ * message, it just ensures that OPAL completely flushes the console buffer. */ static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) { /* * Outside of a panic context the pollers will continue to run, diff --git a/arch/powerpc/platforms/pseries/iommu.c b/arch/powerpc/platforms/pseries/iommu.c index 245f1f8df..f05555dde 100644 --- a/arch/powerpc/platforms/pseries/iommu.c +++ b/arch/powerpc/platforms/pseries/iommu.c @@ -24,6 +24,7 @@ #include <linux/of.h> #include <linux/iommu.h> #include <linux/rculist.h> +#include <linux/local_lock.h> #include <asm/io.h> #include <asm/prom.h> #include <asm/rtas.h> @@ -190,7 +191,13 @@ static int tce_build_pSeriesLP(unsigned long liobn, long tcenum, long tceshift, return ret; } -static DEFINE_PER_CPU(__be64 *, tce_page); +struct tce_page { + __be64 * page; + local_lock_t lock; +}; +static DEFINE_PER_CPU(struct tce_page, tce_page) = { + .lock = INIT_LOCAL_LOCK(lock), +}; static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, long npages, unsigned long uaddr, @@ -212,9 +219,10 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, direction, attrs); } - local_irq_save(flags); /* to protect tcep and the page behind it */ + /* to protect tcep and the page behind it */ + local_lock_irqsave(&tce_page.lock, flags); - tcep = __this_cpu_read(tce_page); + tcep = __this_cpu_read(tce_page.page); /* This is safe to do since interrupts are off when we're called * from iommu_alloc{,_sg}() @@ -223,12 +231,12 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcep = (__be64 *)__get_free_page(GFP_ATOMIC); /* If allocation fails, fall back to the loop implementation */ if (!tcep) { - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); return tce_build_pSeriesLP(tbl->it_index, tcenum, tbl->it_page_shift, npages, uaddr, direction, attrs); } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } rpn = __pa(uaddr) >> TCE_SHIFT; @@ -258,7 +266,7 @@ static int tce_buildmulti_pSeriesLP(struct iommu_table *tbl, long tcenum, tcenum += limit; } while (npages > 0 && !rc); - local_irq_restore(flags); + local_unlock_irqrestore(&tce_page.lock, flags); if (unlikely(rc == H_NOT_ENOUGH_RESOURCES)) { ret = (int)rc; @@ -429,16 +437,17 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, DMA_BIDIRECTIONAL, 0); } - local_irq_disable(); /* to protect tcep and the page behind it */ - tcep = __this_cpu_read(tce_page); + /* to protect tcep and the page behind it */ + local_lock_irq(&tce_page.lock); + tcep = __this_cpu_read(tce_page.page); if (!tcep) { tcep = (__be64 *)__get_free_page(GFP_ATOMIC); if (!tcep) { - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return -ENOMEM; } - __this_cpu_write(tce_page, tcep); + __this_cpu_write(tce_page.page, tcep); } proto_tce = TCE_PCI_READ | TCE_PCI_WRITE; @@ -481,7 +490,7 @@ static int tce_setrange_multi_pSeriesLP(unsigned long start_pfn, /* error cleanup: caller will clear whole range */ - local_irq_enable(); + local_unlock_irq(&tce_page.lock); return rc; } diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 5559edf36..d62b8e053 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3005,7 +3005,7 @@ print_address(unsigned long addr) static void dump_log_buf(void) { - struct kmsg_dumper dumper = { .active = 1 }; + struct kmsg_dumper_iter iter = { .active = 1 }; unsigned char buf[128]; size_t len; @@ -3017,9 +3017,9 @@ dump_log_buf(void) catch_memory_errors = 1; sync(); - kmsg_dump_rewind_nolock(&dumper); + kmsg_dump_rewind(&iter); xmon_start_pagination(); - while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) { + while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) { buf[len] = '\0'; printf("%s", buf); } diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index a7e386000..d3ed5b2ae 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -183,6 +183,7 @@ config S390 select HAVE_RSEQ select HAVE_SYSCALL_TRACEPOINTS select HAVE_VIRT_CPU_ACCOUNTING + select HAVE_VIRT_CPU_ACCOUNTING_IDLE select IOMMU_HELPER if PCI select IOMMU_SUPPORT if PCI select MODULES_USE_ELF_RELA diff --git a/arch/s390/include/asm/spinlock_types.h b/arch/s390/include/asm/spinlock_types.h index cfed272e4..8e28e8176 100644 --- a/arch/s390/include/asm/spinlock_types.h +++ b/arch/s390/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { int lock; } __attribute__ ((aligned (4))) arch_spinlock_t; diff --git a/arch/s390/include/asm/vtime.h b/arch/s390/include/asm/vtime.h index 3622d4ebc..fac6a6798 100644 --- a/arch/s390/include/asm/vtime.h +++ b/arch/s390/include/asm/vtime.h @@ -2,7 +2,6 @@ #ifndef _S390_VTIME_H #define _S390_VTIME_H -#define __ARCH_HAS_VTIME_ACCOUNT #define __ARCH_HAS_VTIME_TASK_SWITCH #endif /* _S390_VTIME_H */ diff --git a/arch/s390/kernel/vtime.c b/arch/s390/kernel/vtime.c index 579ec3a8c..9b3c5978b 100644 --- a/arch/s390/kernel/vtime.c +++ b/arch/s390/kernel/vtime.c @@ -223,35 +223,50 @@ void vtime_flush(struct task_struct *tsk) S390_lowcore.avg_steal_timer = avg_steal; } +static u64 vtime_delta(void) +{ + u64 timer = S390_lowcore.last_update_timer; + + S390_lowcore.last_update_timer = get_vtimer(); + + return timer - S390_lowcore.last_update_timer; +} + /* * Update process times based on virtual cpu times stored by entry.S * to the lowcore fields user_timer, system_timer & steal_clock. */ -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_kernel(struct task_struct *tsk) { - u64 timer; - - timer = S390_lowcore.last_update_timer; - S390_lowcore.last_update_timer = get_vtimer(); - timer -= S390_lowcore.last_update_timer; + u64 delta = vtime_delta(); - if ((tsk->flags & PF_VCPU) && (irq_count() == 0)) - S390_lowcore.guest_timer += timer; - else if (hardirq_count()) - S390_lowcore.hardirq_timer += timer; - else if (in_serving_softirq()) - S390_lowcore.softirq_timer += timer; + if (tsk->flags & PF_VCPU) + S390_lowcore.guest_timer += delta; else - S390_lowcore.system_timer += timer; + S390_lowcore.system_timer += delta; - virt_timer_forward(timer); + virt_timer_forward(delta); } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); - -void vtime_account_kernel(struct task_struct *tsk) -__attribute__((alias("vtime_account_irq_enter"))); EXPORT_SYMBOL_GPL(vtime_account_kernel); +void vtime_account_softirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.softirq_timer += delta; + + virt_timer_forward(delta); +} + +void vtime_account_hardirq(struct task_struct *tsk) +{ + u64 delta = vtime_delta(); + + S390_lowcore.hardirq_timer += delta; + + virt_timer_forward(delta); +} + /* * Sorted add to a list. List is linear searched until first bigger * element is found. diff --git a/arch/sh/include/asm/fixmap.h b/arch/sh/include/asm/fixmap.h index f38adc189..b07fbc7f7 100644 --- a/arch/sh/include/asm/fixmap.h +++ b/arch/sh/include/asm/fixmap.h @@ -13,9 +13,6 @@ #include <linux/kernel.h> #include <linux/threads.h> #include <asm/page.h> -#ifdef CONFIG_HIGHMEM -#include <asm/kmap_types.h> -#endif /* * Here we define all the compile-time 'special' virtual @@ -53,11 +50,6 @@ enum fixed_addresses { FIX_CMAP_BEGIN, FIX_CMAP_END = FIX_CMAP_BEGIN + (FIX_N_COLOURS * NR_CPUS) - 1, -#ifdef CONFIG_HIGHMEM - FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_TYPE_NR * NR_CPUS) - 1, -#endif - #ifdef CONFIG_IOREMAP_FIXED /* * FIX_IOREMAP entries are useful for mapping physical address diff --git a/arch/sh/include/asm/hardirq.h b/arch/sh/include/asm/hardirq.h index edaea3559..9fe4495a8 100644 --- a/arch/sh/include/asm/hardirq.h +++ b/arch/sh/include/asm/hardirq.h @@ -2,16 +2,10 @@ #ifndef __ASM_SH_HARDIRQ_H #define __ASM_SH_HARDIRQ_H -#include <linux/threads.h> -#include <linux/irq.h> - -typedef struct { - unsigned int __softirq_pending; - unsigned int __nmi_count; /* arch dependent */ -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ - extern void ack_bad_irq(unsigned int irq); +#define ack_bad_irq ack_bad_irq +#define ARCH_WANTS_NMI_IRQSTAT + +#include <asm-generic/hardirq.h> #endif /* __ASM_SH_HARDIRQ_H */ diff --git a/arch/sh/include/asm/kmap_types.h b/arch/sh/include/asm/kmap_types.h deleted file mode 100644 index b78107f92..000000000 --- a/arch/sh/include/asm/kmap_types.h +++ /dev/null @@ -1,15 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __SH_KMAP_TYPES_H -#define __SH_KMAP_TYPES_H - -/* Dummy header just to define km_type. */ - -#ifdef CONFIG_DEBUG_HIGHMEM -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif diff --git a/arch/sh/include/asm/spinlock_types.h b/arch/sh/include/asm/spinlock_types.h index e82369f28..22ca9a98b 100644 --- a/arch/sh/include/asm/spinlock_types.h +++ b/arch/sh/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef __ASM_SH_SPINLOCK_TYPES_H #define __ASM_SH_SPINLOCK_TYPES_H -#ifndef __LINUX_SPINLOCK_TYPES_H -# error "please don't include this file directly" -#endif - typedef struct { volatile unsigned int lock; } arch_spinlock_t; diff --git a/arch/sh/kernel/irq.c b/arch/sh/kernel/irq.c index 5717c7cbd..5db7af565 100644 --- a/arch/sh/kernel/irq.c +++ b/arch/sh/kernel/irq.c @@ -44,7 +44,7 @@ int arch_show_interrupts(struct seq_file *p, int prec) seq_printf(p, "%*s: ", prec, "NMI"); for_each_online_cpu(j) - seq_printf(p, "%10u ", nmi_count(j)); + seq_printf(p, "%10u ", per_cpu(irq_stat.__nmi_count, j)); seq_printf(p, " Non-maskable interrupts\n"); seq_printf(p, "%*s: %10u\n", prec, "ERR", atomic_read(&irq_err_count)); @@ -148,6 +148,7 @@ void irq_ctx_exit(int cpu) hardirq_ctx[cpu] = NULL; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct thread_info *curctx; @@ -175,6 +176,7 @@ void do_softirq_own_stack(void) "r5", "r6", "r7", "r8", "r9", "r15", "t", "pr" ); } +#endif #else static inline void handle_one_irq(unsigned int irq) { diff --git a/arch/sh/kernel/traps.c b/arch/sh/kernel/traps.c index 9c3d32b80..f5beecdac 100644 --- a/arch/sh/kernel/traps.c +++ b/arch/sh/kernel/traps.c @@ -186,7 +186,7 @@ BUILD_TRAP_HANDLER(nmi) arch_ftrace_nmi_enter(); nmi_enter(); - nmi_count(cpu)++; + this_cpu_inc(irq_stat.__nmi_count); switch (notify_die(DIE_NMI, "NMI", regs, 0, vec & 0xff, SIGINT)) { case NOTIFY_OK: diff --git a/arch/sh/mm/init.c b/arch/sh/mm/init.c index 3348e0c4d..0db6919af 100644 --- a/arch/sh/mm/init.c +++ b/arch/sh/mm/init.c @@ -362,9 +362,6 @@ void __init mem_init(void) mem_init_print_info(NULL); pr_info("virtual kernel memory layout:\n" " fixmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -#ifdef CONFIG_HIGHMEM - " pkmap : 0x%08lx - 0x%08lx (%4ld kB)\n" -#endif " vmalloc : 0x%08lx - 0x%08lx (%4ld MB)\n" " lowmem : 0x%08lx - 0x%08lx (%4ld MB) (cached)\n" #ifdef CONFIG_UNCACHED_MAPPING @@ -376,11 +373,6 @@ void __init mem_init(void) FIXADDR_START, FIXADDR_TOP, (FIXADDR_TOP - FIXADDR_START) >> 10, -#ifdef CONFIG_HIGHMEM - PKMAP_BASE, PKMAP_BASE+LAST_PKMAP*PAGE_SIZE, - (LAST_PKMAP*PAGE_SIZE) >> 10, -#endif - (unsigned long)VMALLOC_START, VMALLOC_END, (VMALLOC_END - VMALLOC_START) >> 20, diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index 530b7ec5d..a38d00d8b 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -139,6 +139,7 @@ config MMU config HIGHMEM bool default y if SPARC32 + select KMAP_LOCAL config ZONE_DMA bool diff --git a/arch/sparc/include/asm/highmem.h b/arch/sparc/include/asm/highmem.h index 6c35f0d27..875116209 100644 --- a/arch/sparc/include/asm/highmem.h +++ b/arch/sparc/include/asm/highmem.h @@ -24,7 +24,6 @@ #include <linux/interrupt.h> #include <linux/pgtable.h> #include <asm/vaddrs.h> -#include <asm/kmap_types.h> #include <asm/pgtsrmmu.h> /* declarations for highmem.c */ @@ -33,8 +32,6 @@ extern unsigned long highstart_pfn, highend_pfn; #define kmap_prot __pgprot(SRMMU_ET_PTE | SRMMU_PRIV | SRMMU_CACHE) extern pte_t *pkmap_page_table; -void kmap_init(void) __init; - /* * Right now we initialize only a single pte table. It can be extended * easily, subsequent pte tables have to be allocated in one physical @@ -53,6 +50,11 @@ void kmap_init(void) __init; #define flush_cache_kmaps() flush_cache_all() +/* FIXME: Use __flush_tlb_one(vaddr) instead of flush_cache_all() -- Anton */ +#define arch_kmap_local_post_map(vaddr, pteval) flush_cache_all() +#define arch_kmap_local_post_unmap(vaddr) flush_cache_all() + + #endif /* __KERNEL__ */ #endif /* _ASM_HIGHMEM_H */ diff --git a/arch/sparc/include/asm/kmap_types.h b/arch/sparc/include/asm/kmap_types.h deleted file mode 100644 index 55a99b6bd..000000000 --- a/arch/sparc/include/asm/kmap_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_KMAP_TYPES_H -#define _ASM_KMAP_TYPES_H - -/* Dummy header just to define km_type. None of this - * is actually used on sparc. -DaveM - */ - -#include <asm-generic/kmap_types.h> - -#endif diff --git a/arch/sparc/include/asm/vaddrs.h b/arch/sparc/include/asm/vaddrs.h index 84d054b07..4fec0341e 100644 --- a/arch/sparc/include/asm/vaddrs.h +++ b/arch/sparc/include/asm/vaddrs.h @@ -32,13 +32,13 @@ #define SRMMU_NOCACHE_ALCRATIO 64 /* 256 pages per 64MB of system RAM */ #ifndef __ASSEMBLY__ -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> enum fixed_addresses { FIX_HOLE, #ifdef CONFIG_HIGHMEM FIX_KMAP_BEGIN, - FIX_KMAP_END = (KM_TYPE_NR * NR_CPUS), + FIX_KMAP_END = (KM_MAX_IDX * NR_CPUS), #endif __end_of_fixed_addresses }; diff --git a/arch/sparc/kernel/irq_64.c b/arch/sparc/kernel/irq_64.c index 3ec9f1402..eb21682ab 100644 --- a/arch/sparc/kernel/irq_64.c +++ b/arch/sparc/kernel/irq_64.c @@ -854,6 +854,7 @@ void __irq_entry handler_irq(int pil, struct pt_regs *regs) set_irq_regs(old_regs); } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { void *orig_sp, *sp = softirq_stack[smp_processor_id()]; @@ -868,6 +869,7 @@ void do_softirq_own_stack(void) __asm__ __volatile__("mov %0, %%sp" : : "r" (orig_sp)); } +#endif #ifdef CONFIG_HOTPLUG_CPU void fixup_irqs(void) diff --git a/arch/sparc/mm/Makefile b/arch/sparc/mm/Makefile index b078205b7..68db1f859 100644 --- a/arch/sparc/mm/Makefile +++ b/arch/sparc/mm/Makefile @@ -15,6 +15,3 @@ obj-$(CONFIG_SPARC32) += leon_mm.o # Only used by sparc64 obj-$(CONFIG_HUGETLB_PAGE) += hugetlbpage.o - -# Only used by sparc32 -obj-$(CONFIG_HIGHMEM) += highmem.o diff --git a/arch/sparc/mm/highmem.c b/arch/sparc/mm/highmem.c deleted file mode 100644 index 8f2a2afb0..000000000 --- a/arch/sparc/mm/highmem.c +++ /dev/null @@ -1,115 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * highmem.c: virtual kernel memory mappings for high memory - * - * Provides kernel-static versions of atomic kmap functions originally - * found as inlines in include/asm-sparc/highmem.h. These became - * needed as kmap_atomic() and kunmap_atomic() started getting - * called from within modules. - * -- Tomas Szepe <szepe@pinerecords.com>, September 2002 - * - * But kmap_atomic() and kunmap_atomic() cannot be inlined in - * modules because they are loaded with btfixup-ped functions. - */ - -/* - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need it. - * - * XXX This is an old text. Actually, it's good to use atomic kmaps, - * provided you remember that they are atomic and not try to sleep - * with a kmap taken, much like a spinlock. Non-atomic kmaps are - * shared by CPUs, and so precious, and establishing them requires IPI. - * Atomic kmaps are lightweight and we may have NCPUS more of them. - */ -#include <linux/highmem.h> -#include <linux/export.h> -#include <linux/mm.h> - -#include <asm/cacheflush.h> -#include <asm/tlbflush.h> -#include <asm/vaddrs.h> - -static pte_t *kmap_pte; - -void __init kmap_init(void) -{ - unsigned long address = __fix_to_virt(FIX_KMAP_BEGIN); - - /* cache the first kmap pte */ - kmap_pte = virt_to_kpte(address); -} - -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - long idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - -/* XXX Fix - Anton */ -#if 0 - __flush_cache_one(vaddr); -#else - flush_cache_all(); -#endif - -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte-idx))); -#endif - set_pte(kmap_pte-idx, mk_pte(page, prot)); -/* XXX Fix - Anton */ -#if 0 - __flush_tlb_one(vaddr); -#else - flush_tlb_all(); -#endif - - return (void*) vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - int type; - - if (vaddr < FIXADDR_START) - return; - - type = kmap_atomic_idx(); - -#ifdef CONFIG_DEBUG_HIGHMEM - { - unsigned long idx; - - idx = type + KM_TYPE_NR * smp_processor_id(); - BUG_ON(vaddr != __fix_to_virt(FIX_KMAP_BEGIN+idx)); - - /* XXX Fix - Anton */ -#if 0 - __flush_cache_one(vaddr); -#else - flush_cache_all(); -#endif - - /* - * force other mappings to Oops if they'll try to access - * this pte without first remap it - */ - pte_clear(&init_mm, vaddr, kmap_pte-idx); - /* XXX Fix - Anton */ -#if 0 - __flush_tlb_one(vaddr); -#else - flush_tlb_all(); -#endif - } -#endif - - kmap_atomic_idx_pop(); -} -EXPORT_SYMBOL(kunmap_atomic_high); diff --git a/arch/sparc/mm/srmmu.c b/arch/sparc/mm/srmmu.c index 0070f8b9a..a03caa5f6 100644 --- a/arch/sparc/mm/srmmu.c +++ b/arch/sparc/mm/srmmu.c @@ -971,8 +971,6 @@ void __init srmmu_paging_init(void) sparc_context_init(num_contexts); - kmap_init(); - { unsigned long max_zone_pfn[MAX_NR_ZONES] = { 0 }; diff --git a/arch/um/include/asm/fixmap.h b/arch/um/include/asm/fixmap.h index 2c697a145..2efac5827 100644 --- a/arch/um/include/asm/fixmap.h +++ b/arch/um/include/asm/fixmap.h @@ -3,7 +3,6 @@ #define __UM_FIXMAP_H #include <asm/processor.h> -#include <asm/kmap_types.h> #include <asm/archparam.h> #include <asm/page.h> #include <linux/threads.h> diff --git a/arch/um/include/asm/hardirq.h b/arch/um/include/asm/hardirq.h index b426796d2..52e2c3626 100644 --- a/arch/um/include/asm/hardirq.h +++ b/arch/um/include/asm/hardirq.h @@ -2,22 +2,7 @@ #ifndef __ASM_UM_HARDIRQ_H #define __ASM_UM_HARDIRQ_H -#include <linux/cache.h> -#include <linux/threads.h> - -typedef struct { - unsigned int __softirq_pending; -} ____cacheline_aligned irq_cpustat_t; - -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ -#include <linux/irq.h> - -#ifndef ack_bad_irq -static inline void ack_bad_irq(unsigned int irq) -{ - printk(KERN_CRIT "unexpected IRQ trap at vector %02x\n", irq); -} -#endif +#include <asm-generic/hardirq.h> #define __ARCH_IRQ_EXIT_IRQS_DISABLED 1 diff --git a/arch/um/include/asm/kmap_types.h b/arch/um/include/asm/kmap_types.h deleted file mode 100644 index b0bd12de1..000000000 --- a/arch/um/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -/* - * Copyright (C) 2002 Jeff Dike (jdike@karaya.com) - */ - -#ifndef __UM_KMAP_TYPES_H -#define __UM_KMAP_TYPES_H - -/* No more #include "asm/arch/kmap_types.h" ! */ - -#define KM_TYPE_NR 14 - -#endif diff --git a/arch/um/kernel/kmsg_dump.c b/arch/um/kernel/kmsg_dump.c index e4abac6c9..173999422 100644 --- a/arch/um/kernel/kmsg_dump.c +++ b/arch/um/kernel/kmsg_dump.c @@ -1,15 +1,19 @@ // SPDX-License-Identifier: GPL-2.0 #include <linux/kmsg_dump.h> +#include <linux/spinlock.h> #include <linux/console.h> #include <shared/init.h> #include <shared/kern.h> #include <os.h> static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) { + static DEFINE_SPINLOCK(lock); static char line[1024]; struct console *con; + unsigned long flags; size_t len = 0; /* only dump kmsg when no console is available */ @@ -24,11 +28,16 @@ static void kmsg_dumper_stdout(struct kmsg_dumper *dumper, if (con) return; + if (!spin_trylock_irqsave(&lock, flags)) + return; + printf("kmsg_dump:\n"); - while (kmsg_dump_get_line(dumper, true, line, sizeof(line), &len)) { + while (kmsg_dump_get_line(iter, true, line, sizeof(line), &len)) { line[len] = '\0'; printf("%s", line); } + + spin_unlock_irqrestore(&lock, flags); } static struct kmsg_dumper kmsg_dumper = { diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 040fb7736..79c0da581 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -16,6 +16,7 @@ config X86_32 select CLKSRC_I8253 select CLONE_BACKWARDS select HAVE_DEBUG_STACKOVERFLOW + select KMAP_LOCAL select MODULES_USE_ELF_REL select OLD_SIGACTION select GENERIC_VDSO_32 @@ -95,6 +96,7 @@ config X86 select ARCH_SUPPORTS_ACPI select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING if X86_64 + select ARCH_SUPPORTS_RT select ARCH_USE_BUILTIN_BSWAP select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS @@ -216,6 +218,7 @@ config X86 select HAVE_PCI select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP + select HAVE_PREEMPT_LAZY select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT select HAVE_POSIX_CPU_TIMERS_TASK_WORK select HAVE_REGS_AND_STACK_ACCESS_API diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c index be891fdf8..29c716ed1 100644 --- a/arch/x86/crypto/aesni-intel_glue.c +++ b/arch/x86/crypto/aesni-intel_glue.c @@ -379,14 +379,14 @@ static int ecb_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -401,14 +401,14 @@ static int ecb_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_ecb_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -423,14 +423,14 @@ static int cbc_encrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_enc(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -445,14 +445,14 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); while ((nbytes = walk.nbytes)) { + kernel_fpu_begin(); aesni_cbc_dec(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } - kernel_fpu_end(); return err; } @@ -500,18 +500,20 @@ static int ctr_crypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, true); - kernel_fpu_begin(); while ((nbytes = walk.nbytes) >= AES_BLOCK_SIZE) { + kernel_fpu_begin(); aesni_ctr_enc_tfm(ctx, walk.dst.virt.addr, walk.src.virt.addr, nbytes & AES_BLOCK_MASK, walk.iv); + kernel_fpu_end(); nbytes &= AES_BLOCK_SIZE - 1; err = skcipher_walk_done(&walk, nbytes); } if (walk.nbytes) { + kernel_fpu_begin(); ctr_crypt_final(ctx, &walk); + kernel_fpu_end(); err = skcipher_walk_done(&walk, 0); } - kernel_fpu_end(); return err; } diff --git a/arch/x86/crypto/cast5_avx_glue.c b/arch/x86/crypto/cast5_avx_glue.c index 384ccb00f..2f8df8ef8 100644 --- a/arch/x86/crypto/cast5_avx_glue.c +++ b/arch/x86/crypto/cast5_avx_glue.c @@ -46,7 +46,7 @@ static inline void cast5_fpu_end(bool fpu_enabled) static int ecb_crypt(struct skcipher_request *req, bool enc) { - bool fpu_enabled = false; + bool fpu_enabled; struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); struct skcipher_walk walk; @@ -61,7 +61,7 @@ static int ecb_crypt(struct skcipher_request *req, bool enc) u8 *wsrc = walk.src.virt.addr; u8 *wdst = walk.dst.virt.addr; - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); + fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); /* Process multi-block batch */ if (nbytes >= bsize * CAST5_PARALLEL_BLOCKS) { @@ -90,10 +90,9 @@ static int ecb_crypt(struct skcipher_request *req, bool enc) } while (nbytes >= bsize); done: + cast5_fpu_end(fpu_enabled); err = skcipher_walk_done(&walk, nbytes); } - - cast5_fpu_end(fpu_enabled); return err; } @@ -197,7 +196,7 @@ static int cbc_decrypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; + bool fpu_enabled; struct skcipher_walk walk; unsigned int nbytes; int err; @@ -205,12 +204,11 @@ static int cbc_decrypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes)) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); + fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); nbytes = __cbc_decrypt(ctx, &walk); + cast5_fpu_end(fpu_enabled); err = skcipher_walk_done(&walk, nbytes); } - - cast5_fpu_end(fpu_enabled); return err; } @@ -277,7 +275,7 @@ static int ctr_crypt(struct skcipher_request *req) { struct crypto_skcipher *tfm = crypto_skcipher_reqtfm(req); struct cast5_ctx *ctx = crypto_skcipher_ctx(tfm); - bool fpu_enabled = false; + bool fpu_enabled; struct skcipher_walk walk; unsigned int nbytes; int err; @@ -285,13 +283,12 @@ static int ctr_crypt(struct skcipher_request *req) err = skcipher_walk_virt(&walk, req, false); while ((nbytes = walk.nbytes) >= CAST5_BLOCK_SIZE) { - fpu_enabled = cast5_fpu_begin(fpu_enabled, &walk, nbytes); + fpu_enabled = cast5_fpu_begin(false, &walk, nbytes); nbytes = __ctr_crypt(&walk, ctx); + cast5_fpu_end(fpu_enabled); err = skcipher_walk_done(&walk, nbytes); } - cast5_fpu_end(fpu_enabled); - if (walk.nbytes) { ctr_crypt_final(&walk, ctx); err = skcipher_walk_done(&walk, 0); diff --git a/arch/x86/crypto/glue_helper.c b/arch/x86/crypto/glue_helper.c index d3d91a0ab..6d0774721 100644 --- a/arch/x86/crypto/glue_helper.c +++ b/arch/x86/crypto/glue_helper.c @@ -24,7 +24,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); const unsigned int bsize = 128 / 8; struct skcipher_walk walk; - bool fpu_enabled = false; + bool fpu_enabled; unsigned int nbytes; int err; @@ -37,7 +37,7 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, unsigned int i; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); + &walk, false, nbytes); for (i = 0; i < gctx->num_funcs; i++) { func_bytes = bsize * gctx->funcs[i].num_blocks; @@ -55,10 +55,9 @@ int glue_ecb_req_128bit(const struct common_glue_ctx *gctx, if (nbytes < bsize) break; } + glue_fpu_end(fpu_enabled); err = skcipher_walk_done(&walk, nbytes); } - - glue_fpu_end(fpu_enabled); return err; } EXPORT_SYMBOL_GPL(glue_ecb_req_128bit); @@ -101,7 +100,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); const unsigned int bsize = 128 / 8; struct skcipher_walk walk; - bool fpu_enabled = false; + bool fpu_enabled; unsigned int nbytes; int err; @@ -115,7 +114,7 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, u128 last_iv; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); + &walk, false, nbytes); /* Start of the last block. */ src += nbytes / bsize - 1; dst += nbytes / bsize - 1; @@ -148,10 +147,10 @@ int glue_cbc_decrypt_req_128bit(const struct common_glue_ctx *gctx, done: u128_xor(dst, dst, (u128 *)walk.iv); *(u128 *)walk.iv = last_iv; + glue_fpu_end(fpu_enabled); err = skcipher_walk_done(&walk, nbytes); } - glue_fpu_end(fpu_enabled); return err; } EXPORT_SYMBOL_GPL(glue_cbc_decrypt_req_128bit); @@ -162,7 +161,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, void *ctx = crypto_skcipher_ctx(crypto_skcipher_reqtfm(req)); const unsigned int bsize = 128 / 8; struct skcipher_walk walk; - bool fpu_enabled = false; + bool fpu_enabled; unsigned int nbytes; int err; @@ -176,7 +175,7 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, le128 ctrblk; fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, - &walk, fpu_enabled, nbytes); + &walk, false, nbytes); be128_to_le128(&ctrblk, (be128 *)walk.iv); @@ -202,11 +201,10 @@ int glue_ctr_req_128bit(const struct common_glue_ctx *gctx, } le128_to_be128((be128 *)walk.iv, &ctrblk); + glue_fpu_end(fpu_enabled); err = skcipher_walk_done(&walk, nbytes); } - glue_fpu_end(fpu_enabled); - if (nbytes) { le128 ctrblk; u128 tmp; @@ -306,8 +304,14 @@ int glue_xts_req_128bit(const struct common_glue_ctx *gctx, tweak_fn(tweak_ctx, walk.iv, walk.iv); while (nbytes) { + fpu_enabled = glue_fpu_begin(bsize, gctx->fpu_blocks_limit, + &walk, fpu_enabled, + nbytes < bsize ? bsize : nbytes); nbytes = __glue_xts_req_128bit(gctx, crypt_ctx, &walk); + glue_fpu_end(fpu_enabled); + fpu_enabled = false; + err = skcipher_walk_done(&walk, nbytes); nbytes = walk.nbytes; } diff --git a/arch/x86/include/asm/fixmap.h b/arch/x86/include/asm/fixmap.h index 77217bd29..8eba66a33 100644 --- a/arch/x86/include/asm/fixmap.h +++ b/arch/x86/include/asm/fixmap.h @@ -31,7 +31,7 @@ #include <asm/pgtable_types.h> #ifdef CONFIG_X86_32 #include <linux/threads.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #else #include <uapi/asm/vsyscall.h> #endif @@ -94,7 +94,7 @@ enum fixed_addresses { #endif #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ - FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, + FIX_KMAP_END = FIX_KMAP_BEGIN + (KM_MAX_IDX * NR_CPUS) - 1, #ifdef CONFIG_PCI_MMCONFIG FIX_PCIE_MCFG, #endif @@ -151,7 +151,6 @@ extern void reserve_top_address(unsigned long reserve); extern int fixmaps_set; -extern pte_t *kmap_pte; extern pte_t *pkmap_page_table; void __native_set_fixmap(enum fixed_addresses idx, pte_t pte); diff --git a/arch/x86/include/asm/fpu/api.h b/arch/x86/include/asm/fpu/api.h index 8b9bfaad6..d31b08865 100644 --- a/arch/x86/include/asm/fpu/api.h +++ b/arch/x86/include/asm/fpu/api.h @@ -28,6 +28,7 @@ extern void kernel_fpu_begin_mask(unsigned int kfpu_mask); extern void kernel_fpu_end(void); extern bool irq_fpu_usable(void); extern void fpregs_mark_activate(void); +extern void kernel_fpu_resched(void); /* Code that is unaware of kernel_fpu_begin_mask() can use this */ static inline void kernel_fpu_begin(void) @@ -40,17 +41,32 @@ static inline void kernel_fpu_begin(void) * A context switch will (and softirq might) save CPU's FPU registers to * fpu->state and set TIF_NEED_FPU_LOAD leaving CPU's FPU registers in * a random state. + * + * local_bh_disable() protects against both preemption and soft interrupts + * on !RT kernels. + * + * On RT kernels local_bh_disable() is not sufficient because it only + * serializes soft interrupt related sections via a local lock, but stays + * preemptible. Disabling preemption is the right choice here as bottom + * half processing is always in thread context on RT kernels so it + * implicitly prevents bottom half processing as well. + * + * Disabling preemption also serializes against kernel_fpu_begin(). */ static inline void fpregs_lock(void) { - preempt_disable(); - local_bh_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_disable(); + else + preempt_disable(); } static inline void fpregs_unlock(void) { - local_bh_enable(); - preempt_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_bh_enable(); + else + preempt_enable(); } #ifdef CONFIG_X86_DEBUG_FPU diff --git a/arch/x86/include/asm/highmem.h b/arch/x86/include/asm/highmem.h index 0f420b24e..032e02085 100644 --- a/arch/x86/include/asm/highmem.h +++ b/arch/x86/include/asm/highmem.h @@ -23,7 +23,6 @@ #include <linux/interrupt.h> #include <linux/threads.h> -#include <asm/kmap_types.h> #include <asm/tlbflush.h> #include <asm/paravirt.h> #include <asm/fixmap.h> @@ -58,11 +57,17 @@ extern unsigned long highstart_pfn, highend_pfn; #define PKMAP_NR(virt) ((virt-PKMAP_BASE) >> PAGE_SHIFT) #define PKMAP_ADDR(nr) (PKMAP_BASE + ((nr) << PAGE_SHIFT)) -void *kmap_atomic_pfn(unsigned long pfn); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); - #define flush_cache_kmaps() do { } while (0) +#define arch_kmap_local_post_map(vaddr, pteval) \ + arch_flush_lazy_mmu_mode() + +#define arch_kmap_local_post_unmap(vaddr) \ + do { \ + flush_tlb_one_kernel((vaddr)); \ + arch_flush_lazy_mmu_mode(); \ + } while (0) + extern void add_highpages_with_active_regions(int nid, unsigned long start_pfn, unsigned long end_pfn); diff --git a/arch/x86/include/asm/iomap.h b/arch/x86/include/asm/iomap.h index bacf68c4d..e2de092fc 100644 --- a/arch/x86/include/asm/iomap.h +++ b/arch/x86/include/asm/iomap.h @@ -9,19 +9,14 @@ #include <linux/fs.h> #include <linux/mm.h> #include <linux/uaccess.h> +#include <linux/highmem.h> #include <asm/cacheflush.h> #include <asm/tlbflush.h> -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot); +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot); -void -iounmap_atomic(void __iomem *kvaddr); +int iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); -int -iomap_create_wc(resource_size_t base, unsigned long size, pgprot_t *prot); - -void -iomap_free(resource_size_t base, unsigned long size); +void iomap_free(resource_size_t base, unsigned long size); #endif /* _ASM_X86_IOMAP_H */ diff --git a/arch/x86/include/asm/kmap_types.h b/arch/x86/include/asm/kmap_types.h deleted file mode 100644 index 04ab8266e..000000000 --- a/arch/x86/include/asm/kmap_types.h +++ /dev/null @@ -1,13 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_X86_KMAP_TYPES_H -#define _ASM_X86_KMAP_TYPES_H - -#if defined(CONFIG_X86_32) && defined(CONFIG_DEBUG_HIGHMEM) -#define __WITH_KM_FENCE -#endif - -#include <asm-generic/kmap_types.h> - -#undef __WITH_KM_FENCE - -#endif /* _ASM_X86_KMAP_TYPES_H */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index b30b56d47..9632218bf 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -43,7 +43,6 @@ #ifndef __ASSEMBLY__ #include <asm/desc_defs.h> -#include <asm/kmap_types.h> #include <asm/pgtable_types.h> #include <asm/nospec-branch.h> diff --git a/arch/x86/include/asm/preempt.h b/arch/x86/include/asm/preempt.h index 2380df419..aacfaad6c 100644 --- a/arch/x86/include/asm/preempt.h +++ b/arch/x86/include/asm/preempt.h @@ -90,21 +90,54 @@ static __always_inline void __preempt_count_sub(int val) * a decrement which hits zero means we have no preempt_count and should * reschedule. */ -static __always_inline bool __preempt_count_dec_and_test(void) +static __always_inline bool ____preempt_count_dec_and_test(void) { return GEN_UNARY_RMWcc("decl", __preempt_count, e, __percpu_arg([var])); } +static __always_inline bool __preempt_count_dec_and_test(void) +{ + if (____preempt_count_dec_and_test()) + return true; +#ifdef CONFIG_PREEMPT_LAZY + if (preempt_count()) + return false; + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else + return false; +#endif +} + /* * Returns true when we need to resched and can (barring IRQ state). */ static __always_inline bool should_resched(int preempt_offset) { +#ifdef CONFIG_PREEMPT_LAZY + u32 tmp; + tmp = raw_cpu_read_4(__preempt_count); + if (tmp == preempt_offset) + return true; + + /* preempt count == 0 ? */ + tmp &= ~PREEMPT_NEED_RESCHED; + if (tmp != preempt_offset) + return false; + /* XXX PREEMPT_LOCK_OFFSET */ + if (current_thread_info()->preempt_lazy_count) + return false; + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +#else return unlikely(raw_cpu_read_4(__preempt_count) == preempt_offset); +#endif } #ifdef CONFIG_PREEMPTION - +#ifdef CONFIG_PREEMPT_RT + extern void preempt_schedule_lock(void); +#endif extern asmlinkage void preempt_schedule(void); extern asmlinkage void preempt_schedule_thunk(void); diff --git a/arch/x86/include/asm/signal.h b/arch/x86/include/asm/signal.h index 6fd8410a3..f3bf2f515 100644 --- a/arch/x86/include/asm/signal.h +++ b/arch/x86/include/asm/signal.h @@ -28,6 +28,19 @@ typedef struct { #define SA_IA32_ABI 0x02000000u #define SA_X32_ABI 0x01000000u +/* + * Because some traps use the IST stack, we must keep preemption + * disabled while calling do_trap(), but do_trap() may call + * force_sig_info() which will grab the signal spin_locks for the + * task, which in PREEMPT_RT are mutexes. By defining + * ARCH_RT_DELAYS_SIGNAL_SEND the force_sig_info() will set + * TIF_NOTIFY_RESUME and set up the signal to be sent on exit of the + * trap. + */ +#if defined(CONFIG_PREEMPT_RT) +#define ARCH_RT_DELAYS_SIGNAL_SEND +#endif + #ifndef CONFIG_COMPAT typedef sigset_t compat_sigset_t; #endif diff --git a/arch/x86/include/asm/stackprotector.h b/arch/x86/include/asm/stackprotector.h index 7fb482f0f..3df0a95c9 100644 --- a/arch/x86/include/asm/stackprotector.h +++ b/arch/x86/include/asm/stackprotector.h @@ -65,7 +65,7 @@ */ static __always_inline void boot_init_stack_canary(void) { - u64 canary; + u64 canary = 0; u64 tsc; #ifdef CONFIG_X86_64 @@ -76,8 +76,14 @@ static __always_inline void boot_init_stack_canary(void) * of randomness. The TSC only matters for very early init, * there it already has some randomness on most systems. Later * on during the bootup the random pool has true entropy too. + * For preempt-rt we need to weaken the randomness a bit, as + * we can't call into the random generator from atomic context + * due to locking constraints. We just leave canary + * uninitialized and use the TSC based randomness on top of it. */ +#ifndef CONFIG_PREEMPT_RT get_random_bytes(&canary, sizeof(canary)); +#endif tsc = rdtsc(); canary += tsc + (tsc << 32UL); canary &= CANARY_MASK; diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h index a225c6e2c..ad34b468a 100644 --- a/arch/x86/include/asm/thread_info.h +++ b/arch/x86/include/asm/thread_info.h @@ -60,6 +60,8 @@ struct thread_info { #ifdef CONFIG_SMP u32 cpu; /* current CPU */ #endif + int preempt_lazy_count; /* 0 => lazy preemptable + <0 => BUG */ KABI_RESERVE(1) KABI_RESERVE(2) }; @@ -67,12 +69,17 @@ struct thread_info { #define INIT_THREAD_INFO(tsk) \ { \ .flags = 0, \ + .preempt_lazy_count = 0, \ } #else /* !__ASSEMBLY__ */ #include <asm/asm-offsets.h> +#define GET_THREAD_INFO(reg) \ + _ASM_MOV PER_CPU_VAR(cpu_current_top_of_stack),reg ; \ + _ASM_SUB $(THREAD_SIZE),reg ; + #endif /* @@ -99,6 +106,7 @@ struct thread_info { #define TIF_NOTSC 16 /* TSC is not accessible in userland */ #define TIF_IA32 17 /* IA32 compatibility process */ #define TIF_SLD 18 /* Restore split lock detection on context switch */ +#define TIF_NEED_RESCHED_LAZY 19 /* lazy rescheduling necessary */ #define TIF_MEMDIE 20 /* is terminating due to OOM killer */ #define TIF_POLLING_NRFLAG 21 /* idle is polling for TIF_NEED_RESCHED */ #define TIF_IO_BITMAP 22 /* uses I/O bitmap */ @@ -128,6 +136,7 @@ struct thread_info { #define _TIF_NOTSC (1 << TIF_NOTSC) #define _TIF_IA32 (1 << TIF_IA32) #define _TIF_SLD (1 << TIF_SLD) +#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) #define _TIF_POLLING_NRFLAG (1 << TIF_POLLING_NRFLAG) #define _TIF_IO_BITMAP (1 << TIF_IO_BITMAP) #define _TIF_FORCED_TF (1 << TIF_FORCED_TF) @@ -160,6 +169,8 @@ struct thread_info { #define _TIF_WORK_CTXSW_NEXT (_TIF_WORK_CTXSW) +#define _TIF_NEED_RESCHED_MASK (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY) + #define STACK_WARN (THREAD_SIZE/8) /* diff --git a/arch/x86/kernel/cpu/mshyperv.c b/arch/x86/kernel/cpu/mshyperv.c index 65d11711c..180981654 100644 --- a/arch/x86/kernel/cpu/mshyperv.c +++ b/arch/x86/kernel/cpu/mshyperv.c @@ -80,11 +80,12 @@ EXPORT_SYMBOL_GPL(hv_remove_vmbus_irq); DEFINE_IDTENTRY_SYSVEC(sysvec_hyperv_stimer0) { struct pt_regs *old_regs = set_irq_regs(regs); + u64 ip = regs ? instruction_pointer(regs) : 0; inc_irq_stat(hyperv_stimer0_count); if (hv_stimer0_handler) hv_stimer0_handler(); - add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0); + add_interrupt_randomness(HYPERV_STIMER0_VECTOR, 0, ip); ack_APIC_irq(); set_irq_regs(old_regs); diff --git a/arch/x86/kernel/crash_dump_32.c b/arch/x86/kernel/crash_dump_32.c index 33ee47670..5fcac46aa 100644 --- a/arch/x86/kernel/crash_dump_32.c +++ b/arch/x86/kernel/crash_dump_32.c @@ -13,8 +13,6 @@ #include <linux/uaccess.h> -static void *kdump_buf_page; - static inline bool is_crashed_pfn_valid(unsigned long pfn) { #ifndef CONFIG_X86_PAE @@ -41,15 +39,11 @@ static inline bool is_crashed_pfn_valid(unsigned long pfn) * @userbuf: if set, @buf is in user address space, use copy_to_user(), * otherwise @buf is in kernel address space, use memcpy(). * - * Copy a page from "oldmem". For this page, there is no pte mapped - * in the current kernel. We stitch up a pte, similar to kmap_atomic. - * - * Calling copy_to_user() in atomic context is not desirable. Hence first - * copying the data to a pre-allocated kernel page and then copying to user - * space in non-atomic context. + * Copy a page from "oldmem". For this page, there might be no pte mapped + * in the current kernel. */ -ssize_t copy_oldmem_page(unsigned long pfn, char *buf, - size_t csize, unsigned long offset, int userbuf) +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, size_t csize, + unsigned long offset, int userbuf) { void *vaddr; @@ -59,38 +53,16 @@ ssize_t copy_oldmem_page(unsigned long pfn, char *buf, if (!is_crashed_pfn_valid(pfn)) return -EFAULT; - vaddr = kmap_atomic_pfn(pfn); + vaddr = kmap_local_pfn(pfn); if (!userbuf) { - memcpy(buf, (vaddr + offset), csize); - kunmap_atomic(vaddr); + memcpy(buf, vaddr + offset, csize); } else { - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Kdump buffer page not" - " allocated\n"); - kunmap_atomic(vaddr); - return -EFAULT; - } - copy_page(kdump_buf_page, vaddr); - kunmap_atomic(vaddr); - if (copy_to_user(buf, (kdump_buf_page + offset), csize)) - return -EFAULT; + if (copy_to_user(buf, vaddr + offset, csize)) + csize = -EFAULT; } - return csize; -} + kunmap_local(vaddr); -static int __init kdump_buf_page_init(void) -{ - int ret = 0; - - kdump_buf_page = kmalloc(PAGE_SIZE, GFP_KERNEL); - if (!kdump_buf_page) { - printk(KERN_WARNING "Kdump: Failed to allocate kdump buffer" - " page\n"); - ret = -ENOMEM; - } - - return ret; + return csize; } -arch_initcall(kdump_buf_page_init); diff --git a/arch/x86/kernel/fpu/core.c b/arch/x86/kernel/fpu/core.c index 571220ac8..d315d45b6 100644 --- a/arch/x86/kernel/fpu/core.c +++ b/arch/x86/kernel/fpu/core.c @@ -159,6 +159,18 @@ void kernel_fpu_end(void) } EXPORT_SYMBOL_GPL(kernel_fpu_end); +void kernel_fpu_resched(void) +{ + WARN_ON_FPU(!this_cpu_read(in_kernel_fpu)); + + if (should_resched(PREEMPT_OFFSET)) { + kernel_fpu_end(); + cond_resched(); + kernel_fpu_begin(); + } +} +EXPORT_SYMBOL_GPL(kernel_fpu_resched); + /* * Save the FPU state (mark it for reload if necessary): * diff --git a/arch/x86/kernel/irq_32.c b/arch/x86/kernel/irq_32.c index 0b79efc87..93c6b88b3 100644 --- a/arch/x86/kernel/irq_32.c +++ b/arch/x86/kernel/irq_32.c @@ -131,6 +131,7 @@ int irq_init_percpu_irqstack(unsigned int cpu) return 0; } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { struct irq_stack *irqstk; @@ -147,6 +148,7 @@ void do_softirq_own_stack(void) call_on_stack(__do_softirq, isp); } +#endif void __handle_irq(struct irq_desc *desc, struct pt_regs *regs) { diff --git a/arch/x86/kernel/irq_64.c b/arch/x86/kernel/irq_64.c index 440eed558..7cfc4e6b7 100644 --- a/arch/x86/kernel/irq_64.c +++ b/arch/x86/kernel/irq_64.c @@ -72,7 +72,9 @@ int irq_init_percpu_irqstack(unsigned int cpu) return map_irq_stack(cpu); } +#ifndef CONFIG_PREEMPT_RT void do_softirq_own_stack(void) { run_on_irqstack_cond(__do_softirq, NULL); } +#endif diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index db64955c6..82d4534e9 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -8035,6 +8035,14 @@ int kvm_arch_init(void *opaque) goto out; } +#ifdef CONFIG_PREEMPT_RT + if (!boot_cpu_has(X86_FEATURE_CONSTANT_TSC)) { + pr_err("RT requires X86_FEATURE_CONSTANT_TSC\n"); + r = -EOPNOTSUPP; + goto out; + } +#endif + r = -ENOMEM; x86_fpu_cache = kmem_cache_create("x86_fpu", sizeof(struct fpu), __alignof__(struct fpu), SLAB_ACCOUNT, diff --git a/arch/x86/mm/highmem_32.c b/arch/x86/mm/highmem_32.c index 075fe5131..2c54b76d8 100644 --- a/arch/x86/mm/highmem_32.c +++ b/arch/x86/mm/highmem_32.c @@ -4,65 +4,6 @@ #include <linux/swap.h> /* for totalram_pages */ #include <linux/memblock.h> -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR*smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - BUG_ON(!pte_none(*(kmap_pte-idx))); - set_pte(kmap_pte-idx, mk_pte(page, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} -EXPORT_SYMBOL(kmap_atomic_high_prot); - -/* - * This is the same as kmap_atomic() but can map memory that doesn't - * have a struct page associated with it. - */ -void *kmap_atomic_pfn(unsigned long pfn) -{ - return kmap_atomic_prot_pfn(pfn, kmap_prot); -} -EXPORT_SYMBOL_GPL(kmap_atomic_pfn); - -void kunmap_atomic_high(void *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - arch_flush_lazy_mmu_mode(); - } -#ifdef CONFIG_DEBUG_HIGHMEM - else { - BUG_ON(vaddr < PAGE_OFFSET); - BUG_ON(vaddr >= (unsigned long)high_memory); - } -#endif -} -EXPORT_SYMBOL(kunmap_atomic_high); - void __init set_highmem_pages_init(void) { struct zone *zone; diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c index 7c055259d..da31c2635 100644 --- a/arch/x86/mm/init_32.c +++ b/arch/x86/mm/init_32.c @@ -394,19 +394,6 @@ kernel_physical_mapping_init(unsigned long start, return last_map_addr; } -pte_t *kmap_pte; - -static void __init kmap_init(void) -{ - unsigned long kmap_vstart; - - /* - * Cache the first kmap pte: - */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); -} - #ifdef CONFIG_HIGHMEM static void __init permanent_kmaps_init(pgd_t *pgd_base) { @@ -712,8 +699,6 @@ void __init paging_init(void) __flush_tlb_all(); - kmap_init(); - /* * NOTE: at this point the bootmem allocator is fully available. */ diff --git a/arch/x86/mm/iomap_32.c b/arch/x86/mm/iomap_32.c index f60398aeb..9aaa756dd 100644 --- a/arch/x86/mm/iomap_32.c +++ b/arch/x86/mm/iomap_32.c @@ -44,28 +44,7 @@ void iomap_free(resource_size_t base, unsigned long size) } EXPORT_SYMBOL_GPL(iomap_free); -void *kmap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) -{ - unsigned long vaddr; - int idx, type; - - preempt_disable(); - pagefault_disable(); - - type = kmap_atomic_idx_push(); - idx = type + KM_TYPE_NR * smp_processor_id(); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); - set_pte(kmap_pte - idx, pfn_pte(pfn, prot)); - arch_flush_lazy_mmu_mode(); - - return (void *)vaddr; -} - -/* - * Map 'pfn' using protections 'prot' - */ -void __iomem * -iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) +void __iomem *__iomap_local_pfn_prot(unsigned long pfn, pgprot_t prot) { /* * For non-PAT systems, translate non-WB request to UC- just in @@ -81,36 +60,6 @@ iomap_atomic_prot_pfn(unsigned long pfn, pgprot_t prot) /* Filter out unsupported __PAGE_KERNEL* bits: */ pgprot_val(prot) &= __default_kernel_pte_mask; - return (void __force __iomem *) kmap_atomic_prot_pfn(pfn, prot); -} -EXPORT_SYMBOL_GPL(iomap_atomic_prot_pfn); - -void -iounmap_atomic(void __iomem *kvaddr) -{ - unsigned long vaddr = (unsigned long) kvaddr & PAGE_MASK; - - if (vaddr >= __fix_to_virt(FIX_KMAP_END) && - vaddr <= __fix_to_virt(FIX_KMAP_BEGIN)) { - int idx, type; - - type = kmap_atomic_idx(); - idx = type + KM_TYPE_NR * smp_processor_id(); - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(vaddr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); -#endif - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - kpte_clear_flush(kmap_pte-idx, vaddr); - kmap_atomic_idx_pop(); - } - - pagefault_enable(); - preempt_enable(); + return (void __force __iomem *)__kmap_local_pfn_prot(pfn, prot); } -EXPORT_SYMBOL_GPL(iounmap_atomic); +EXPORT_SYMBOL_GPL(__iomap_local_pfn_prot); diff --git a/arch/xtensa/Kconfig b/arch/xtensa/Kconfig index 87e08ad38..03cbf6b53 100644 --- a/arch/xtensa/Kconfig +++ b/arch/xtensa/Kconfig @@ -666,6 +666,7 @@ endchoice config HIGHMEM bool "High Memory Support" depends on MMU + select KMAP_LOCAL help Linux can use the full amount of RAM in the system by default. However, the default MMUv2 setup only maps the diff --git a/arch/xtensa/include/asm/fixmap.h b/arch/xtensa/include/asm/fixmap.h index a06ffb0c6..92049b61c 100644 --- a/arch/xtensa/include/asm/fixmap.h +++ b/arch/xtensa/include/asm/fixmap.h @@ -16,7 +16,7 @@ #ifdef CONFIG_HIGHMEM #include <linux/threads.h> #include <linux/pgtable.h> -#include <asm/kmap_types.h> +#include <asm/kmap_size.h> #endif /* @@ -39,7 +39,7 @@ enum fixed_addresses { /* reserved pte's for temporary kernel mappings */ FIX_KMAP_BEGIN, FIX_KMAP_END = FIX_KMAP_BEGIN + - (KM_TYPE_NR * NR_CPUS * DCACHE_N_COLORS) - 1, + (KM_MAX_IDX * NR_CPUS * DCACHE_N_COLORS) - 1, #endif __end_of_fixed_addresses }; diff --git a/arch/xtensa/include/asm/highmem.h b/arch/xtensa/include/asm/highmem.h index eac503215..0fc3b1ceb 100644 --- a/arch/xtensa/include/asm/highmem.h +++ b/arch/xtensa/include/asm/highmem.h @@ -16,9 +16,8 @@ #include <linux/pgtable.h> #include <asm/cacheflush.h> #include <asm/fixmap.h> -#include <asm/kmap_types.h> -#define PKMAP_BASE ((FIXADDR_START - \ +#define PKMAP_BASE ((FIXADDR_START - \ (LAST_PKMAP + 1) * PAGE_SIZE) & PMD_MASK) #define LAST_PKMAP (PTRS_PER_PTE * DCACHE_N_COLORS) #define LAST_PKMAP_MASK (LAST_PKMAP - 1) @@ -68,6 +67,15 @@ static inline void flush_cache_kmaps(void) flush_cache_all(); } +enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn); +#define arch_kmap_local_map_idx kmap_local_map_idx + +enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr); +#define arch_kmap_local_unmap_idx kmap_local_unmap_idx + +#define arch_kmap_local_post_unmap(vaddr) \ + local_flush_tlb_kernel_range(vaddr, vaddr + PAGE_SIZE) + void kmap_init(void); #endif diff --git a/arch/xtensa/include/asm/spinlock_types.h b/arch/xtensa/include/asm/spinlock_types.h index 64c938925..dc846323b 100644 --- a/arch/xtensa/include/asm/spinlock_types.h +++ b/arch/xtensa/include/asm/spinlock_types.h @@ -2,10 +2,6 @@ #ifndef __ASM_SPINLOCK_TYPES_H #define __ASM_SPINLOCK_TYPES_H -#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__ASM_SPINLOCK_H) -# error "please don't include this file directly" -#endif - #include <asm-generic/qspinlock_types.h> #include <asm-generic/qrwlock_types.h> diff --git a/arch/xtensa/mm/highmem.c b/arch/xtensa/mm/highmem.c index 673196fe8..0735ca5e8 100644 --- a/arch/xtensa/mm/highmem.c +++ b/arch/xtensa/mm/highmem.c @@ -12,8 +12,6 @@ #include <linux/highmem.h> #include <asm/tlbflush.h> -static pte_t *kmap_pte; - #if DCACHE_WAY_SIZE > PAGE_SIZE unsigned int last_pkmap_nr_arr[DCACHE_N_COLORS]; wait_queue_head_t pkmap_map_wait_arr[DCACHE_N_COLORS]; @@ -33,59 +31,25 @@ static inline void kmap_waitqueues_init(void) static inline enum fixed_addresses kmap_idx(int type, unsigned long color) { - return (type + KM_TYPE_NR * smp_processor_id()) * DCACHE_N_COLORS + + return (type + KM_MAX_IDX * smp_processor_id()) * DCACHE_N_COLORS + color; } -void *kmap_atomic_high_prot(struct page *page, pgprot_t prot) +enum fixed_addresses kmap_local_map_idx(int type, unsigned long pfn) { - enum fixed_addresses idx; - unsigned long vaddr; - - idx = kmap_idx(kmap_atomic_idx_push(), - DCACHE_ALIAS(page_to_phys(page))); - vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); -#ifdef CONFIG_DEBUG_HIGHMEM - BUG_ON(!pte_none(*(kmap_pte + idx))); -#endif - set_pte(kmap_pte + idx, mk_pte(page, prot)); - - return (void *)vaddr; + return kmap_idx(type, DCACHE_ALIAS(pfn << PAGE_SHIFT)); } -EXPORT_SYMBOL(kmap_atomic_high_prot); -void kunmap_atomic_high(void *kvaddr) +enum fixed_addresses kmap_local_unmap_idx(int type, unsigned long addr) { - if (kvaddr >= (void *)FIXADDR_START && - kvaddr < (void *)FIXADDR_TOP) { - int idx = kmap_idx(kmap_atomic_idx(), - DCACHE_ALIAS((unsigned long)kvaddr)); - - /* - * Force other mappings to Oops if they'll try to access this - * pte without first remap it. Keeping stale mappings around - * is a bad idea also, in case the page changes cacheability - * attributes or becomes a protected page in a hypervisor. - */ - pte_clear(&init_mm, kvaddr, kmap_pte + idx); - local_flush_tlb_kernel_range((unsigned long)kvaddr, - (unsigned long)kvaddr + PAGE_SIZE); - - kmap_atomic_idx_pop(); - } + return kmap_idx(type, DCACHE_ALIAS(addr)); } -EXPORT_SYMBOL(kunmap_atomic_high); void __init kmap_init(void) { - unsigned long kmap_vstart; - /* Check if this memory layout is broken because PKMAP overlaps * page table. */ BUILD_BUG_ON(PKMAP_BASE < TLBTEMP_BASE_1 + TLBTEMP_SIZE); - /* cache the first kmap pte */ - kmap_vstart = __fix_to_virt(FIX_KMAP_BEGIN); - kmap_pte = virt_to_kpte(kmap_vstart); kmap_waitqueues_init(); } diff --git a/block/blk-mq.c b/block/blk-mq.c index c3beaca1f..08055a866 100644 --- a/block/blk-mq.c +++ b/block/blk-mq.c @@ -44,7 +44,7 @@ bool mq_unfair_dtag = true; module_param_named(unfair_dtag, mq_unfair_dtag, bool, 0444); -static DEFINE_PER_CPU(struct list_head, blk_cpu_done); +static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); static void blk_mq_poll_stats_start(struct request_queue *q); static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); @@ -590,80 +590,29 @@ void blk_mq_end_request(struct request *rq, blk_status_t error) } EXPORT_SYMBOL(blk_mq_end_request); -/* - * Softirq action handler - move entries to local list and loop over them - * while passing them to the queue registered handler. - */ -static __latent_entropy void blk_done_softirq(struct softirq_action *h) +static void blk_complete_reqs(struct llist_head *list) { - struct list_head *cpu_list, local_list; - - local_irq_disable(); - cpu_list = this_cpu_ptr(&blk_cpu_done); - list_replace_init(cpu_list, &local_list); - local_irq_enable(); - - while (!list_empty(&local_list)) { - struct request *rq; + struct llist_node *entry = llist_reverse_order(llist_del_all(list)); + struct request *rq, *next; - rq = list_entry(local_list.next, struct request, ipi_list); - list_del_init(&rq->ipi_list); + llist_for_each_entry_safe(rq, next, entry, ipi_list) rq->q->mq_ops->complete(rq); - } } -static void blk_mq_trigger_softirq(struct request *rq) +static __latent_entropy void blk_done_softirq(struct softirq_action *h) { - struct list_head *list; - unsigned long flags; - - local_irq_save(flags); - list = this_cpu_ptr(&blk_cpu_done); - list_add_tail(&rq->ipi_list, list); - - /* - * If the list only contains our just added request, signal a raise of - * the softirq. If there are already entries there, someone already - * raised the irq but it hasn't run yet. - */ - if (list->next == &rq->ipi_list) - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_restore(flags); + blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); } static int blk_softirq_cpu_dead(unsigned int cpu) { - /* - * If a CPU goes away, splice its entries to the current CPU - * and trigger a run of the softirq - */ - local_irq_disable(); - list_splice_init(&per_cpu(blk_cpu_done, cpu), - this_cpu_ptr(&blk_cpu_done)); - raise_softirq_irqoff(BLOCK_SOFTIRQ); - local_irq_enable(); - + blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); return 0; } - static void __blk_mq_complete_request_remote(void *data) { - struct request *rq = data; - - /* - * For most of single queue controllers, there is only one irq vector - * for handling I/O completion, and the only irq's affinity is set - * to all possible CPUs. On most of ARCHs, this affinity means the irq - * is handled on one specific CPU. - * - * So complete I/O requests in softirq context in case of single queue - * devices to avoid degrading I/O performance due to irqsoff latency. - */ - if (rq->q->nr_hw_queues == 1) - blk_mq_trigger_softirq(rq); - else - rq->q->mq_ops->complete(rq); + __raise_softirq_irqoff(BLOCK_SOFTIRQ); } static inline bool blk_mq_complete_need_ipi(struct request *rq) @@ -673,6 +622,14 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) if (!IS_ENABLED(CONFIG_SMP) || !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) return false; + /* + * With force threaded interrupts enabled, raising softirq from an SMP + * function call will always result in waking the ksoftirqd thread. + * This is probably worse than completing the request on a different + * cache domain. + */ + if (force_irqthreads) + return false; /* same CPU or cache domain? Complete locally */ if (cpu == rq->mq_ctx->cpu || @@ -684,6 +641,31 @@ static inline bool blk_mq_complete_need_ipi(struct request *rq) return cpu_online(rq->mq_ctx->cpu); } +static void blk_mq_complete_send_ipi(struct request *rq) +{ + struct llist_head *list; + unsigned int cpu; + + cpu = rq->mq_ctx->cpu; + list = &per_cpu(blk_cpu_done, cpu); + if (llist_add(&rq->ipi_list, list)) { + rq->csd.func = __blk_mq_complete_request_remote; + rq->csd.info = rq; + smp_call_function_single_async(cpu, &rq->csd); + } +} + +static void blk_mq_raise_softirq(struct request *rq) +{ + struct llist_head *list; + + preempt_disable(); + list = this_cpu_ptr(&blk_cpu_done); + if (llist_add(&rq->ipi_list, list)) + raise_softirq(BLOCK_SOFTIRQ); + preempt_enable(); +} + bool blk_mq_complete_request_remote(struct request *rq) { WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); @@ -696,15 +678,15 @@ bool blk_mq_complete_request_remote(struct request *rq) return false; if (blk_mq_complete_need_ipi(rq)) { - INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); - smp_call_function_single_async(rq->mq_ctx->cpu, &rq->csd); - } else { - if (rq->q->nr_hw_queues > 1) - return false; - blk_mq_trigger_softirq(rq); + blk_mq_complete_send_ipi(rq); + return true; } - return true; + if (rq->q->nr_hw_queues == 1) { + blk_mq_raise_softirq(rq); + return true; + } + return false; } EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); @@ -1617,14 +1599,14 @@ static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, return; if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { - int cpu = get_cpu(); + int cpu = get_cpu_light(); if (cpumask_test_cpu(cpu, hctx->cpumask)) { __blk_mq_run_hw_queue(hctx); - put_cpu(); + put_cpu_light(); return; } - put_cpu(); + put_cpu_light(); } kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, @@ -4084,7 +4066,7 @@ static int __init blk_mq_init(void) int i; for_each_possible_cpu(i) - INIT_LIST_HEAD(&per_cpu(blk_cpu_done, i)); + init_llist_head(&per_cpu(blk_cpu_done, i)); open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, diff --git a/crypto/cryptd.c b/crypto/cryptd.c index a1bea0f4b..5f8ca8c1f 100644 --- a/crypto/cryptd.c +++ b/crypto/cryptd.c @@ -36,6 +36,7 @@ static struct workqueue_struct *cryptd_wq; struct cryptd_cpu_queue { struct crypto_queue queue; struct work_struct work; + spinlock_t qlock; }; struct cryptd_queue { @@ -105,6 +106,7 @@ static int cryptd_init_queue(struct cryptd_queue *queue, cpu_queue = per_cpu_ptr(queue->cpu_queue, cpu); crypto_init_queue(&cpu_queue->queue, max_cpu_qlen); INIT_WORK(&cpu_queue->work, cryptd_queue_worker); + spin_lock_init(&cpu_queue->qlock); } pr_info("cryptd: max_cpu_qlen set to %d\n", max_cpu_qlen); return 0; @@ -129,8 +131,10 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue, struct cryptd_cpu_queue *cpu_queue; refcount_t *refcnt; - cpu = get_cpu(); - cpu_queue = this_cpu_ptr(queue->cpu_queue); + cpu_queue = raw_cpu_ptr(queue->cpu_queue); + spin_lock_bh(&cpu_queue->qlock); + cpu = smp_processor_id(); + err = crypto_enqueue_request(&cpu_queue->queue, request); refcnt = crypto_tfm_ctx(request->tfm); @@ -146,7 +150,7 @@ static int cryptd_enqueue_request(struct cryptd_queue *queue, refcount_inc(refcnt); out_put_cpu: - put_cpu(); + spin_unlock_bh(&cpu_queue->qlock); return err; } @@ -162,16 +166,11 @@ static void cryptd_queue_worker(struct work_struct *work) cpu_queue = container_of(work, struct cryptd_cpu_queue, work); /* * Only handle one request at a time to avoid hogging crypto workqueue. - * preempt_disable/enable is used to prevent being preempted by - * cryptd_enqueue_request(). local_bh_disable/enable is used to prevent - * cryptd_enqueue_request() being accessed from software interrupts. */ - local_bh_disable(); - preempt_disable(); + spin_lock_bh(&cpu_queue->qlock); backlog = crypto_get_backlog(&cpu_queue->queue); req = crypto_dequeue_request(&cpu_queue->queue); - preempt_enable(); - local_bh_enable(); + spin_unlock_bh(&cpu_queue->qlock); if (!req) return; diff --git a/drivers/atm/eni.c b/drivers/atm/eni.c index b574cce98..422753d52 100644 --- a/drivers/atm/eni.c +++ b/drivers/atm/eni.c @@ -2054,7 +2054,7 @@ static int eni_send(struct atm_vcc *vcc,struct sk_buff *skb) } submitted++; ATM_SKB(skb)->vcc = vcc; - tasklet_disable(&ENI_DEV(vcc->dev)->task); + tasklet_disable_in_atomic(&ENI_DEV(vcc->dev)->task); res = do_tx(skb); tasklet_enable(&ENI_DEV(vcc->dev)->task); if (res == enq_ok) return 0; diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0636df6b6..1a7523cef 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -59,6 +59,40 @@ static void zram_free_page(struct zram *zram, size_t index); static int zram_bvec_read(struct zram *zram, struct bio_vec *bvec, u32 index, int offset, struct bio *bio); +#ifdef CONFIG_PREEMPT_RT +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) +{ + size_t index; + + for (index = 0; index < num_pages; index++) + spin_lock_init(&zram->table[index].lock); +} + +static int zram_slot_trylock(struct zram *zram, u32 index) +{ + int ret; + + ret = spin_trylock(&zram->table[index].lock); + if (ret) + __set_bit(ZRAM_LOCK, &zram->table[index].flags); + return ret; +} + +static void zram_slot_lock(struct zram *zram, u32 index) +{ + spin_lock(&zram->table[index].lock); + __set_bit(ZRAM_LOCK, &zram->table[index].flags); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + __clear_bit(ZRAM_LOCK, &zram->table[index].flags); + spin_unlock(&zram->table[index].lock); +} + +#else + +static void zram_meta_init_table_locks(struct zram *zram, size_t num_pages) { } static int zram_slot_trylock(struct zram *zram, u32 index) { @@ -74,6 +108,7 @@ static void zram_slot_unlock(struct zram *zram, u32 index) { bit_spin_unlock(ZRAM_LOCK, &zram->table[index].flags); } +#endif static inline bool init_done(struct zram *zram) { @@ -1165,6 +1200,7 @@ static bool zram_meta_alloc(struct zram *zram, u64 disksize) if (!huge_class_size) huge_class_size = zs_huge_class_size(zram->mem_pool); + zram_meta_init_table_locks(zram, num_pages); return true; } diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index f2fd46daa..7e4dd447e 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -63,6 +63,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long flags; + spinlock_t lock; #ifdef CONFIG_ZRAM_MEMORY_TRACKING ktime_t ac_time; #endif diff --git a/drivers/char/random.c b/drivers/char/random.c index 4d659c4fc..175c3ac88 100644 --- a/drivers/char/random.c +++ b/drivers/char/random.c @@ -1252,28 +1252,27 @@ static __u32 get_reg(struct fast_pool *f, struct pt_regs *regs) return *ptr; } -void add_interrupt_randomness(int irq, int irq_flags) +void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) { struct entropy_store *r; struct fast_pool *fast_pool = this_cpu_ptr(&irq_randomness); - struct pt_regs *regs = get_irq_regs(); unsigned long now = jiffies; cycles_t cycles = random_get_entropy(); __u32 c_high, j_high; - __u64 ip; unsigned long seed; int credit = 0; if (cycles == 0) - cycles = get_reg(fast_pool, regs); + cycles = get_reg(fast_pool, NULL); c_high = (sizeof(cycles) > 4) ? cycles >> 32 : 0; j_high = (sizeof(now) > 4) ? now >> 32 : 0; fast_pool->pool[0] ^= cycles ^ j_high ^ irq; fast_pool->pool[1] ^= now ^ c_high; - ip = regs ? instruction_pointer(regs) : _RET_IP_; + if (!ip) + ip = _RET_IP_; fast_pool->pool[2] ^= ip; fast_pool->pool[3] ^= (sizeof(ip) > 4) ? ip >> 32 : - get_reg(fast_pool, regs); + get_reg(fast_pool, NULL); fast_mix(fast_pool); add_interrupt_bench(cycles); diff --git a/drivers/char/tpm/tpm-dev-common.c b/drivers/char/tpm/tpm-dev-common.c index 1784530b8..c08cbb306 100644 --- a/drivers/char/tpm/tpm-dev-common.c +++ b/drivers/char/tpm/tpm-dev-common.c @@ -20,7 +20,6 @@ #include "tpm-dev.h" static struct workqueue_struct *tpm_dev_wq; -static DEFINE_MUTEX(tpm_dev_wq_lock); static ssize_t tpm_dev_transmit(struct tpm_chip *chip, struct tpm_space *space, u8 *buf, size_t bufsiz) diff --git a/drivers/char/tpm/tpm_tis.c b/drivers/char/tpm/tpm_tis.c index 4ed6e6602..c2bd0d40b 100644 --- a/drivers/char/tpm/tpm_tis.c +++ b/drivers/char/tpm/tpm_tis.c @@ -50,6 +50,31 @@ static inline struct tpm_tis_tcg_phy *to_tpm_tis_tcg_phy(struct tpm_tis_data *da return container_of(data, struct tpm_tis_tcg_phy, priv); } +#ifdef CONFIG_PREEMPT_RT +/* + * Flushes previous write operations to chip so that a subsequent + * ioread*()s won't stall a cpu. + */ +static inline void tpm_tis_flush(void __iomem *iobase) +{ + ioread8(iobase + TPM_ACCESS(0)); +} +#else +#define tpm_tis_flush(iobase) do { } while (0) +#endif + +static inline void tpm_tis_iowrite8(u8 b, void __iomem *iobase, u32 addr) +{ + iowrite8(b, iobase + addr); + tpm_tis_flush(iobase); +} + +static inline void tpm_tis_iowrite32(u32 b, void __iomem *iobase, u32 addr) +{ + iowrite32(b, iobase + addr); + tpm_tis_flush(iobase); +} + static int interrupts = -1; module_param(interrupts, int, 0444); MODULE_PARM_DESC(interrupts, "Enable interrupts"); @@ -169,7 +194,7 @@ static int tpm_tcg_write_bytes(struct tpm_tis_data *data, u32 addr, u16 len, struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); while (len--) - iowrite8(*value++, phy->iobase + addr); + tpm_tis_iowrite8(*value++, phy->iobase, addr); return 0; } @@ -196,7 +221,7 @@ static int tpm_tcg_write32(struct tpm_tis_data *data, u32 addr, u32 value) { struct tpm_tis_tcg_phy *phy = to_tpm_tis_tcg_phy(data); - iowrite32(value, phy->iobase + addr); + tpm_tis_iowrite32(value, phy->iobase, addr); return 0; } diff --git a/drivers/firewire/ohci.c b/drivers/firewire/ohci.c index 9811c4095..17c9d8251 100644 --- a/drivers/firewire/ohci.c +++ b/drivers/firewire/ohci.c @@ -2545,7 +2545,7 @@ static int ohci_cancel_packet(struct fw_card *card, struct fw_packet *packet) struct driver_data *driver_data = packet->driver_data; int ret = -ENOENT; - tasklet_disable(&ctx->tasklet); + tasklet_disable_in_atomic(&ctx->tasklet); if (packet->ack != 0) goto out; @@ -3465,7 +3465,7 @@ static int ohci_flush_iso_completions(struct fw_iso_context *base) struct iso_context *ctx = container_of(base, struct iso_context, base); int ret = 0; - tasklet_disable(&ctx->context.tasklet); + tasklet_disable_in_atomic(&ctx->context.tasklet); if (!test_and_set_bit_lock(0, &ctx->flushing_completions)) { context_tasklet((unsigned long)&ctx->context); diff --git a/drivers/firmware/efi/efi.c b/drivers/firmware/efi/efi.c index 098d87961..2a8aa5212 100644 --- a/drivers/firmware/efi/efi.c +++ b/drivers/firmware/efi/efi.c @@ -66,7 +66,7 @@ struct mm_struct efi_mm = { struct workqueue_struct *efi_rts_wq; -static bool disable_runtime; +static bool disable_runtime = IS_ENABLED(CONFIG_PREEMPT_RT); static int __init setup_noefi(char *arg) { disable_runtime = true; @@ -97,6 +97,9 @@ static int __init parse_efi_cmdline(char *str) if (parse_option_str(str, "noruntime")) disable_runtime = true; + if (parse_option_str(str, "runtime")) + disable_runtime = false; + if (parse_option_str(str, "nosoftreserve")) set_bit(EFI_MEM_NO_SOFT_RESERVE, &efi.flags); diff --git a/drivers/gpu/drm/i915/display/intel_sprite.c b/drivers/gpu/drm/i915/display/intel_sprite.c index 12f7128b7..a65061e3e 100644 --- a/drivers/gpu/drm/i915/display/intel_sprite.c +++ b/drivers/gpu/drm/i915/display/intel_sprite.c @@ -118,7 +118,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) "PSR idle timed out 0x%x, atomic update may fail\n", psr_status); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); crtc->debug.min_vbl = min; crtc->debug.max_vbl = max; @@ -143,11 +144,13 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) break; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); timeout = schedule_timeout(timeout); - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } finish_wait(wq, &wait); @@ -180,7 +183,8 @@ void intel_pipe_update_start(const struct intel_crtc_state *new_crtc_state) return; irq_disable: - local_irq_disable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_disable(); } /** @@ -218,7 +222,8 @@ void intel_pipe_update_end(struct intel_crtc_state *new_crtc_state) new_crtc_state->uapi.event = NULL; } - local_irq_enable(); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + local_irq_enable(); if (intel_vgpu_active(dev_priv)) return; diff --git a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c index 0c083af5a..2abf043d3 100644 --- a/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c +++ b/drivers/gpu/drm/i915/gem/i915_gem_execbuffer.c @@ -1080,7 +1080,7 @@ static void reloc_cache_reset(struct reloc_cache *cache, struct i915_execbuffer struct i915_ggtt *ggtt = cache_to_ggtt(cache); intel_gt_flush_ggtt_writes(ggtt->vm.gt); - io_mapping_unmap_atomic((void __iomem *)vaddr); + io_mapping_unmap_local((void __iomem *)vaddr); if (drm_mm_node_allocated(&cache->node)) { ggtt->vm.clear_range(&ggtt->vm, @@ -1146,7 +1146,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, if (cache->vaddr) { intel_gt_flush_ggtt_writes(ggtt->vm.gt); - io_mapping_unmap_atomic((void __force __iomem *) unmask_page(cache->vaddr)); + io_mapping_unmap_local((void __force __iomem *) unmask_page(cache->vaddr)); } else { struct i915_vma *vma; int err; @@ -1194,8 +1194,7 @@ static void *reloc_iomap(struct drm_i915_gem_object *obj, offset += page << PAGE_SHIFT; } - vaddr = (void __force *)io_mapping_map_atomic_wc(&ggtt->iomap, - offset); + vaddr = (void __force *)io_mapping_map_local_wc(&ggtt->iomap, offset); cache->page = page; cache->vaddr = (unsigned long)vaddr; diff --git a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c index 0040b4765..3f4f85478 100644 --- a/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c +++ b/drivers/gpu/drm/i915/gt/intel_breadcrumbs.c @@ -342,10 +342,9 @@ void intel_breadcrumbs_park(struct intel_breadcrumbs *b) /* Kick the work once more to drain the signalers */ irq_work_sync(&b->irq_work); while (unlikely(READ_ONCE(b->irq_armed))) { - local_irq_disable(); - signal_irq_work(&b->irq_work); - local_irq_enable(); + irq_work_queue(&b->irq_work); cond_resched(); + irq_work_sync(&b->irq_work); } GEM_BUG_ON(!list_empty(&b->signalers)); } diff --git a/drivers/gpu/drm/i915/gt/intel_engine_pm.c b/drivers/gpu/drm/i915/gt/intel_engine_pm.c index f7b2e07e2..313d8a28e 100644 --- a/drivers/gpu/drm/i915/gt/intel_engine_pm.c +++ b/drivers/gpu/drm/i915/gt/intel_engine_pm.c @@ -60,9 +60,10 @@ static int __engine_unpark(struct intel_wakeref *wf) static inline unsigned long __timeline_mark_lock(struct intel_context *ce) { - unsigned long flags; + unsigned long flags = 0; - local_irq_save(flags); + if (!force_irqthreads) + local_irq_save(flags); mutex_acquire(&ce->timeline->mutex.dep_map, 2, 0, _THIS_IP_); return flags; @@ -72,7 +73,8 @@ static inline void __timeline_mark_unlock(struct intel_context *ce, unsigned long flags) { mutex_release(&ce->timeline->mutex.dep_map, _THIS_IP_); - local_irq_restore(flags); + if (!force_irqthreads) + local_irq_restore(flags); } #else diff --git a/drivers/gpu/drm/i915/i915_gem.c b/drivers/gpu/drm/i915/i915_gem.c index 58276694c..88944c3b1 100644 --- a/drivers/gpu/drm/i915/i915_gem.c +++ b/drivers/gpu/drm/i915/i915_gem.c @@ -355,22 +355,15 @@ gtt_user_read(struct io_mapping *mapping, char __user *user_data, int length) { void __iomem *vaddr; - unsigned long unwritten; + bool fail = false; /* We can use the cpu mem copy function because this is X86. */ - vaddr = io_mapping_map_atomic_wc(mapping, base); - unwritten = __copy_to_user_inatomic(user_data, - (void __force *)vaddr + offset, - length); - io_mapping_unmap_atomic(vaddr); - if (unwritten) { - vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE); - unwritten = copy_to_user(user_data, - (void __force *)vaddr + offset, - length); - io_mapping_unmap(vaddr); - } - return unwritten; + vaddr = io_mapping_map_local_wc(mapping, base); + if (copy_to_user(user_data, (void __force *)vaddr + offset, length)) + fail = true; + io_mapping_unmap_local(vaddr); + + return fail; } static int @@ -539,21 +532,14 @@ ggtt_write(struct io_mapping *mapping, char __user *user_data, int length) { void __iomem *vaddr; - unsigned long unwritten; + bool fail = false; /* We can use the cpu mem copy function because this is X86. */ - vaddr = io_mapping_map_atomic_wc(mapping, base); - unwritten = __copy_from_user_inatomic_nocache((void __force *)vaddr + offset, - user_data, length); - io_mapping_unmap_atomic(vaddr); - if (unwritten) { - vaddr = io_mapping_map_wc(mapping, base, PAGE_SIZE); - unwritten = copy_from_user((void __force *)vaddr + offset, - user_data, length); - io_mapping_unmap(vaddr); - } - - return unwritten; + vaddr = io_mapping_map_local_wc(mapping, base); + if (copy_from_user((void __force *)vaddr + offset, user_data, length)) + fail = true; + io_mapping_unmap_local(vaddr); + return fail; } /** diff --git a/drivers/gpu/drm/i915/i915_irq.c b/drivers/gpu/drm/i915/i915_irq.c index 759f523c6..7339a42ab 100644 --- a/drivers/gpu/drm/i915/i915_irq.c +++ b/drivers/gpu/drm/i915/i915_irq.c @@ -847,6 +847,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, spin_lock_irqsave(&dev_priv->uncore.lock, irqflags); /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + preempt_disable_rt(); /* Get optional system timestamp before query. */ if (stime) @@ -898,6 +899,7 @@ static bool i915_get_crtc_scanoutpos(struct drm_crtc *_crtc, *etime = ktime_get(); /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ + preempt_enable_rt(); spin_unlock_irqrestore(&dev_priv->uncore.lock, irqflags); diff --git a/drivers/gpu/drm/i915/i915_trace.h b/drivers/gpu/drm/i915/i915_trace.h index a4addcc64..396b65986 100644 --- a/drivers/gpu/drm/i915/i915_trace.h +++ b/drivers/gpu/drm/i915/i915_trace.h @@ -2,6 +2,10 @@ #if !defined(_I915_TRACE_H_) || defined(TRACE_HEADER_MULTI_READ) #define _I915_TRACE_H_ +#ifdef CONFIG_PREEMPT_RT +#define NOTRACE +#endif + #include <linux/stringify.h> #include <linux/types.h> #include <linux/tracepoint.h> @@ -778,7 +782,7 @@ DEFINE_EVENT(i915_request, i915_request_add, TP_ARGS(rq) ); -#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) +#if defined(CONFIG_DRM_I915_LOW_LEVEL_TRACEPOINTS) && !defined(NOTRACE) DEFINE_EVENT(i915_request, i915_request_submit, TP_PROTO(struct i915_request *rq), TP_ARGS(rq) diff --git a/drivers/gpu/drm/i915/selftests/i915_gem.c b/drivers/gpu/drm/i915/selftests/i915_gem.c index 412e21604..432493183 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem.c @@ -57,12 +57,12 @@ static void trash_stolen(struct drm_i915_private *i915) ggtt->vm.insert_page(&ggtt->vm, dma, slot, I915_CACHE_NONE, 0); - s = io_mapping_map_atomic_wc(&ggtt->iomap, slot); + s = io_mapping_map_local_wc(&ggtt->iomap, slot); for (x = 0; x < PAGE_SIZE / sizeof(u32); x++) { prng = next_pseudo_random32(prng); iowrite32(prng, &s[x]); } - io_mapping_unmap_atomic(s); + io_mapping_unmap_local(s); } ggtt->vm.clear_range(&ggtt->vm, slot, PAGE_SIZE); diff --git a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c index 65e28c4cd..ca483285f 100644 --- a/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c +++ b/drivers/gpu/drm/i915/selftests/i915_gem_gtt.c @@ -1201,9 +1201,9 @@ static int igt_ggtt_page(void *arg) u64 offset = tmp.start + order[n] * PAGE_SIZE; u32 __iomem *vaddr; - vaddr = io_mapping_map_atomic_wc(&ggtt->iomap, offset); + vaddr = io_mapping_map_local_wc(&ggtt->iomap, offset); iowrite32(n, vaddr + n); - io_mapping_unmap_atomic(vaddr); + io_mapping_unmap_local(vaddr); } intel_gt_flush_ggtt_writes(ggtt->vm.gt); @@ -1213,9 +1213,9 @@ static int igt_ggtt_page(void *arg) u32 __iomem *vaddr; u32 val; - vaddr = io_mapping_map_atomic_wc(&ggtt->iomap, offset); + vaddr = io_mapping_map_local_wc(&ggtt->iomap, offset); val = ioread32(vaddr + n); - io_mapping_unmap_atomic(vaddr); + io_mapping_unmap_local(vaddr); if (val != n) { pr_err("insert page failed: found %d, expected %d\n", diff --git a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h index 6c5bbff12..411f91ee2 100644 --- a/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h +++ b/drivers/gpu/drm/nouveau/nvkm/subdev/devinit/fbmem.h @@ -60,19 +60,19 @@ fbmem_fini(struct io_mapping *fb) static inline u32 fbmem_peek(struct io_mapping *fb, u32 off) { - u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK); + u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK); u32 val = ioread32(p + (off & ~PAGE_MASK)); - io_mapping_unmap_atomic(p); + io_mapping_unmap_local(p); return val; } static inline void fbmem_poke(struct io_mapping *fb, u32 off, u32 val) { - u8 __iomem *p = io_mapping_map_atomic_wc(fb, off & PAGE_MASK); + u8 __iomem *p = io_mapping_map_local_wc(fb, off & PAGE_MASK); iowrite32(val, p + (off & ~PAGE_MASK)); wmb(); - io_mapping_unmap_atomic(p); + io_mapping_unmap_local(p); } static inline bool diff --git a/drivers/gpu/drm/qxl/qxl_image.c b/drivers/gpu/drm/qxl/qxl_image.c index 60ab7151b..93f92ccd4 100644 --- a/drivers/gpu/drm/qxl/qxl_image.c +++ b/drivers/gpu/drm/qxl/qxl_image.c @@ -124,12 +124,12 @@ qxl_image_init_helper(struct qxl_device *qdev, wrong (check the bitmaps are sent correctly first) */ - ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, 0); + ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, 0); chunk = ptr; chunk->data_size = height * chunk_stride; chunk->prev_chunk = 0; chunk->next_chunk = 0; - qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); + qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); { void *k_data, *i_data; @@ -143,7 +143,7 @@ qxl_image_init_helper(struct qxl_device *qdev, i_data = (void *)data; while (remain > 0) { - ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, page << PAGE_SHIFT); + ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, page << PAGE_SHIFT); if (page == 0) { chunk = ptr; @@ -157,7 +157,7 @@ qxl_image_init_helper(struct qxl_device *qdev, memcpy(k_data, i_data, size); - qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); + qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); i_data += size; remain -= size; page++; @@ -175,10 +175,10 @@ qxl_image_init_helper(struct qxl_device *qdev, page_offset = offset_in_page(out_offset); size = min((int)(PAGE_SIZE - page_offset), remain); - ptr = qxl_bo_kmap_atomic_page(qdev, chunk_bo, page_base); + ptr = qxl_bo_kmap_local_page(qdev, chunk_bo, page_base); k_data = ptr + page_offset; memcpy(k_data, i_data, size); - qxl_bo_kunmap_atomic_page(qdev, chunk_bo, ptr); + qxl_bo_kunmap_local_page(qdev, chunk_bo, ptr); remain -= size; i_data += size; out_offset += size; @@ -189,7 +189,7 @@ qxl_image_init_helper(struct qxl_device *qdev, qxl_bo_kunmap(chunk_bo); image_bo = dimage->bo; - ptr = qxl_bo_kmap_atomic_page(qdev, image_bo, 0); + ptr = qxl_bo_kmap_local_page(qdev, image_bo, 0); image = ptr; image->descriptor.id = 0; @@ -212,7 +212,7 @@ qxl_image_init_helper(struct qxl_device *qdev, break; default: DRM_ERROR("unsupported image bit depth\n"); - qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr); + qxl_bo_kunmap_local_page(qdev, image_bo, ptr); return -EINVAL; } image->u.bitmap.flags = QXL_BITMAP_TOP_DOWN; @@ -222,7 +222,7 @@ qxl_image_init_helper(struct qxl_device *qdev, image->u.bitmap.palette = 0; image->u.bitmap.data = qxl_bo_physical_address(qdev, chunk_bo, 0); - qxl_bo_kunmap_atomic_page(qdev, image_bo, ptr); + qxl_bo_kunmap_local_page(qdev, image_bo, ptr); return 0; } diff --git a/drivers/gpu/drm/qxl/qxl_ioctl.c b/drivers/gpu/drm/qxl/qxl_ioctl.c index 5cea6eea7..785023081 100644 --- a/drivers/gpu/drm/qxl/qxl_ioctl.c +++ b/drivers/gpu/drm/qxl/qxl_ioctl.c @@ -89,11 +89,11 @@ apply_reloc(struct qxl_device *qdev, struct qxl_reloc_info *info) { void *reloc_page; - reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); + reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); *(uint64_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = qxl_bo_physical_address(qdev, info->src_bo, info->src_offset); - qxl_bo_kunmap_atomic_page(qdev, info->dst_bo, reloc_page); + qxl_bo_kunmap_local_page(qdev, info->dst_bo, reloc_page); } static void @@ -105,9 +105,9 @@ apply_surf_reloc(struct qxl_device *qdev, struct qxl_reloc_info *info) if (info->src_bo && !info->src_bo->is_primary) id = info->src_bo->surface_id; - reloc_page = qxl_bo_kmap_atomic_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); + reloc_page = qxl_bo_kmap_local_page(qdev, info->dst_bo, info->dst_offset & PAGE_MASK); *(uint32_t *)(reloc_page + (info->dst_offset & ~PAGE_MASK)) = id; - qxl_bo_kunmap_atomic_page(qdev, info->dst_bo, reloc_page); + qxl_bo_kunmap_local_page(qdev, info->dst_bo, reloc_page); } /* return holding the reference to this object */ @@ -149,7 +149,6 @@ static int qxl_process_single_command(struct qxl_device *qdev, struct qxl_bo *cmd_bo; void *fb_cmd; int i, ret, num_relocs; - int unwritten; switch (cmd->type) { case QXL_CMD_DRAW: @@ -185,21 +184,21 @@ static int qxl_process_single_command(struct qxl_device *qdev, goto out_free_reloc; /* TODO copy slow path code from i915 */ - fb_cmd = qxl_bo_kmap_atomic_page(qdev, cmd_bo, (release->release_offset & PAGE_MASK)); - unwritten = __copy_from_user_inatomic_nocache - (fb_cmd + sizeof(union qxl_release_info) + (release->release_offset & ~PAGE_MASK), - u64_to_user_ptr(cmd->command), cmd->command_size); + fb_cmd = qxl_bo_kmap_local_page(qdev, cmd_bo, (release->release_offset & PAGE_MASK)); - { + if (copy_from_user(fb_cmd + sizeof(union qxl_release_info) + + (release->release_offset & ~PAGE_MASK), + u64_to_user_ptr(cmd->command), cmd->command_size)) { + ret = -EFAULT; + } else { struct qxl_drawable *draw = fb_cmd; draw->mm_time = qdev->rom->mm_clock; } - qxl_bo_kunmap_atomic_page(qdev, cmd_bo, fb_cmd); - if (unwritten) { - DRM_ERROR("got unwritten %d\n", unwritten); - ret = -EFAULT; + qxl_bo_kunmap_local_page(qdev, cmd_bo, fb_cmd); + if (ret) { + DRM_ERROR("copy from user failed %d\n", ret); goto out_free_release; } diff --git a/drivers/gpu/drm/qxl/qxl_object.c b/drivers/gpu/drm/qxl/qxl_object.c index 544a9e4df..5ee5171d4 100644 --- a/drivers/gpu/drm/qxl/qxl_object.c +++ b/drivers/gpu/drm/qxl/qxl_object.c @@ -173,8 +173,8 @@ int qxl_bo_kmap(struct qxl_bo *bo, void **ptr) return 0; } -void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, - struct qxl_bo *bo, int page_offset) +void *qxl_bo_kmap_local_page(struct qxl_device *qdev, + struct qxl_bo *bo, int page_offset) { unsigned long offset; void *rptr; @@ -189,7 +189,7 @@ void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, goto fallback; offset = bo->tbo.mem.start << PAGE_SHIFT; - return io_mapping_map_atomic_wc(map, offset + page_offset); + return io_mapping_map_local_wc(map, offset + page_offset); fallback: if (bo->kptr) { rptr = bo->kptr + (page_offset * PAGE_SIZE); @@ -215,14 +215,14 @@ void qxl_bo_kunmap(struct qxl_bo *bo) ttm_bo_kunmap(&bo->kmap); } -void qxl_bo_kunmap_atomic_page(struct qxl_device *qdev, - struct qxl_bo *bo, void *pmap) +void qxl_bo_kunmap_local_page(struct qxl_device *qdev, + struct qxl_bo *bo, void *pmap) { if ((bo->tbo.mem.mem_type != TTM_PL_VRAM) && (bo->tbo.mem.mem_type != TTM_PL_PRIV)) goto fallback; - io_mapping_unmap_atomic(pmap); + io_mapping_unmap_local(pmap); return; fallback: qxl_bo_kunmap(bo); diff --git a/drivers/gpu/drm/qxl/qxl_object.h b/drivers/gpu/drm/qxl/qxl_object.h index 5762ea40d..6ae89b1b3 100644 --- a/drivers/gpu/drm/qxl/qxl_object.h +++ b/drivers/gpu/drm/qxl/qxl_object.h @@ -89,8 +89,8 @@ extern int qxl_bo_create(struct qxl_device *qdev, struct qxl_bo **bo_ptr); extern int qxl_bo_kmap(struct qxl_bo *bo, void **ptr); extern void qxl_bo_kunmap(struct qxl_bo *bo); -void *qxl_bo_kmap_atomic_page(struct qxl_device *qdev, struct qxl_bo *bo, int page_offset); -void qxl_bo_kunmap_atomic_page(struct qxl_device *qdev, struct qxl_bo *bo, void *map); +void *qxl_bo_kmap_local_page(struct qxl_device *qdev, struct qxl_bo *bo, int page_offset); +void qxl_bo_kunmap_local_page(struct qxl_device *qdev, struct qxl_bo *bo, void *map); extern struct qxl_bo *qxl_bo_ref(struct qxl_bo *bo); extern void qxl_bo_unref(struct qxl_bo **bo); extern int qxl_bo_pin(struct qxl_bo *bo); diff --git a/drivers/gpu/drm/qxl/qxl_release.c b/drivers/gpu/drm/qxl/qxl_release.c index b2a475a0c..b665a33b4 100644 --- a/drivers/gpu/drm/qxl/qxl_release.c +++ b/drivers/gpu/drm/qxl/qxl_release.c @@ -414,7 +414,7 @@ union qxl_release_info *qxl_release_map(struct qxl_device *qdev, union qxl_release_info *info; struct qxl_bo *bo = release->release_bo; - ptr = qxl_bo_kmap_atomic_page(qdev, bo, release->release_offset & PAGE_MASK); + ptr = qxl_bo_kmap_local_page(qdev, bo, release->release_offset & PAGE_MASK); if (!ptr) return NULL; info = ptr + (release->release_offset & ~PAGE_MASK); @@ -429,7 +429,7 @@ void qxl_release_unmap(struct qxl_device *qdev, void *ptr; ptr = ((void *)info) - (release->release_offset & ~PAGE_MASK); - qxl_bo_kunmap_atomic_page(qdev, bo, ptr); + qxl_bo_kunmap_local_page(qdev, bo, ptr); } void qxl_release_fence_buffer_objects(struct qxl_release *release) diff --git a/drivers/gpu/drm/radeon/radeon_display.c b/drivers/gpu/drm/radeon/radeon_display.c index 07d23a1e6..add8e6044 100644 --- a/drivers/gpu/drm/radeon/radeon_display.c +++ b/drivers/gpu/drm/radeon/radeon_display.c @@ -1828,6 +1828,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, struct radeon_device *rdev = dev->dev_private; /* preempt_disable_rt() should go right here in PREEMPT_RT patchset. */ + preempt_disable_rt(); /* Get optional system timestamp before query. */ if (stime) @@ -1920,6 +1921,7 @@ int radeon_get_crtc_scanoutpos(struct drm_device *dev, unsigned int pipe, *etime = ktime_get(); /* preempt_enable_rt() should go right here in PREEMPT_RT patchset. */ + preempt_enable_rt(); /* Decode into vertical and horizontal scanout position. */ *vpos = position & 0x1fff; diff --git a/drivers/gpu/drm/ttm/ttm_bo_util.c b/drivers/gpu/drm/ttm/ttm_bo_util.c index fb2a25f84..164b9a015 100644 --- a/drivers/gpu/drm/ttm/ttm_bo_util.c +++ b/drivers/gpu/drm/ttm/ttm_bo_util.c @@ -181,13 +181,15 @@ static int ttm_copy_io_ttm_page(struct ttm_tt *ttm, void *src, return -ENOMEM; src = (void *)((unsigned long)src + (page << PAGE_SHIFT)); - dst = kmap_atomic_prot(d, prot); - if (!dst) - return -ENOMEM; + /* + * Ensure that a highmem page is mapped with the correct + * pgprot. For non highmem the mapping is already there. + */ + dst = kmap_local_page_prot(d, prot); memcpy_fromio(dst, src, PAGE_SIZE); - kunmap_atomic(dst); + kunmap_local(dst); return 0; } @@ -203,13 +205,15 @@ static int ttm_copy_ttm_io_page(struct ttm_tt *ttm, void *dst, return -ENOMEM; dst = (void *)((unsigned long)dst + (page << PAGE_SHIFT)); - src = kmap_atomic_prot(s, prot); - if (!src) - return -ENOMEM; + /* + * Ensure that a highmem page is mapped with the correct + * pgprot. For non highmem the mapping is already there. + */ + src = kmap_local_page_prot(s, prot); memcpy_toio(dst, src, PAGE_SIZE); - kunmap_atomic(src); + kunmap_local(src); return 0; } diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c index e8d66182c..71dba228f 100644 --- a/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c +++ b/drivers/gpu/drm/vmwgfx/vmwgfx_blit.c @@ -375,12 +375,12 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, copy_size = min_t(u32, copy_size, PAGE_SIZE - src_page_offset); if (unmap_src) { - kunmap_atomic(d->src_addr); + kunmap_local(d->src_addr); d->src_addr = NULL; } if (unmap_dst) { - kunmap_atomic(d->dst_addr); + kunmap_local(d->dst_addr); d->dst_addr = NULL; } @@ -388,12 +388,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, if (WARN_ON_ONCE(dst_page >= d->dst_num_pages)) return -EINVAL; - d->dst_addr = - kmap_atomic_prot(d->dst_pages[dst_page], - d->dst_prot); - if (!d->dst_addr) - return -ENOMEM; - + d->dst_addr = kmap_local_page_prot(d->dst_pages[dst_page], + d->dst_prot); d->mapped_dst = dst_page; } @@ -401,12 +397,8 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, if (WARN_ON_ONCE(src_page >= d->src_num_pages)) return -EINVAL; - d->src_addr = - kmap_atomic_prot(d->src_pages[src_page], - d->src_prot); - if (!d->src_addr) - return -ENOMEM; - + d->src_addr = kmap_local_page_prot(d->src_pages[src_page], + d->src_prot); d->mapped_src = src_page; } diff->do_cpy(diff, d->dst_addr + dst_page_offset, @@ -436,8 +428,10 @@ static int vmw_bo_cpu_blit_line(struct vmw_bo_blit_line_data *d, * * Performs a CPU blit from one buffer object to another avoiding a full * bo vmap which may exhaust- or fragment vmalloc space. - * On supported architectures (x86), we're using kmap_atomic which avoids - * cross-processor TLB- and cache flushes and may, on non-HIGHMEM systems + * + * On supported architectures (x86), we're using kmap_local_prot() which + * avoids cross-processor TLB- and cache flushes. kmap_local_prot() will + * either map a highmem page with the proper pgprot on HIGHMEM=y systems or * reference already set-up mappings. * * Neither of the buffer objects may be placed in PCI memory @@ -500,9 +494,9 @@ int vmw_bo_cpu_blit(struct ttm_buffer_object *dst, } out: if (d.src_addr) - kunmap_atomic(d.src_addr); + kunmap_local(d.src_addr); if (d.dst_addr) - kunmap_atomic(d.dst_addr); + kunmap_local(d.dst_addr); return ret; } diff --git a/drivers/hv/hyperv_vmbus.h b/drivers/hv/hyperv_vmbus.h index 7845fa5de..043e058bb 100644 --- a/drivers/hv/hyperv_vmbus.h +++ b/drivers/hv/hyperv_vmbus.h @@ -19,6 +19,7 @@ #include <linux/atomic.h> #include <linux/hyperv.h> #include <linux/interrupt.h> +#include <linux/irq.h> #include "hv_trace.h" diff --git a/drivers/hv/vmbus_drv.c b/drivers/hv/vmbus_drv.c index a5a402e77..d0763ea99 100644 --- a/drivers/hv/vmbus_drv.c +++ b/drivers/hv/vmbus_drv.c @@ -22,6 +22,7 @@ #include <linux/clockchips.h> #include <linux/cpu.h> #include <linux/sched/task_stack.h> +#include <linux/irq.h> #include <linux/delay.h> #include <linux/notifier.h> @@ -1307,6 +1308,8 @@ static void vmbus_isr(void) void *page_addr = hv_cpu->synic_event_page; struct hv_message *msg; union hv_synic_event_flags *event; + struct pt_regs *regs = get_irq_regs(); + u64 ip = regs ? instruction_pointer(regs) : 0; bool handled = false; if (unlikely(page_addr == NULL)) @@ -1351,7 +1354,7 @@ static void vmbus_isr(void) tasklet_schedule(&hv_cpu->msg_dpc); } - add_interrupt_randomness(hv_get_vector(), 0); + add_interrupt_randomness(hv_get_vector(), 0, ip); } /* @@ -1359,7 +1362,8 @@ static void vmbus_isr(void) * buffer and call into Hyper-V to transfer the data. */ static void hv_kmsg_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) { size_t bytes_written; phys_addr_t panic_pa; @@ -1374,7 +1378,7 @@ static void hv_kmsg_dump(struct kmsg_dumper *dumper, * Write dump contents to the page. No need to synchronize; panic should * be single-threaded. */ - kmsg_dump_get_buffer(dumper, false, hv_panic_page, HV_HYP_PAGE_SIZE, + kmsg_dump_get_buffer(iter, false, hv_panic_page, HV_HYP_PAGE_SIZE, &bytes_written); if (bytes_written) hyperv_report_panic_msg(panic_pa, bytes_written); diff --git a/drivers/leds/trigger/Kconfig b/drivers/leds/trigger/Kconfig index d45aba3e1..6e890131d 100644 --- a/drivers/leds/trigger/Kconfig +++ b/drivers/leds/trigger/Kconfig @@ -64,6 +64,7 @@ config LEDS_TRIGGER_BACKLIGHT config LEDS_TRIGGER_CPU bool "LED CPU Trigger" + depends on !PREEMPT_RT help This allows LEDs to be controlled by active CPUs. This shows the active CPUs across an array of LEDs so you can see which diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c index c82953a32..061fea763 100644 --- a/drivers/md/raid5.c +++ b/drivers/md/raid5.c @@ -2217,8 +2217,9 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) struct raid5_percpu *percpu; unsigned long cpu; - cpu = get_cpu(); + cpu = get_cpu_light(); percpu = per_cpu_ptr(conf->percpu, cpu); + spin_lock(&percpu->lock); if (test_bit(STRIPE_OP_BIOFILL, &ops_request)) { ops_run_biofill(sh); overlap_clear++; @@ -2277,7 +2278,8 @@ static void raid_run_ops(struct stripe_head *sh, unsigned long ops_request) if (test_and_clear_bit(R5_Overlap, &dev->flags)) wake_up(&sh->raid_conf->wait_for_overlap); } - put_cpu(); + spin_unlock(&percpu->lock); + put_cpu_light(); } static void free_stripe(struct kmem_cache *sc, struct stripe_head *sh) @@ -7099,6 +7101,7 @@ static int raid456_cpu_up_prepare(unsigned int cpu, struct hlist_node *node) __func__, cpu); return -ENOMEM; } + spin_lock_init(&per_cpu_ptr(conf->percpu, cpu)->lock); return 0; } diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h index 5c05acf20..665fe138a 100644 --- a/drivers/md/raid5.h +++ b/drivers/md/raid5.h @@ -635,6 +635,7 @@ struct r5conf { int recovery_disabled; /* per cpu variables */ struct raid5_percpu { + spinlock_t lock; /* Protection for -RT */ struct page *spare_page; /* Used when checking P/Q in raid6 */ void *scribble; /* space for constructing buffer * lists and performing address diff --git a/drivers/mtd/mtdoops.c b/drivers/mtd/mtdoops.c index 774970bfc..6bc2c728a 100644 --- a/drivers/mtd/mtdoops.c +++ b/drivers/mtd/mtdoops.c @@ -267,7 +267,8 @@ static void find_next_position(struct mtdoops_context *cxt) } static void mtdoops_do_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) { struct mtdoops_context *cxt = container_of(dumper, struct mtdoops_context, dump); @@ -276,7 +277,7 @@ static void mtdoops_do_dump(struct kmsg_dumper *dumper, if (reason == KMSG_DUMP_OOPS && !dump_oops) return; - kmsg_dump_get_buffer(dumper, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, + kmsg_dump_get_buffer(iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE, record_size - MTDOOPS_HEADER_SIZE, NULL); if (reason != KMSG_DUMP_OOPS) { diff --git a/drivers/net/arcnet/arc-rimi.c b/drivers/net/arcnet/arc-rimi.c index 98df38fe5..12d085405 100644 --- a/drivers/net/arcnet/arc-rimi.c +++ b/drivers/net/arcnet/arc-rimi.c @@ -332,7 +332,7 @@ static int __init arc_rimi_init(void) dev->irq = 9; if (arcrimi_probe(dev)) { - free_netdev(dev); + free_arcdev(dev); return -EIO; } @@ -349,7 +349,7 @@ static void __exit arc_rimi_exit(void) iounmap(lp->mem_start); release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); free_irq(dev->irq, dev); - free_netdev(dev); + free_arcdev(dev); } #ifndef MODULE diff --git a/drivers/net/arcnet/arcdevice.h b/drivers/net/arcnet/arcdevice.h index 22a49c6d7..5d4a4c7ef 100644 --- a/drivers/net/arcnet/arcdevice.h +++ b/drivers/net/arcnet/arcdevice.h @@ -298,6 +298,10 @@ struct arcnet_local { int excnak_pending; /* We just got an excesive nak interrupt */ + /* RESET flag handling */ + int reset_in_progress; + struct work_struct reset_work; + struct { uint16_t sequence; /* sequence number (incs with each packet) */ __be16 aborted_seq; @@ -350,7 +354,9 @@ void arcnet_dump_skb(struct net_device *dev, struct sk_buff *skb, char *desc) void arcnet_unregister_proto(struct ArcProto *proto); irqreturn_t arcnet_interrupt(int irq, void *dev_id); + struct net_device *alloc_arcdev(const char *name); +void free_arcdev(struct net_device *dev); int arcnet_open(struct net_device *dev); int arcnet_close(struct net_device *dev); diff --git a/drivers/net/arcnet/arcnet.c b/drivers/net/arcnet/arcnet.c index e04efc0a5..d76dd7d14 100644 --- a/drivers/net/arcnet/arcnet.c +++ b/drivers/net/arcnet/arcnet.c @@ -387,10 +387,44 @@ static void arcnet_timer(struct timer_list *t) struct arcnet_local *lp = from_timer(lp, t, timer); struct net_device *dev = lp->dev; - if (!netif_carrier_ok(dev)) { + spin_lock_irq(&lp->lock); + + if (!lp->reset_in_progress && !netif_carrier_ok(dev)) { netif_carrier_on(dev); netdev_info(dev, "link up\n"); } + + spin_unlock_irq(&lp->lock); +} + +static void reset_device_work(struct work_struct *work) +{ + struct arcnet_local *lp; + struct net_device *dev; + + lp = container_of(work, struct arcnet_local, reset_work); + dev = lp->dev; + + /* Do not bring the network interface back up if an ifdown + * was already done. + */ + if (!netif_running(dev) || !lp->reset_in_progress) + return; + + rtnl_lock(); + + /* Do another check, in case of an ifdown that was triggered in + * the small race window between the exit condition above and + * acquiring RTNL. + */ + if (!netif_running(dev) || !lp->reset_in_progress) + goto out; + + dev_close(dev); + dev_open(dev, NULL); + +out: + rtnl_unlock(); } static void arcnet_reply_tasklet(unsigned long data) @@ -452,12 +486,25 @@ struct net_device *alloc_arcdev(const char *name) lp->dev = dev; spin_lock_init(&lp->lock); timer_setup(&lp->timer, arcnet_timer, 0); + INIT_WORK(&lp->reset_work, reset_device_work); } return dev; } EXPORT_SYMBOL(alloc_arcdev); +void free_arcdev(struct net_device *dev) +{ + struct arcnet_local *lp = netdev_priv(dev); + + /* Do not cancel this at ->ndo_close(), as the workqueue itself + * indirectly calls the ifdown path through dev_close(). + */ + cancel_work_sync(&lp->reset_work); + free_netdev(dev); +} +EXPORT_SYMBOL(free_arcdev); + /* Open/initialize the board. This is called sometime after booting when * the 'ifconfig' program is run. * @@ -587,6 +634,10 @@ int arcnet_close(struct net_device *dev) /* shut down the card */ lp->hw.close(dev); + + /* reset counters */ + lp->reset_in_progress = 0; + module_put(lp->hw.owner); return 0; } @@ -820,6 +871,9 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) spin_lock_irqsave(&lp->lock, flags); + if (lp->reset_in_progress) + goto out; + /* RESET flag was enabled - if device is not running, we must * clear it right away (but nothing else). */ @@ -852,11 +906,14 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) if (status & RESETflag) { arc_printk(D_NORMAL, dev, "spurious reset (status=%Xh)\n", status); - arcnet_close(dev); - arcnet_open(dev); + + lp->reset_in_progress = 1; + netif_stop_queue(dev); + netif_carrier_off(dev); + schedule_work(&lp->reset_work); /* get out of the interrupt handler! */ - break; + goto out; } /* RX is inhibited - we must have received something. * Prepare to receive into the next buffer. @@ -1052,6 +1109,7 @@ irqreturn_t arcnet_interrupt(int irq, void *dev_id) udelay(1); lp->hw.intmask(dev, lp->intmask); +out: spin_unlock_irqrestore(&lp->lock, flags); return retval; } diff --git a/drivers/net/arcnet/com20020-isa.c b/drivers/net/arcnet/com20020-isa.c index f983c4ce6..be618e4b9 100644 --- a/drivers/net/arcnet/com20020-isa.c +++ b/drivers/net/arcnet/com20020-isa.c @@ -169,7 +169,7 @@ static int __init com20020_init(void) dev->irq = 9; if (com20020isa_probe(dev)) { - free_netdev(dev); + free_arcdev(dev); return -EIO; } @@ -182,7 +182,7 @@ static void __exit com20020_exit(void) unregister_netdev(my_dev); free_irq(my_dev->irq, my_dev); release_region(my_dev->base_addr, ARCNET_TOTAL_SIZE); - free_netdev(my_dev); + free_arcdev(my_dev); } #ifndef MODULE diff --git a/drivers/net/arcnet/com20020-pci.c b/drivers/net/arcnet/com20020-pci.c index eb7f76753..8bdc44b7e 100644 --- a/drivers/net/arcnet/com20020-pci.c +++ b/drivers/net/arcnet/com20020-pci.c @@ -291,7 +291,7 @@ static void com20020pci_remove(struct pci_dev *pdev) unregister_netdev(dev); free_irq(dev->irq, dev); - free_netdev(dev); + free_arcdev(dev); } } diff --git a/drivers/net/arcnet/com20020_cs.c b/drivers/net/arcnet/com20020_cs.c index cf607ffcf..9cc5eb6a8 100644 --- a/drivers/net/arcnet/com20020_cs.c +++ b/drivers/net/arcnet/com20020_cs.c @@ -177,7 +177,7 @@ static void com20020_detach(struct pcmcia_device *link) dev = info->dev; if (dev) { dev_dbg(&link->dev, "kfree...\n"); - free_netdev(dev); + free_arcdev(dev); } dev_dbg(&link->dev, "kfree2...\n"); kfree(info); diff --git a/drivers/net/arcnet/com90io.c b/drivers/net/arcnet/com90io.c index cf214b730..3856b447d 100644 --- a/drivers/net/arcnet/com90io.c +++ b/drivers/net/arcnet/com90io.c @@ -396,7 +396,7 @@ static int __init com90io_init(void) err = com90io_probe(dev); if (err) { - free_netdev(dev); + free_arcdev(dev); return err; } @@ -419,7 +419,7 @@ static void __exit com90io_exit(void) free_irq(dev->irq, dev); release_region(dev->base_addr, ARCNET_TOTAL_SIZE); - free_netdev(dev); + free_arcdev(dev); } module_init(com90io_init) diff --git a/drivers/net/arcnet/com90xx.c b/drivers/net/arcnet/com90xx.c index 3dc3d533c..d8dfb9ea0 100644 --- a/drivers/net/arcnet/com90xx.c +++ b/drivers/net/arcnet/com90xx.c @@ -554,7 +554,7 @@ static int __init com90xx_found(int ioaddr, int airq, u_long shmem, err_release_mem: release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); err_free_dev: - free_netdev(dev); + free_arcdev(dev); return -EIO; } @@ -672,7 +672,7 @@ static void __exit com90xx_exit(void) release_region(dev->base_addr, ARCNET_TOTAL_SIZE); release_mem_region(dev->mem_start, dev->mem_end - dev->mem_start + 1); - free_netdev(dev); + free_arcdev(dev); } } diff --git a/drivers/net/ethernet/chelsio/cxgb/common.h b/drivers/net/ethernet/chelsio/cxgb/common.h index 647506064..0321be773 100644 --- a/drivers/net/ethernet/chelsio/cxgb/common.h +++ b/drivers/net/ethernet/chelsio/cxgb/common.h @@ -238,7 +238,6 @@ struct adapter { int msg_enable; u32 mmio_len; - struct work_struct ext_intr_handler_task; struct adapter_params params; /* Terminator modules. */ @@ -257,6 +256,7 @@ struct adapter { /* guards async operations */ spinlock_t async_lock ____cacheline_aligned; + u32 pending_thread_intr; u32 slow_intr_mask; int t1powersave; }; @@ -334,8 +334,7 @@ void t1_interrupts_enable(adapter_t *adapter); void t1_interrupts_disable(adapter_t *adapter); void t1_interrupts_clear(adapter_t *adapter); int t1_elmer0_ext_intr_handler(adapter_t *adapter); -void t1_elmer0_ext_intr(adapter_t *adapter); -int t1_slow_intr_handler(adapter_t *adapter); +irqreturn_t t1_slow_intr_handler(adapter_t *adapter); int t1_link_start(struct cphy *phy, struct cmac *mac, struct link_config *lc); const struct board_info *t1_get_board_info(unsigned int board_id); @@ -347,7 +346,6 @@ int t1_get_board_rev(adapter_t *adapter, const struct board_info *bi, int t1_init_hw_modules(adapter_t *adapter); int t1_init_sw_modules(adapter_t *adapter, const struct board_info *bi); void t1_free_sw_modules(adapter_t *adapter); -void t1_fatal_err(adapter_t *adapter); void t1_link_changed(adapter_t *adapter, int port_id); void t1_link_negotiated(adapter_t *adapter, int port_id, int link_stat, int speed, int duplex, int pause); diff --git a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c index 1311eac9e..c827273c4 100644 --- a/drivers/net/ethernet/chelsio/cxgb/cxgb2.c +++ b/drivers/net/ethernet/chelsio/cxgb/cxgb2.c @@ -211,9 +211,10 @@ static int cxgb_up(struct adapter *adapter) t1_interrupts_clear(adapter); adapter->params.has_msi = !disable_msi && !pci_enable_msi(adapter->pdev); - err = request_irq(adapter->pdev->irq, t1_interrupt, - adapter->params.has_msi ? 0 : IRQF_SHARED, - adapter->name, adapter); + err = request_threaded_irq(adapter->pdev->irq, t1_interrupt, + t1_interrupt_thread, + adapter->params.has_msi ? 0 : IRQF_SHARED, + adapter->name, adapter); if (err) { if (adapter->params.has_msi) pci_disable_msi(adapter->pdev); @@ -924,51 +925,6 @@ static void mac_stats_task(struct work_struct *work) spin_unlock(&adapter->work_lock); } -/* - * Processes elmer0 external interrupts in process context. - */ -static void ext_intr_task(struct work_struct *work) -{ - struct adapter *adapter = - container_of(work, struct adapter, ext_intr_handler_task); - - t1_elmer0_ext_intr_handler(adapter); - - /* Now reenable external interrupts */ - spin_lock_irq(&adapter->async_lock); - adapter->slow_intr_mask |= F_PL_INTR_EXT; - writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); - writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, - adapter->regs + A_PL_ENABLE); - spin_unlock_irq(&adapter->async_lock); -} - -/* - * Interrupt-context handler for elmer0 external interrupts. - */ -void t1_elmer0_ext_intr(struct adapter *adapter) -{ - /* - * Schedule a task to handle external interrupts as we require - * a process context. We disable EXT interrupts in the interim - * and let the task reenable them when it's done. - */ - adapter->slow_intr_mask &= ~F_PL_INTR_EXT; - writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, - adapter->regs + A_PL_ENABLE); - schedule_work(&adapter->ext_intr_handler_task); -} - -void t1_fatal_err(struct adapter *adapter) -{ - if (adapter->flags & FULL_INIT_DONE) { - t1_sge_stop(adapter->sge); - t1_interrupts_disable(adapter); - } - pr_alert("%s: encountered fatal error, operation suspended\n", - adapter->name); -} - static const struct net_device_ops cxgb_netdev_ops = { .ndo_open = cxgb_open, .ndo_stop = cxgb_close, @@ -1070,8 +1026,6 @@ static int init_one(struct pci_dev *pdev, const struct pci_device_id *ent) spin_lock_init(&adapter->async_lock); spin_lock_init(&adapter->mac_lock); - INIT_WORK(&adapter->ext_intr_handler_task, - ext_intr_task); INIT_DELAYED_WORK(&adapter->stats_update_task, mac_stats_task); diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.c b/drivers/net/ethernet/chelsio/cxgb/sge.c index 2d9c2b5a6..cda01f22c 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.c +++ b/drivers/net/ethernet/chelsio/cxgb/sge.c @@ -940,10 +940,11 @@ void t1_sge_intr_clear(struct sge *sge) /* * SGE 'Error' interrupt handler */ -int t1_sge_intr_error_handler(struct sge *sge) +bool t1_sge_intr_error_handler(struct sge *sge) { struct adapter *adapter = sge->adapter; u32 cause = readl(adapter->regs + A_SG_INT_CAUSE); + bool wake = false; if (adapter->port[0].dev->hw_features & NETIF_F_TSO) cause &= ~F_PACKET_TOO_BIG; @@ -967,11 +968,14 @@ int t1_sge_intr_error_handler(struct sge *sge) sge->stats.pkt_mismatch++; pr_alert("%s: SGE packet mismatch\n", adapter->name); } - if (cause & SGE_INT_FATAL) - t1_fatal_err(adapter); + if (cause & SGE_INT_FATAL) { + t1_interrupts_disable(adapter); + adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; + wake = true; + } writel(cause, adapter->regs + A_SG_INT_CAUSE); - return 0; + return wake; } const struct sge_intr_counts *t1_sge_get_intr_counts(const struct sge *sge) @@ -1619,11 +1623,46 @@ int t1_poll(struct napi_struct *napi, int budget) return work_done; } +irqreturn_t t1_interrupt_thread(int irq, void *data) +{ + struct adapter *adapter = data; + u32 pending_thread_intr; + + spin_lock_irq(&adapter->async_lock); + pending_thread_intr = adapter->pending_thread_intr; + adapter->pending_thread_intr = 0; + spin_unlock_irq(&adapter->async_lock); + + if (!pending_thread_intr) + return IRQ_NONE; + + if (pending_thread_intr & F_PL_INTR_EXT) + t1_elmer0_ext_intr_handler(adapter); + + /* This error is fatal, interrupts remain off */ + if (pending_thread_intr & F_PL_INTR_SGE_ERR) { + pr_alert("%s: encountered fatal error, operation suspended\n", + adapter->name); + t1_sge_stop(adapter->sge); + return IRQ_HANDLED; + } + + spin_lock_irq(&adapter->async_lock); + adapter->slow_intr_mask |= F_PL_INTR_EXT; + + writel(F_PL_INTR_EXT, adapter->regs + A_PL_CAUSE); + writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, + adapter->regs + A_PL_ENABLE); + spin_unlock_irq(&adapter->async_lock); + + return IRQ_HANDLED; +} + irqreturn_t t1_interrupt(int irq, void *data) { struct adapter *adapter = data; struct sge *sge = adapter->sge; - int handled; + irqreturn_t handled; if (likely(responses_pending(adapter))) { writel(F_PL_INTR_SGE_DATA, adapter->regs + A_PL_CAUSE); @@ -1645,10 +1684,10 @@ irqreturn_t t1_interrupt(int irq, void *data) handled = t1_slow_intr_handler(adapter); spin_unlock(&adapter->async_lock); - if (!handled) + if (handled == IRQ_NONE) sge->stats.unhandled_irqs++; - return IRQ_RETVAL(handled != 0); + return handled; } /* diff --git a/drivers/net/ethernet/chelsio/cxgb/sge.h b/drivers/net/ethernet/chelsio/cxgb/sge.h index a1ba591b3..716705b96 100644 --- a/drivers/net/ethernet/chelsio/cxgb/sge.h +++ b/drivers/net/ethernet/chelsio/cxgb/sge.h @@ -74,6 +74,7 @@ struct sge *t1_sge_create(struct adapter *, struct sge_params *); int t1_sge_configure(struct sge *, struct sge_params *); int t1_sge_set_coalesce_params(struct sge *, struct sge_params *); void t1_sge_destroy(struct sge *); +irqreturn_t t1_interrupt_thread(int irq, void *data); irqreturn_t t1_interrupt(int irq, void *cookie); int t1_poll(struct napi_struct *, int); @@ -81,7 +82,7 @@ netdev_tx_t t1_start_xmit(struct sk_buff *skb, struct net_device *dev); void t1_vlan_mode(struct adapter *adapter, netdev_features_t features); void t1_sge_start(struct sge *); void t1_sge_stop(struct sge *); -int t1_sge_intr_error_handler(struct sge *); +bool t1_sge_intr_error_handler(struct sge *sge); void t1_sge_intr_enable(struct sge *); void t1_sge_intr_disable(struct sge *); void t1_sge_intr_clear(struct sge *); diff --git a/drivers/net/ethernet/chelsio/cxgb/subr.c b/drivers/net/ethernet/chelsio/cxgb/subr.c index ea0f8741d..310add28f 100644 --- a/drivers/net/ethernet/chelsio/cxgb/subr.c +++ b/drivers/net/ethernet/chelsio/cxgb/subr.c @@ -170,7 +170,7 @@ void t1_link_changed(adapter_t *adapter, int port_id) t1_link_negotiated(adapter, port_id, link_ok, speed, duplex, fc); } -static int t1_pci_intr_handler(adapter_t *adapter) +static bool t1_pci_intr_handler(adapter_t *adapter) { u32 pcix_cause; @@ -179,9 +179,13 @@ static int t1_pci_intr_handler(adapter_t *adapter) if (pcix_cause) { pci_write_config_dword(adapter->pdev, A_PCICFG_INTR_CAUSE, pcix_cause); - t1_fatal_err(adapter); /* PCI errors are fatal */ + /* PCI errors are fatal */ + t1_interrupts_disable(adapter); + adapter->pending_thread_intr |= F_PL_INTR_SGE_ERR; + pr_alert("%s: PCI error encountered.\n", adapter->name); + return true; } - return 0; + return false; } #ifdef CONFIG_CHELSIO_T1_1G @@ -210,13 +214,16 @@ static int fpga_phy_intr_handler(adapter_t *adapter) /* * Slow path interrupt handler for FPGAs. */ -static int fpga_slow_intr(adapter_t *adapter) +static irqreturn_t fpga_slow_intr(adapter_t *adapter) { u32 cause = readl(adapter->regs + A_PL_CAUSE); + irqreturn_t ret = IRQ_NONE; cause &= ~F_PL_INTR_SGE_DATA; - if (cause & F_PL_INTR_SGE_ERR) - t1_sge_intr_error_handler(adapter->sge); + if (cause & F_PL_INTR_SGE_ERR) { + if (t1_sge_intr_error_handler(adapter->sge)) + ret = IRQ_WAKE_THREAD; + } if (cause & FPGA_PCIX_INTERRUPT_GMAC) fpga_phy_intr_handler(adapter); @@ -231,14 +238,19 @@ static int fpga_slow_intr(adapter_t *adapter) /* Clear TP interrupt */ writel(tp_cause, adapter->regs + FPGA_TP_ADDR_INTERRUPT_CAUSE); } - if (cause & FPGA_PCIX_INTERRUPT_PCIX) - t1_pci_intr_handler(adapter); + if (cause & FPGA_PCIX_INTERRUPT_PCIX) { + if (t1_pci_intr_handler(adapter)) + ret = IRQ_WAKE_THREAD; + } /* Clear the interrupts just processed. */ if (cause) writel(cause, adapter->regs + A_PL_CAUSE); - return cause != 0; + if (ret != IRQ_NONE) + return ret; + + return cause == 0 ? IRQ_NONE : IRQ_HANDLED; } #endif @@ -842,31 +854,45 @@ void t1_interrupts_clear(adapter_t* adapter) /* * Slow path interrupt handler for ASICs. */ -static int asic_slow_intr(adapter_t *adapter) +static irqreturn_t asic_slow_intr(adapter_t *adapter) { u32 cause = readl(adapter->regs + A_PL_CAUSE); + irqreturn_t ret = IRQ_HANDLED; cause &= adapter->slow_intr_mask; if (!cause) - return 0; - if (cause & F_PL_INTR_SGE_ERR) - t1_sge_intr_error_handler(adapter->sge); + return IRQ_NONE; + if (cause & F_PL_INTR_SGE_ERR) { + if (t1_sge_intr_error_handler(adapter->sge)) + ret = IRQ_WAKE_THREAD; + } if (cause & F_PL_INTR_TP) t1_tp_intr_handler(adapter->tp); if (cause & F_PL_INTR_ESPI) t1_espi_intr_handler(adapter->espi); - if (cause & F_PL_INTR_PCIX) - t1_pci_intr_handler(adapter); - if (cause & F_PL_INTR_EXT) - t1_elmer0_ext_intr(adapter); + if (cause & F_PL_INTR_PCIX) { + if (t1_pci_intr_handler(adapter)) + ret = IRQ_WAKE_THREAD; + } + if (cause & F_PL_INTR_EXT) { + /* Wake the threaded interrupt to handle external interrupts as + * we require a process context. We disable EXT interrupts in + * the interim and let the thread reenable them when it's done. + */ + adapter->pending_thread_intr |= F_PL_INTR_EXT; + adapter->slow_intr_mask &= ~F_PL_INTR_EXT; + writel(adapter->slow_intr_mask | F_PL_INTR_SGE_DATA, + adapter->regs + A_PL_ENABLE); + ret = IRQ_WAKE_THREAD; + } /* Clear the interrupts just processed. */ writel(cause, adapter->regs + A_PL_CAUSE); readl(adapter->regs + A_PL_CAUSE); /* flush writes */ - return 1; + return ret; } -int t1_slow_intr_handler(adapter_t *adapter) +irqreturn_t t1_slow_intr_handler(adapter_t *adapter) { #ifdef CONFIG_CHELSIO_T1_1G if (!t1_is_asic(adapter)) diff --git a/drivers/net/ethernet/dlink/sundance.c b/drivers/net/ethernet/dlink/sundance.c index e3a885891..df0eab479 100644 --- a/drivers/net/ethernet/dlink/sundance.c +++ b/drivers/net/ethernet/dlink/sundance.c @@ -963,7 +963,7 @@ static void tx_timeout(struct net_device *dev, unsigned int txqueue) unsigned long flag; netif_stop_queue(dev); - tasklet_disable(&np->tx_tasklet); + tasklet_disable_in_atomic(&np->tx_tasklet); iowrite16(0, ioaddr + IntrEnable); printk(KERN_WARNING "%s: Transmit timed out, TxStatus %2.2x " "TxFrameId %2.2x," diff --git a/drivers/net/ethernet/jme.c b/drivers/net/ethernet/jme.c index 4185ca3dd..cf5c33d0f 100644 --- a/drivers/net/ethernet/jme.c +++ b/drivers/net/ethernet/jme.c @@ -1265,9 +1265,9 @@ jme_stop_shutdown_timer(struct jme_adapter *jme) jwrite32f(jme, JME_APMC, apmc); } -static void jme_link_change_tasklet(struct tasklet_struct *t) +static void jme_link_change_work(struct work_struct *work) { - struct jme_adapter *jme = from_tasklet(jme, t, linkch_task); + struct jme_adapter *jme = container_of(work, struct jme_adapter, linkch_task); struct net_device *netdev = jme->dev; int rc; @@ -1510,7 +1510,7 @@ jme_intr_msi(struct jme_adapter *jme, u32 intrstat) * all other events are ignored */ jwrite32(jme, JME_IEVE, intrstat); - tasklet_schedule(&jme->linkch_task); + schedule_work(&jme->linkch_task); goto out_reenable; } @@ -1832,7 +1832,6 @@ jme_open(struct net_device *netdev) jme_clear_pm_disable_wol(jme); JME_NAPI_ENABLE(jme); - tasklet_setup(&jme->linkch_task, jme_link_change_tasklet); tasklet_setup(&jme->txclean_task, jme_tx_clean_tasklet); tasklet_setup(&jme->rxclean_task, jme_rx_clean_tasklet); tasklet_setup(&jme->rxempty_task, jme_rx_empty_tasklet); @@ -1920,7 +1919,7 @@ jme_close(struct net_device *netdev) JME_NAPI_DISABLE(jme); - tasklet_kill(&jme->linkch_task); + cancel_work_sync(&jme->linkch_task); tasklet_kill(&jme->txclean_task); tasklet_kill(&jme->rxclean_task); tasklet_kill(&jme->rxempty_task); @@ -3039,6 +3038,7 @@ jme_init_one(struct pci_dev *pdev, atomic_set(&jme->rx_empty, 1); tasklet_setup(&jme->pcc_task, jme_pcc_tasklet); + INIT_WORK(&jme->linkch_task, jme_link_change_work); jme->dpi.cur = PCC_P1; jme->reg_ghc = 0; diff --git a/drivers/net/ethernet/jme.h b/drivers/net/ethernet/jme.h index a2c3b00d9..2af76329b 100644 --- a/drivers/net/ethernet/jme.h +++ b/drivers/net/ethernet/jme.h @@ -411,7 +411,7 @@ struct jme_adapter { struct tasklet_struct rxempty_task; struct tasklet_struct rxclean_task; struct tasklet_struct txclean_task; - struct tasklet_struct linkch_task; + struct work_struct linkch_task; struct tasklet_struct pcc_task; unsigned long flags; u32 reg_txcs; diff --git a/drivers/net/wireless/ath/ath9k/beacon.c b/drivers/net/wireless/ath/ath9k/beacon.c index 71e2ada86..72e2e71aa 100644 --- a/drivers/net/wireless/ath/ath9k/beacon.c +++ b/drivers/net/wireless/ath/ath9k/beacon.c @@ -251,7 +251,7 @@ void ath9k_beacon_ensure_primary_slot(struct ath_softc *sc) int first_slot = ATH_BCBUF; int slot; - tasklet_disable(&sc->bcon_tasklet); + tasklet_disable_in_atomic(&sc->bcon_tasklet); /* Find first taken slot. */ for (slot = 0; slot < ATH_BCBUF; slot++) { diff --git a/drivers/pci/controller/pci-hyperv.c b/drivers/pci/controller/pci-hyperv.c index ad3e3cde1..c0b8911bd 100644 --- a/drivers/pci/controller/pci-hyperv.c +++ b/drivers/pci/controller/pci-hyperv.c @@ -1457,7 +1457,7 @@ static void hv_compose_msi_msg(struct irq_data *data, struct msi_msg *msg) * Prevents hv_pci_onchannelcallback() from running concurrently * in the tasklet. */ - tasklet_disable(&channel->callback_event); + tasklet_disable_in_atomic(&channel->callback_event); /* * Since this function is called with IRQ locks held, can't diff --git a/drivers/scsi/fcoe/fcoe.c b/drivers/scsi/fcoe/fcoe.c index 0f9274960..dc97e4f1f 100644 --- a/drivers/scsi/fcoe/fcoe.c +++ b/drivers/scsi/fcoe/fcoe.c @@ -1452,11 +1452,11 @@ static int fcoe_rcv(struct sk_buff *skb, struct net_device *netdev, static int fcoe_alloc_paged_crc_eof(struct sk_buff *skb, int tlen) { struct fcoe_percpu_s *fps; - int rc; + int rc, cpu = get_cpu_light(); - fps = &get_cpu_var(fcoe_percpu); + fps = &per_cpu(fcoe_percpu, cpu); rc = fcoe_get_paged_crc_eof(skb, tlen, fps); - put_cpu_var(fcoe_percpu); + put_cpu_light(); return rc; } @@ -1641,11 +1641,11 @@ static inline int fcoe_filter_frames(struct fc_lport *lport, return 0; } - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); stats->InvalidCRCCount++; if (stats->InvalidCRCCount < 5) printk(KERN_WARNING "fcoe: dropping frame with CRC error\n"); - put_cpu(); + put_cpu_light(); return -EINVAL; } @@ -1686,7 +1686,7 @@ static void fcoe_recv_frame(struct sk_buff *skb) */ hp = (struct fcoe_hdr *) skb_network_header(skb); - stats = per_cpu_ptr(lport->stats, get_cpu()); + stats = per_cpu_ptr(lport->stats, get_cpu_light()); if (unlikely(FC_FCOE_DECAPS_VER(hp) != FC_FCOE_VER)) { if (stats->ErrorFrames < 5) printk(KERN_WARNING "fcoe: FCoE version " @@ -1718,13 +1718,13 @@ static void fcoe_recv_frame(struct sk_buff *skb) goto drop; if (!fcoe_filter_frames(lport, fp)) { - put_cpu(); + put_cpu_light(); fc_exch_recv(lport, fp); return; } drop: stats->ErrorFrames++; - put_cpu(); + put_cpu_light(); kfree_skb(skb); } diff --git a/drivers/scsi/fcoe/fcoe_ctlr.c b/drivers/scsi/fcoe/fcoe_ctlr.c index 5ea426eff..0d6b9acc7 100644 --- a/drivers/scsi/fcoe/fcoe_ctlr.c +++ b/drivers/scsi/fcoe/fcoe_ctlr.c @@ -828,7 +828,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) INIT_LIST_HEAD(&del_list); - stats = per_cpu_ptr(fip->lp->stats, get_cpu()); + stats = per_cpu_ptr(fip->lp->stats, get_cpu_light()); list_for_each_entry_safe(fcf, next, &fip->fcfs, list) { deadline = fcf->time + fcf->fka_period + fcf->fka_period / 2; @@ -864,7 +864,7 @@ static unsigned long fcoe_ctlr_age_fcfs(struct fcoe_ctlr *fip) sel_time = fcf->time; } } - put_cpu(); + put_cpu_light(); list_for_each_entry_safe(fcf, next, &del_list, list) { /* Removes fcf from current list */ diff --git a/drivers/scsi/libfc/fc_exch.c b/drivers/scsi/libfc/fc_exch.c index a50f1eef0..0b2acad7c 100644 --- a/drivers/scsi/libfc/fc_exch.c +++ b/drivers/scsi/libfc/fc_exch.c @@ -826,10 +826,10 @@ static struct fc_exch *fc_exch_em_alloc(struct fc_lport *lport, } memset(ep, 0, sizeof(*ep)); - cpu = get_cpu(); + cpu = get_cpu_light(); pool = per_cpu_ptr(mp->pool, cpu); spin_lock_bh(&pool->lock); - put_cpu(); + put_cpu_light(); /* peek cache of free slot */ if (pool->left != FC_XID_UNKNOWN) { diff --git a/drivers/tty/serial/8250/8250.h b/drivers/tty/serial/8250/8250.h index 34aa2714f..42cd2baa7 100644 --- a/drivers/tty/serial/8250/8250.h +++ b/drivers/tty/serial/8250/8250.h @@ -131,12 +131,55 @@ static inline void serial_dl_write(struct uart_8250_port *up, int value) up->dl_write(up, value); } +static inline void serial8250_set_IER(struct uart_8250_port *up, + unsigned char ier) +{ + struct uart_port *port = &up->port; + unsigned int flags; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(&flags); + + serial_out(up, UART_IER, ier); + + if (is_console) + console_atomic_unlock(flags); +} + +static inline unsigned char serial8250_clear_IER(struct uart_8250_port *up) +{ + struct uart_port *port = &up->port; + unsigned int clearval = 0; + unsigned int prior; + unsigned int flags; + bool is_console; + + is_console = uart_console(port); + + if (up->capabilities & UART_CAP_UUE) + clearval = UART_IER_UUE; + + if (is_console) + console_atomic_lock(&flags); + + prior = serial_port_in(port, UART_IER); + serial_port_out(port, UART_IER, clearval); + + if (is_console) + console_atomic_unlock(flags); + + return prior; +} + static inline bool serial8250_set_THRI(struct uart_8250_port *up) { if (up->ier & UART_IER_THRI) return false; up->ier |= UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } @@ -145,7 +188,7 @@ static inline bool serial8250_clear_THRI(struct uart_8250_port *up) if (!(up->ier & UART_IER_THRI)) return false; up->ier &= ~UART_IER_THRI; - serial_out(up, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); return true; } diff --git a/drivers/tty/serial/8250/8250_core.c b/drivers/tty/serial/8250/8250_core.c index cae61d1eb..47dd23056 100644 --- a/drivers/tty/serial/8250/8250_core.c +++ b/drivers/tty/serial/8250/8250_core.c @@ -274,10 +274,8 @@ static void serial8250_backup_timeout(struct timer_list *t) * Must disable interrupts or else we risk racing with the interrupt * based handler. */ - if (up->port.irq) { - ier = serial_in(up, UART_IER); - serial_out(up, UART_IER, 0); - } + if (up->port.irq) + ier = serial8250_clear_IER(up); iir = serial_in(up, UART_IIR); @@ -300,7 +298,7 @@ static void serial8250_backup_timeout(struct timer_list *t) serial8250_tx_chars(up); if (up->port.irq) - serial_out(up, UART_IER, ier); + serial8250_set_IER(up, ier); spin_unlock_irqrestore(&up->port.lock, flags); @@ -578,6 +576,14 @@ serial8250_register_ports(struct uart_driver *drv, struct device *dev) #ifdef CONFIG_SERIAL_8250_CONSOLE +static void univ8250_console_write_atomic(struct console *co, const char *s, + unsigned int count) +{ + struct uart_8250_port *up = &serial8250_ports[co->index]; + + serial8250_console_write_atomic(up, s, count); +} + static void univ8250_console_write(struct console *co, const char *s, unsigned int count) { @@ -671,6 +677,7 @@ static int univ8250_console_match(struct console *co, char *name, int idx, static struct console univ8250_console = { .name = "ttyS", + .write_atomic = univ8250_console_write_atomic, .write = univ8250_console_write, .device = uart_console_device, .setup = univ8250_console_setup, diff --git a/drivers/tty/serial/8250/8250_fsl.c b/drivers/tty/serial/8250/8250_fsl.c index fbcc90c31..b33cb454c 100644 --- a/drivers/tty/serial/8250/8250_fsl.c +++ b/drivers/tty/serial/8250/8250_fsl.c @@ -60,9 +60,18 @@ int fsl8250_handle_irq(struct uart_port *port) /* Stop processing interrupts on input overrun */ if ((orig_lsr & UART_LSR_OE) && (up->overrun_backoff_time_ms > 0)) { + unsigned int ca_flags; unsigned long delay; + bool is_console; + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(&ca_flags); up->ier = port->serial_in(port, UART_IER); + if (is_console) + console_atomic_unlock(ca_flags); + if (up->ier & (UART_IER_RLSI | UART_IER_RDI)) { port->ops->stop_rx(port); } else { diff --git a/drivers/tty/serial/8250/8250_ingenic.c b/drivers/tty/serial/8250/8250_ingenic.c index 988bf6bcc..bcd26d672 100644 --- a/drivers/tty/serial/8250/8250_ingenic.c +++ b/drivers/tty/serial/8250/8250_ingenic.c @@ -146,6 +146,8 @@ OF_EARLYCON_DECLARE(x1000_uart, "ingenic,x1000-uart", static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) { + unsigned int flags; + bool is_console; int ier; switch (offset) { @@ -167,7 +169,12 @@ static void ingenic_uart_serial_out(struct uart_port *p, int offset, int value) * If we have enabled modem status IRQs we should enable * modem mode. */ + is_console = uart_console(p); + if (is_console) + console_atomic_lock(&flags); ier = p->serial_in(p, UART_IER); + if (is_console) + console_atomic_unlock(flags); if (ier & UART_IER_MSI) value |= UART_MCR_MDCE | UART_MCR_FCM; diff --git a/drivers/tty/serial/8250/8250_mtk.c b/drivers/tty/serial/8250/8250_mtk.c index fb65dc601..5bc734c70 100644 --- a/drivers/tty/serial/8250/8250_mtk.c +++ b/drivers/tty/serial/8250/8250_mtk.c @@ -218,12 +218,37 @@ static void mtk8250_shutdown(struct uart_port *port) static void mtk8250_disable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) & (~mask)); + struct uart_port *port = &up->port; + unsigned int flags; + unsigned int ier; + bool is_console; + + is_console = uart_console(port); + + if (is_console) + console_atomic_lock(&flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier & (~mask)); + + if (is_console) + console_atomic_unlock(flags); } static void mtk8250_enable_intrs(struct uart_8250_port *up, int mask) { - serial_out(up, UART_IER, serial_in(up, UART_IER) | mask); + struct uart_port *port = &up->port; + unsigned int flags; + unsigned int ier; + + if (uart_console(port)) + console_atomic_lock(&flags); + + ier = serial_in(up, UART_IER); + serial_out(up, UART_IER, ier | mask); + + if (uart_console(port)) + console_atomic_unlock(flags); } static void mtk8250_set_flow_ctrl(struct uart_8250_port *up, int mode) diff --git a/drivers/tty/serial/8250/8250_port.c b/drivers/tty/serial/8250/8250_port.c index 7c07ebb37..a0a617caa 100644 --- a/drivers/tty/serial/8250/8250_port.c +++ b/drivers/tty/serial/8250/8250_port.c @@ -762,7 +762,7 @@ static void serial8250_set_sleep(struct uart_8250_port *p, int sleep) serial_out(p, UART_EFR, UART_EFR_ECB); serial_out(p, UART_LCR, 0); } - serial_out(p, UART_IER, sleep ? UART_IERX_SLEEP : 0); + serial8250_set_IER(p, sleep ? UART_IERX_SLEEP : 0); if (p->capabilities & UART_CAP_EFR) { serial_out(p, UART_LCR, UART_LCR_CONF_MODE_B); serial_out(p, UART_EFR, efr); @@ -1436,7 +1436,7 @@ static void serial8250_stop_rx(struct uart_port *port) up->ier &= ~(UART_IER_RLSI | UART_IER_RDI); up->port.read_status_mask &= ~UART_LSR_DR; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @@ -1466,7 +1466,7 @@ void serial8250_em485_stop_tx(struct uart_8250_port *p) serial8250_clear_and_reinit_fifos(p); p->ier |= UART_IER_RLSI | UART_IER_RDI; - serial_port_out(&p->port, UART_IER, p->ier); + serial8250_set_IER(p, p->ier); } } EXPORT_SYMBOL_GPL(serial8250_em485_stop_tx); @@ -1694,7 +1694,7 @@ static void serial8250_disable_ms(struct uart_port *port) mctrl_gpio_disable_ms(up->gpios); up->ier &= ~UART_IER_MSI; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); } static void serial8250_enable_ms(struct uart_port *port) @@ -1710,7 +1710,7 @@ static void serial8250_enable_ms(struct uart_port *port) up->ier |= UART_IER_MSI; serial8250_rpm_get(up); - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); serial8250_rpm_put(up); } @@ -2130,14 +2130,7 @@ static void serial8250_put_poll_char(struct uart_port *port, struct uart_8250_port *up = up_to_u8250p(port); serial8250_rpm_get(up); - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); wait_for_xmitr(up, BOTH_EMPTY); /* @@ -2150,7 +2143,7 @@ static void serial8250_put_poll_char(struct uart_port *port, * and restore the IER */ wait_for_xmitr(up, BOTH_EMPTY); - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); serial8250_rpm_put(up); } @@ -2453,7 +2446,7 @@ void serial8250_do_shutdown(struct uart_port *port) */ spin_lock_irqsave(&port->lock, flags); up->ier = 0; - serial_port_out(port, UART_IER, 0); + serial8250_set_IER(up, 0); spin_unlock_irqrestore(&port->lock, flags); synchronize_irq(port->irq); @@ -2809,7 +2802,7 @@ serial8250_do_set_termios(struct uart_port *port, struct ktermios *termios, if (up->capabilities & UART_CAP_RTOIE) up->ier |= UART_IER_RTOIE; - serial_port_out(port, UART_IER, up->ier); + serial8250_set_IER(up, up->ier); if (up->capabilities & UART_CAP_EFR) { unsigned char efr = 0; @@ -3275,7 +3268,7 @@ EXPORT_SYMBOL_GPL(serial8250_set_defaults); #ifdef CONFIG_SERIAL_8250_CONSOLE -static void serial8250_console_putchar(struct uart_port *port, int ch) +static void serial8250_console_putchar_locked(struct uart_port *port, int ch) { struct uart_8250_port *up = up_to_u8250p(port); @@ -3283,6 +3276,18 @@ static void serial8250_console_putchar(struct uart_port *port, int ch) serial_port_out(port, UART_TX, ch); } +static void serial8250_console_putchar(struct uart_port *port, int ch) +{ + struct uart_8250_port *up = up_to_u8250p(port); + unsigned int flags; + + wait_for_xmitr(up, UART_LSR_THRE); + + console_atomic_lock(&flags); + serial8250_console_putchar_locked(port, ch); + console_atomic_unlock(flags); +} + /* * Restore serial console when h/w power-off detected */ @@ -3304,6 +3309,32 @@ static void serial8250_console_restore(struct uart_8250_port *up) serial8250_out_MCR(up, UART_MCR_DTR | UART_MCR_RTS); } +void serial8250_console_write_atomic(struct uart_8250_port *up, + const char *s, unsigned int count) +{ + struct uart_port *port = &up->port; + unsigned int flags; + unsigned int ier; + + console_atomic_lock(&flags); + + touch_nmi_watchdog(); + + ier = serial8250_clear_IER(up); + + if (atomic_fetch_inc(&up->console_printing)) { + uart_console_write(port, "\n", 1, + serial8250_console_putchar_locked); + } + uart_console_write(port, s, count, serial8250_console_putchar_locked); + atomic_dec(&up->console_printing); + + wait_for_xmitr(up, BOTH_EMPTY); + serial8250_set_IER(up, ier); + + console_atomic_unlock(flags); +} + /* * Print a string to the serial port trying not to disturb * any possible real use of the port... @@ -3320,24 +3351,12 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, struct uart_port *port = &up->port; unsigned long flags; unsigned int ier; - int locked = 1; touch_nmi_watchdog(); - if (oops_in_progress) - locked = spin_trylock_irqsave(&port->lock, flags); - else - spin_lock_irqsave(&port->lock, flags); - - /* - * First save the IER then disable the interrupts - */ - ier = serial_port_in(port, UART_IER); + spin_lock_irqsave(&port->lock, flags); - if (up->capabilities & UART_CAP_UUE) - serial_port_out(port, UART_IER, UART_IER_UUE); - else - serial_port_out(port, UART_IER, 0); + ier = serial8250_clear_IER(up); /* check scratch reg to see if port powered off during system sleep */ if (up->canary && (up->canary != serial_port_in(port, UART_SCR))) { @@ -3351,7 +3370,9 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, mdelay(port->rs485.delay_rts_before_send); } + atomic_inc(&up->console_printing); uart_console_write(port, s, count, serial8250_console_putchar); + atomic_dec(&up->console_printing); /* * Finally, wait for transmitter to become empty @@ -3364,8 +3385,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (em485->tx_stopped) up->rs485_stop_tx(up); } - - serial_port_out(port, UART_IER, ier); + serial8250_set_IER(up, ier); /* * The receive handling will happen properly because the @@ -3377,8 +3397,7 @@ void serial8250_console_write(struct uart_8250_port *up, const char *s, if (up->msr_saved_flags) serial8250_modem_status(up); - if (locked) - spin_unlock_irqrestore(&port->lock, flags); + spin_unlock_irqrestore(&port->lock, flags); } static unsigned int probe_baud(struct uart_port *port) @@ -3398,6 +3417,7 @@ static unsigned int probe_baud(struct uart_port *port) int serial8250_console_setup(struct uart_port *port, char *options, bool probe) { + struct uart_8250_port *up = up_to_u8250p(port); int baud = 9600; int bits = 8; int parity = 'n'; @@ -3407,6 +3427,8 @@ int serial8250_console_setup(struct uart_port *port, char *options, bool probe) if (!port->iobase && !port->membase) return -ENODEV; + atomic_set(&up->console_printing, 0); + if (options) uart_parse_options(options, &baud, &parity, &bits, &flow); else if (probe) diff --git a/drivers/tty/serial/amba-pl011.c b/drivers/tty/serial/amba-pl011.c index 931906713..a2c46ecee 100644 --- a/drivers/tty/serial/amba-pl011.c +++ b/drivers/tty/serial/amba-pl011.c @@ -2325,18 +2325,24 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) { struct uart_amba_port *uap = amba_ports[co->index]; unsigned int old_cr = 0, new_cr; - unsigned long flags; + unsigned long flags = 0; int locked = 1; clk_enable(uap->clk); - local_irq_save(flags); + /* + * local_irq_save(flags); + * + * This local_irq_save() is nonsense. If we come in via sysrq + * handling then interrupts are already disabled. Aside of + * that the port.sysrq check is racy on SMP regardless. + */ if (uap->port.sysrq) locked = 0; else if (oops_in_progress) - locked = spin_trylock(&uap->port.lock); + locked = spin_trylock_irqsave(&uap->port.lock, flags); else - spin_lock(&uap->port.lock); + spin_lock_irqsave(&uap->port.lock, flags); /* * First save the CR then disable the interrupts @@ -2362,8 +2368,7 @@ pl011_console_write(struct console *co, const char *s, unsigned int count) pl011_write(old_cr, uap, REG_CR); if (locked) - spin_unlock(&uap->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&uap->port.lock, flags); clk_disable(uap->clk); } diff --git a/drivers/tty/serial/omap-serial.c b/drivers/tty/serial/omap-serial.c index 84e815808..342005ed5 100644 --- a/drivers/tty/serial/omap-serial.c +++ b/drivers/tty/serial/omap-serial.c @@ -1311,13 +1311,10 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_get_sync(up->dev); - local_irq_save(flags); - if (up->port.sysrq) - locked = 0; - else if (oops_in_progress) - locked = spin_trylock(&up->port.lock); + if (up->port.sysrq || oops_in_progress) + locked = spin_trylock_irqsave(&up->port.lock, flags); else - spin_lock(&up->port.lock); + spin_lock_irqsave(&up->port.lock, flags); /* * First save the IER then disable the interrupts @@ -1346,8 +1343,7 @@ serial_omap_console_write(struct console *co, const char *s, pm_runtime_mark_last_busy(up->dev); pm_runtime_put_autosuspend(up->dev); if (locked) - spin_unlock(&up->port.lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&up->port.lock, flags); } static int __init diff --git a/drivers/tty/tty_buffer.c b/drivers/tty/tty_buffer.c index 713cfa72d..0fc473321 100644 --- a/drivers/tty/tty_buffer.c +++ b/drivers/tty/tty_buffer.c @@ -172,9 +172,7 @@ static struct tty_buffer *tty_buffer_alloc(struct tty_port *port, size_t size) have queued and recycle that ? */ if (atomic_read(&port->buf.mem_used) > port->buf.mem_limit) return NULL; - printk_safe_enter(); p = kmalloc(sizeof(struct tty_buffer) + 2 * size, GFP_ATOMIC); - printk_safe_exit(); if (p == NULL) return NULL; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index dae9a57d7..9a6a0ec4d 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -239,7 +239,7 @@ int afs_silly_iput(struct dentry *dentry, struct inode *inode) struct dentry *alias; int ret; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); _enter("%p{%pd},%llx", dentry, dentry, vnode->fid.vnode); diff --git a/fs/aio.c b/fs/aio.c index 5e5333d72..cc16ac777 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -43,7 +43,6 @@ #include <linux/mount.h> #include <linux/pseudo_fs.h> -#include <asm/kmap_types.h> #include <linux/uaccess.h> #include <linux/nospec.h> @@ -1762,7 +1761,7 @@ static int aio_poll_wake(struct wait_queue_entry *wait, unsigned mode, int sync, list_del_init(&req->wait.entry); list_del(&iocb->ki_list); iocb->ki_res.res = mangle_poll(mask); - if (iocb->ki_eventfd && eventfd_signal_count()) { + if (iocb->ki_eventfd && !eventfd_signal_allowed()) { iocb = NULL; INIT_WORK(&req->work, aio_poll_put_work); schedule_work(&req->work); diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index bcc6848bb..fabbf6cc4 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -17,7 +17,6 @@ #include <linux/wait.h> #include <linux/slab.h> #include <trace/events/btrfs.h> -#include <asm/kmap_types.h> #include <asm/unaligned.h> #include <linux/pagemap.h> #include <linux/btrfs.h> diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 799be3a5d..d5165a7da 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -81,7 +81,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, struct inode *inode; struct super_block *sb = parent->d_sb; struct cifs_sb_info *cifs_sb = CIFS_SB(sb); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); diff --git a/fs/dcache.c b/fs/dcache.c index f5b78cc80..b2e0d1a07 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2566,9 +2566,10 @@ EXPORT_SYMBOL(d_rehash); static inline unsigned start_dir_add(struct inode *dir) { + preempt_disable_rt(); for (;;) { - unsigned n = dir->i_dir_seq; - if (!(n & 1) && cmpxchg(&dir->i_dir_seq, n, n + 1) == n) + unsigned n = dir->__i_dir_seq; + if (!(n & 1) && cmpxchg(&dir->__i_dir_seq, n, n + 1) == n) return n; cpu_relax(); } @@ -2576,26 +2577,30 @@ static inline unsigned start_dir_add(struct inode *dir) static inline void end_dir_add(struct inode *dir, unsigned n) { - smp_store_release(&dir->i_dir_seq, n + 2); + smp_store_release(&dir->__i_dir_seq, n + 2); + preempt_enable_rt(); } static void d_wait_lookup(struct dentry *dentry) { - if (d_in_lookup(dentry)) { - DECLARE_WAITQUEUE(wait, current); - add_wait_queue(dentry->d_wait, &wait); - do { - set_current_state(TASK_UNINTERRUPTIBLE); - spin_unlock(&dentry->d_lock); - schedule(); - spin_lock(&dentry->d_lock); - } while (d_in_lookup(dentry)); - } + struct swait_queue __wait; + + if (!d_in_lookup(dentry)) + return; + + INIT_LIST_HEAD(&__wait.task_list); + do { + prepare_to_swait_exclusive(dentry->d_wait, &__wait, TASK_UNINTERRUPTIBLE); + spin_unlock(&dentry->d_lock); + schedule(); + spin_lock(&dentry->d_lock); + } while (d_in_lookup(dentry)); + finish_swait(dentry->d_wait, &__wait); } struct dentry *d_alloc_parallel(struct dentry *parent, const struct qstr *name, - wait_queue_head_t *wq) + struct swait_queue_head *wq) { unsigned int hash = name->hash; struct hlist_bl_head *b = in_lookup_hash(parent, hash); @@ -2609,7 +2614,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, retry: rcu_read_lock(); - seq = smp_load_acquire(&parent->d_inode->i_dir_seq); + seq = smp_load_acquire(&parent->d_inode->__i_dir_seq); r_seq = read_seqbegin(&rename_lock); dentry = __d_lookup_rcu(parent, name, &d_seq); if (unlikely(dentry)) { @@ -2637,7 +2642,7 @@ struct dentry *d_alloc_parallel(struct dentry *parent, } hlist_bl_lock(b); - if (unlikely(READ_ONCE(parent->d_inode->i_dir_seq) != seq)) { + if (unlikely(READ_ONCE(parent->d_inode->__i_dir_seq) != seq)) { hlist_bl_unlock(b); rcu_read_unlock(); goto retry; @@ -2710,7 +2715,7 @@ void __d_lookup_done(struct dentry *dentry) hlist_bl_lock(b); dentry->d_flags &= ~DCACHE_PAR_LOOKUP; __hlist_bl_del(&dentry->d_u.d_in_lookup_hash); - wake_up_all(dentry->d_wait); + swake_up_all(dentry->d_wait); dentry->d_wait = NULL; hlist_bl_unlock(b); INIT_HLIST_NODE(&dentry->d_u.d_alias); diff --git a/fs/eventfd.c b/fs/eventfd.c index df466ef81..9035ca60b 100644 --- a/fs/eventfd.c +++ b/fs/eventfd.c @@ -25,8 +25,6 @@ #include <linux/idr.h> #include <linux/uio.h> -DEFINE_PER_CPU(int, eventfd_wake_count); - static DEFINE_IDA(eventfd_ida); struct eventfd_ctx { @@ -67,21 +65,21 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n) * Deadlock or stack overflow issues can happen if we recurse here * through waitqueue wakeup handlers. If the caller users potentially * nested waitqueues with custom wakeup handlers, then it should - * check eventfd_signal_count() before calling this function. If - * it returns true, the eventfd_signal() call should be deferred to a + * check eventfd_signal_allowed() before calling this function. If + * it returns false, the eventfd_signal() call should be deferred to a * safe context. */ - if (WARN_ON_ONCE(this_cpu_read(eventfd_wake_count))) + if (WARN_ON_ONCE(current->in_eventfd_signal)) return 0; spin_lock_irqsave(&ctx->wqh.lock, flags); - this_cpu_inc(eventfd_wake_count); + current->in_eventfd_signal = 1; if (ULLONG_MAX - ctx->count < n) n = ULLONG_MAX - ctx->count; ctx->count += n; if (waitqueue_active(&ctx->wqh)) wake_up_locked_poll(&ctx->wqh, EPOLLIN); - this_cpu_dec(eventfd_wake_count); + current->in_eventfd_signal = 0; spin_unlock_irqrestore(&ctx->wqh.lock, flags); return n; diff --git a/fs/fscache/internal.h b/fs/fscache/internal.h index 64aa552b2..7dae569da 100644 --- a/fs/fscache/internal.h +++ b/fs/fscache/internal.h @@ -95,7 +95,6 @@ extern unsigned fscache_debug; extern struct kobject *fscache_root; extern struct workqueue_struct *fscache_object_wq; extern struct workqueue_struct *fscache_op_wq; -DECLARE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); extern unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n); diff --git a/fs/fscache/main.c b/fs/fscache/main.c index 4207f98e4..85f8cf3a3 100644 --- a/fs/fscache/main.c +++ b/fs/fscache/main.c @@ -41,8 +41,6 @@ struct kobject *fscache_root; struct workqueue_struct *fscache_object_wq; struct workqueue_struct *fscache_op_wq; -DEFINE_PER_CPU(wait_queue_head_t, fscache_object_cong_wait); - /* these values serve as lower bounds, will be adjusted in fscache_init() */ static unsigned fscache_object_max_active = 4; static unsigned fscache_op_max_active = 2; @@ -138,7 +136,6 @@ unsigned int fscache_hash(unsigned int salt, unsigned int *data, unsigned int n) static int __init fscache_init(void) { unsigned int nr_cpus = num_possible_cpus(); - unsigned int cpu; int ret; fscache_object_max_active = @@ -161,9 +158,6 @@ static int __init fscache_init(void) if (!fscache_op_wq) goto error_op_wq; - for_each_possible_cpu(cpu) - init_waitqueue_head(&per_cpu(fscache_object_cong_wait, cpu)); - ret = fscache_proc_init(); if (ret < 0) goto error_proc; diff --git a/fs/fscache/object.c b/fs/fscache/object.c index cb2146e02..fb9794dce 100644 --- a/fs/fscache/object.c +++ b/fs/fscache/object.c @@ -807,6 +807,8 @@ void fscache_object_destroy(struct fscache_object *object) } EXPORT_SYMBOL(fscache_object_destroy); +static DECLARE_WAIT_QUEUE_HEAD(fscache_object_cong_wait); + /* * enqueue an object for metadata-type processing */ @@ -815,16 +817,12 @@ void fscache_enqueue_object(struct fscache_object *object) _enter("{OBJ%x}", object->debug_id); if (fscache_get_object(object, fscache_obj_get_queue) >= 0) { - wait_queue_head_t *cong_wq = - &get_cpu_var(fscache_object_cong_wait); if (queue_work(fscache_object_wq, &object->work)) { if (fscache_object_congested()) - wake_up(cong_wq); + wake_up(&fscache_object_cong_wait); } else fscache_put_object(object, fscache_obj_put_queue); - - put_cpu_var(fscache_object_cong_wait); } } @@ -842,16 +840,15 @@ void fscache_enqueue_object(struct fscache_object *object) */ bool fscache_object_sleep_till_congested(signed long *timeoutp) { - wait_queue_head_t *cong_wq = this_cpu_ptr(&fscache_object_cong_wait); DEFINE_WAIT(wait); if (fscache_object_congested()) return true; - add_wait_queue_exclusive(cong_wq, &wait); + add_wait_queue_exclusive(&fscache_object_cong_wait, &wait); if (!fscache_object_congested()) *timeoutp = schedule_timeout(*timeoutp); - finish_wait(cong_wq, &wait); + finish_wait(&fscache_object_cong_wait, &wait); return fscache_object_congested(); } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index bc2678323..3176913fa 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -158,7 +158,7 @@ static int fuse_direntplus_link(struct file *file, struct inode *dir = d_inode(parent); struct fuse_conn *fc; struct inode *inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); if (!o->nodeid) { /* diff --git a/fs/inode.c b/fs/inode.c index 82090bfad..96ddef6c6 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -158,7 +158,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) inode->i_bdev = NULL; inode->i_cdev = NULL; inode->i_link = NULL; - inode->i_dir_seq = 0; + inode->__i_dir_seq = 0; inode->i_rdev = 0; inode->dirtied_when = 0; diff --git a/fs/namei.c b/fs/namei.c index c94a814e8..36abc074d 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -1526,7 +1526,7 @@ static struct dentry *__lookup_slow(const struct qstr *name, { struct dentry *dentry, *old; struct inode *inode = dir->d_inode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); /* Don't go there if it's already dead */ if (unlikely(IS_DEADDIR(inode))) @@ -3021,7 +3021,7 @@ static struct dentry *lookup_open(struct nameidata *nd, struct file *file, struct dentry *dentry; int error, create_error = 0; umode_t mode = op->mode; - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); if (unlikely(IS_DEADDIR(dir_inode))) return ERR_PTR(-ENOENT); diff --git a/fs/namespace.c b/fs/namespace.c index 6e76f2a72..dbd1119a5 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -14,6 +14,7 @@ #include <linux/mnt_namespace.h> #include <linux/user_namespace.h> #include <linux/namei.h> +#include <linux/delay.h> #include <linux/security.h> #include <linux/cred.h> #include <linux/idr.h> @@ -322,8 +323,11 @@ int __mnt_want_write(struct vfsmount *m) * incremented count after it has set MNT_WRITE_HOLD. */ smp_mb(); - while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) - cpu_relax(); + while (READ_ONCE(mnt->mnt.mnt_flags) & MNT_WRITE_HOLD) { + preempt_enable(); + cpu_chill(); + preempt_disable(); + } /* * After the slowpath clears MNT_WRITE_HOLD, mnt_is_readonly will * be set to match its requirements. So we must not load that until diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index 1276437b4..97637bbfe 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -484,7 +484,7 @@ void nfs_prime_dcache(struct dentry *parent, struct nfs_entry *entry, unsigned long dir_verifier) { struct qstr filename = QSTR_INIT(entry->name, entry->len); - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); struct dentry *dentry; struct dentry *alias; struct inode *inode; @@ -1670,7 +1670,7 @@ int nfs_atomic_open(struct inode *dir, struct dentry *dentry, struct file *file, unsigned open_flags, umode_t mode) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); struct nfs_open_context *ctx; struct dentry *res; struct iattr attr = { .ia_valid = ATTR_OPEN }; diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index b27ebdcce..f86c98a7e 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -13,7 +13,7 @@ #include <linux/sunrpc/clnt.h> #include <linux/nfs_fs.h> #include <linux/sched.h> -#include <linux/wait.h> +#include <linux/swait.h> #include <linux/namei.h> #include <linux/fsnotify.h> @@ -180,7 +180,7 @@ nfs_async_unlink(struct dentry *dentry, const struct qstr *name) data->cred = get_current_cred(); data->res.dir_attr = &data->dir_attr; - init_waitqueue_head(&data->wq); + init_swait_queue_head(&data->wq); status = -EBUSY; spin_lock(&dentry->d_lock); diff --git a/fs/proc/array.c b/fs/proc/array.c index 18a4588c3..decaa7768 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -384,9 +384,9 @@ static inline void task_context_switch_counts(struct seq_file *m, static void task_cpus_allowed(struct seq_file *m, struct task_struct *task) { seq_printf(m, "Cpus_allowed:\t%*pb\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); seq_printf(m, "Cpus_allowed_list:\t%*pbl\n", - cpumask_pr_args(task->cpus_ptr)); + cpumask_pr_args(&task->cpus_mask)); } static inline void task_core_dumping(struct seq_file *m, struct mm_struct *mm) diff --git a/fs/proc/base.c b/fs/proc/base.c index 2ba1313aa..fa6ae9bca 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -96,6 +96,7 @@ #include <linux/posix-timers.h> #include <linux/time_namespace.h> #include <linux/resctrl.h> +#include <linux/swait.h> #include <linux/share_pool.h> #include <trace/events/oom.h> #include "internal.h" @@ -2145,7 +2146,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, child = d_hash_and_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) goto end_instantiate; diff --git a/fs/proc/proc_sysctl.c b/fs/proc/proc_sysctl.c index ffed75f83..15f837dc0 100644 --- a/fs/proc/proc_sysctl.c +++ b/fs/proc/proc_sysctl.c @@ -683,7 +683,7 @@ static bool proc_sys_fill_cache(struct file *file, child = d_lookup(dir, &qname); if (!child) { - DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + DECLARE_SWAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); if (IS_ERR(child)) return false; diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index b1ebf7b61..b7e3a6bac 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -383,7 +383,8 @@ void pstore_record_init(struct pstore_record *record, * end of the buffer. */ static void pstore_dump(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) + enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter) { unsigned long total = 0; const char *why; @@ -435,7 +436,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, dst_size -= header_size; /* Write dump contents. */ - if (!kmsg_dump_get_buffer(dumper, true, dst + header_size, + if (!kmsg_dump_get_buffer(iter, true, dst + header_size, dst_size, &dump_size)) break; diff --git a/include/asm-generic/Kbuild b/include/asm-generic/Kbuild index d1300c6e0..267f6dfb8 100644 --- a/include/asm-generic/Kbuild +++ b/include/asm-generic/Kbuild @@ -30,7 +30,7 @@ mandatory-y += irq.h mandatory-y += irq_regs.h mandatory-y += irq_work.h mandatory-y += kdebug.h -mandatory-y += kmap_types.h +mandatory-y += kmap_size.h mandatory-y += kprobes.h mandatory-y += linkage.h mandatory-y += local.h diff --git a/include/asm-generic/hardirq.h b/include/asm-generic/hardirq.h index d14214dfc..7317e8258 100644 --- a/include/asm-generic/hardirq.h +++ b/include/asm-generic/hardirq.h @@ -7,9 +7,13 @@ typedef struct { unsigned int __softirq_pending; +#ifdef ARCH_WANTS_NMI_IRQSTAT + unsigned int __nmi_count; +#endif } ____cacheline_aligned irq_cpustat_t; -#include <linux/irq_cpustat.h> /* Standard mappings for irq_cpustat_t above */ +DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + #include <linux/irq.h> #ifndef ack_bad_irq diff --git a/include/asm-generic/kmap_size.h b/include/asm-generic/kmap_size.h new file mode 100644 index 000000000..9d6c7786a --- /dev/null +++ b/include/asm-generic/kmap_size.h @@ -0,0 +1,12 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_GENERIC_KMAP_SIZE_H +#define _ASM_GENERIC_KMAP_SIZE_H + +/* For debug this provides guard pages between the maps */ +#ifdef CONFIG_DEBUG_HIGHMEM +# define KM_MAX_IDX 33 +#else +# define KM_MAX_IDX 16 +#endif + +#endif diff --git a/include/asm-generic/kmap_types.h b/include/asm-generic/kmap_types.h deleted file mode 100644 index 9f95b7b63..000000000 --- a/include/asm-generic/kmap_types.h +++ /dev/null @@ -1,11 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _ASM_GENERIC_KMAP_TYPES_H -#define _ASM_GENERIC_KMAP_TYPES_H - -#ifdef __WITH_KM_FENCE -# define KM_TYPE_NR 41 -#else -# define KM_TYPE_NR 20 -#endif - -#endif diff --git a/include/asm-generic/preempt.h b/include/asm-generic/preempt.h index b4d43a4af..ac255e889 100644 --- a/include/asm-generic/preempt.h +++ b/include/asm-generic/preempt.h @@ -79,6 +79,9 @@ static __always_inline bool should_resched(int preempt_offset) } #ifdef CONFIG_PREEMPTION +#ifdef CONFIG_PREEMPT_RT +extern void preempt_schedule_lock(void); +#endif extern asmlinkage void preempt_schedule(void); #define __preempt_schedule() preempt_schedule() extern asmlinkage void preempt_schedule_notrace(void); diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h index 433485f8b..e41c9b283 100644 --- a/include/linux/blkdev.h +++ b/include/linux/blkdev.h @@ -162,7 +162,7 @@ struct request { */ union { struct hlist_node hash; /* merge hash */ - struct list_head ipi_list; + struct llist_node ipi_list; }; /* diff --git a/include/linux/bottom_half.h b/include/linux/bottom_half.h index a19519f42..eed86eb0a 100644 --- a/include/linux/bottom_half.h +++ b/include/linux/bottom_half.h @@ -4,7 +4,7 @@ #include <linux/preempt.h> -#ifdef CONFIG_TRACE_IRQFLAGS +#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_TRACE_IRQFLAGS) extern void __local_bh_disable_ip(unsigned long ip, unsigned int cnt); #else static __always_inline void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) @@ -32,4 +32,10 @@ static inline void local_bh_enable(void) __local_bh_enable_ip(_THIS_IP_, SOFTIRQ_DISABLE_OFFSET); } +#ifdef CONFIG_PREEMPT_RT +extern bool local_bh_blocked(void); +#else +static inline bool local_bh_blocked(void) { return false; } +#endif + #endif /* _LINUX_BH_H */ diff --git a/include/linux/console.h b/include/linux/console.h index bc2a749e6..027278792 100644 --- a/include/linux/console.h +++ b/include/linux/console.h @@ -16,6 +16,7 @@ #include <linux/atomic.h> #include <linux/types.h> +#include <linux/printk.h> struct vc_data; struct console_font_op; @@ -137,10 +138,12 @@ static inline int con_debug_leave(void) #define CON_ANYTIME (16) /* Safe to call when cpu is offline */ #define CON_BRL (32) /* Used for a braille device */ #define CON_EXTENDED (64) /* Use the extended output format a la /dev/kmsg */ +#define CON_HANDOVER (128) /* Device was previously a boot console. */ struct console { char name[16]; void (*write)(struct console *, const char *, unsigned); + void (*write_atomic)(struct console *co, const char *s, unsigned int count); int (*read)(struct console *, char *, unsigned); struct tty_driver *(*device)(struct console *, int *); void (*unblank)(void); @@ -150,6 +153,11 @@ struct console { short flags; short index; int cflag; +#ifdef CONFIG_PRINTK + char sync_buf[CONSOLE_LOG_MAX]; +#endif + atomic64_t printk_seq; + struct task_struct *thread; uint ispeed; uint ospeed; void *data; @@ -232,4 +240,7 @@ extern void console_init(void); void dummycon_register_output_notifier(struct notifier_block *nb); void dummycon_unregister_output_notifier(struct notifier_block *nb); +extern void console_atomic_lock(unsigned int *flags); +extern void console_atomic_unlock(unsigned int flags); + #endif /* _LINUX_CONSOLE_H */ diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index b98b9eb7d..c986a9543 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -155,6 +155,7 @@ enum cpuhp_state { CPUHP_AP_ONLINE, CPUHP_TEARDOWN_CPU, CPUHP_AP_ONLINE_IDLE, + CPUHP_AP_SCHED_WAIT_EMPTY, CPUHP_AP_SMPBOOT_THREADS, CPUHP_AP_X86_VDSO_VMA_ONLINE, CPUHP_AP_IRQ_AFFINITY_ONLINE, diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h index 0159986ac..c53364c42 100644 --- a/include/linux/cpumask.h +++ b/include/linux/cpumask.h @@ -199,6 +199,11 @@ static inline int cpumask_any_and_distribute(const struct cpumask *src1p, return cpumask_next_and(-1, src1p, src2p); } +static inline int cpumask_any_distribute(const struct cpumask *srcp) +{ + return cpumask_first(srcp); +} + #define for_each_cpu(cpu, mask) \ for ((cpu) = 0; (cpu) < 1; (cpu)++, (void)mask) #define for_each_cpu_not(cpu, mask) \ @@ -252,6 +257,7 @@ int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu); unsigned int cpumask_local_spread(unsigned int i, int node); int cpumask_any_and_distribute(const struct cpumask *src1p, const struct cpumask *src2p); +int cpumask_any_distribute(const struct cpumask *srcp); /** * for_each_cpu - iterate over every cpu in a mask diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 4bb8b1759..c5821c04a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -108,7 +108,7 @@ struct dentry { union { struct list_head d_lru; /* LRU list */ - wait_queue_head_t *d_wait; /* in-lookup ones only */ + struct swait_queue_head *d_wait; /* in-lookup ones only */ }; struct list_head d_child; /* child of parent list */ struct list_head d_subdirs; /* our children */ @@ -251,7 +251,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op extern struct dentry * d_alloc(struct dentry *, const struct qstr *); extern struct dentry * d_alloc_anon(struct super_block *); extern struct dentry * d_alloc_parallel(struct dentry *, const struct qstr *, - wait_queue_head_t *); + struct swait_queue_head *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_exact_alias(struct dentry *, struct inode *); diff --git a/include/linux/debug_locks.h b/include/linux/debug_locks.h index edb5c186b..3f49e6516 100644 --- a/include/linux/debug_locks.h +++ b/include/linux/debug_locks.h @@ -3,8 +3,7 @@ #define __LINUX_DEBUG_LOCKING_H #include <linux/atomic.h> -#include <linux/bug.h> -#include <linux/printk.h> +#include <linux/cache.h> struct task_struct; diff --git a/include/linux/delay.h b/include/linux/delay.h index e8607992c..cd24f34b4 100644 --- a/include/linux/delay.h +++ b/include/linux/delay.h @@ -88,4 +88,10 @@ static inline void fsleep(unsigned long usecs) msleep(DIV_ROUND_UP(usecs, 1000)); } +#ifdef CONFIG_PREEMPT_RT +extern void cpu_chill(void); +#else +# define cpu_chill() cpu_relax() +#endif + #endif /* defined(_LINUX_DELAY_H) */ diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h index d8e1c798d..d2aca09f7 100644 --- a/include/linux/entry-common.h +++ b/include/linux/entry-common.h @@ -70,7 +70,7 @@ #define EXIT_TO_USER_MODE_WORK \ (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ - _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | \ + _TIF_NEED_RESCHED_MASK | _TIF_PATCH_PENDING | \ ARCH_EXIT_TO_USER_MODE_WORK) /** diff --git a/include/linux/eventfd.h b/include/linux/eventfd.h index dc4fd8a66..836b4c021 100644 --- a/include/linux/eventfd.h +++ b/include/linux/eventfd.h @@ -14,6 +14,7 @@ #include <linux/err.h> #include <linux/percpu-defs.h> #include <linux/percpu.h> +#include <linux/sched.h> /* * CAREFUL: Check include/uapi/asm-generic/fcntl.h when defining @@ -42,11 +43,9 @@ __u64 eventfd_signal(struct eventfd_ctx *ctx, __u64 n); int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, wait_queue_entry_t *wait, __u64 *cnt); -DECLARE_PER_CPU(int, eventfd_wake_count); - -static inline bool eventfd_signal_count(void) +static inline bool eventfd_signal_allowed(void) { - return this_cpu_read(eventfd_wake_count); + return !current->in_eventfd_signal; } #else /* CONFIG_EVENTFD */ @@ -77,9 +76,9 @@ static inline int eventfd_ctx_remove_wait_queue(struct eventfd_ctx *ctx, return -ENOSYS; } -static inline bool eventfd_signal_count(void) +static inline bool eventfd_signal_allowed(void) { - return false; + return true; } #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index db6327477..9ce4c7baa 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -712,7 +712,7 @@ struct inode { struct block_device *i_bdev; struct cdev *i_cdev; char *i_link; - unsigned i_dir_seq; + unsigned __i_dir_seq; }; __u32 i_generation; diff --git a/include/linux/hardirq.h b/include/linux/hardirq.h index 754f67ac4..76878b357 100644 --- a/include/linux/hardirq.h +++ b/include/linux/hardirq.h @@ -6,6 +6,7 @@ #include <linux/preempt.h> #include <linux/lockdep.h> #include <linux/ftrace_irq.h> +#include <linux/sched.h> #include <linux/vtime.h> #include <asm/hardirq.h> @@ -32,9 +33,9 @@ static __always_inline void rcu_irq_enter_check_tick(void) */ #define __irq_enter() \ do { \ - account_irq_enter_time(current); \ preempt_count_add(HARDIRQ_OFFSET); \ lockdep_hardirq_enter(); \ + account_hardirq_enter(current); \ } while (0) /* @@ -62,8 +63,8 @@ void irq_enter_rcu(void); */ #define __irq_exit() \ do { \ + account_hardirq_exit(current); \ lockdep_hardirq_exit(); \ - account_irq_exit_time(current); \ preempt_count_sub(HARDIRQ_OFFSET); \ } while (0) @@ -115,7 +116,6 @@ extern void rcu_nmi_exit(void); do { \ lockdep_off(); \ arch_nmi_enter(); \ - printk_nmi_enter(); \ BUG_ON(in_nmi() == NMI_MASK); \ __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET); \ } while (0) @@ -134,7 +134,6 @@ extern void rcu_nmi_exit(void); do { \ BUG_ON(!in_nmi()); \ __preempt_count_sub(NMI_OFFSET + HARDIRQ_OFFSET); \ - printk_nmi_exit(); \ arch_nmi_exit(); \ lockdep_on(); \ } while (0) diff --git a/include/linux/highmem-internal.h b/include/linux/highmem-internal.h new file mode 100644 index 000000000..f9bc6acd3 --- /dev/null +++ b/include/linux/highmem-internal.h @@ -0,0 +1,222 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LINUX_HIGHMEM_INTERNAL_H +#define _LINUX_HIGHMEM_INTERNAL_H + +/* + * Outside of CONFIG_HIGHMEM to support X86 32bit iomap_atomic() cruft. + */ +#ifdef CONFIG_KMAP_LOCAL +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot); +void *__kmap_local_page_prot(struct page *page, pgprot_t prot); +void kunmap_local_indexed(void *vaddr); +void kmap_local_fork(struct task_struct *tsk); +void __kmap_local_sched_out(void); +void __kmap_local_sched_in(void); +static inline void kmap_assert_nomap(void) +{ + DEBUG_LOCKS_WARN_ON(current->kmap_ctrl.idx); +} +#else +static inline void kmap_local_fork(struct task_struct *tsk) { } +static inline void kmap_assert_nomap(void) { } +#endif + +#ifdef CONFIG_HIGHMEM +#include <asm/highmem.h> + +#ifndef ARCH_HAS_KMAP_FLUSH_TLB +static inline void kmap_flush_tlb(unsigned long addr) { } +#endif + +#ifndef kmap_prot +#define kmap_prot PAGE_KERNEL +#endif + +void *kmap_high(struct page *page); +void kunmap_high(struct page *page); +void __kmap_flush_unused(void); +struct page *__kmap_to_page(void *addr); + +static inline void *kmap(struct page *page) +{ + void *addr; + + might_sleep(); + if (!PageHighMem(page)) + addr = page_address(page); + else + addr = kmap_high(page); + kmap_flush_tlb((unsigned long)addr); + return addr; +} + +static inline void kunmap(struct page *page) +{ + might_sleep(); + if (!PageHighMem(page)) + return; + kunmap_high(page); +} + +static inline struct page *kmap_to_page(void *addr) +{ + return __kmap_to_page(addr); +} + +static inline void kmap_flush_unused(void) +{ + __kmap_flush_unused(); +} + +static inline void *kmap_local_page(struct page *page) +{ + return __kmap_local_page_prot(page, kmap_prot); +} + +static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + return __kmap_local_page_prot(page, prot); +} + +static inline void *kmap_local_pfn(unsigned long pfn) +{ + return __kmap_local_pfn_prot(pfn, kmap_prot); +} + +static inline void __kunmap_local(void *vaddr) +{ + kunmap_local_indexed(vaddr); +} + +static inline void *kmap_atomic(struct page *page) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); + return __kmap_local_page_prot(page, kmap_prot); +} + +static inline void __kunmap_atomic(void *addr) +{ + kunmap_local_indexed(addr); + pagefault_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); +} + +unsigned int __nr_free_highpages(void); +extern atomic_long_t _totalhigh_pages; + +static inline unsigned int nr_free_highpages(void) +{ + return __nr_free_highpages(); +} + +static inline unsigned long totalhigh_pages(void) +{ + return (unsigned long)atomic_long_read(&_totalhigh_pages); +} + +static inline void totalhigh_pages_inc(void) +{ + atomic_long_inc(&_totalhigh_pages); +} + +static inline void totalhigh_pages_add(long count) +{ + atomic_long_add(count, &_totalhigh_pages); +} + +#else /* CONFIG_HIGHMEM */ + +static inline struct page *kmap_to_page(void *addr) +{ + return virt_to_page(addr); +} + +static inline void *kmap(struct page *page) +{ + might_sleep(); + return page_address(page); +} + +static inline void kunmap_high(struct page *page) { } +static inline void kmap_flush_unused(void) { } + +static inline void kunmap(struct page *page) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(page_address(page)); +#endif +} + +static inline void *kmap_local_page(struct page *page) +{ + return page_address(page); +} + +static inline void *kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + return kmap_local_page(page); +} + +static inline void *kmap_local_pfn(unsigned long pfn) +{ + return kmap_local_page(pfn_to_page(pfn)); +} + +static inline void __kunmap_local(void *addr) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif +} + +static inline void *kmap_atomic(struct page *page) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_disable(); + else + preempt_disable(); + pagefault_disable(); + return page_address(page); +} + +static inline void __kunmap_atomic(void *addr) +{ +#ifdef ARCH_HAS_FLUSH_ON_KUNMAP + kunmap_flush_on_unmap(addr); +#endif + pagefault_enable(); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + migrate_enable(); + else + preempt_enable(); +} + +static inline unsigned int nr_free_highpages(void) { return 0; } +static inline unsigned long totalhigh_pages(void) { return 0UL; } + +#endif /* CONFIG_HIGHMEM */ + +/* + * Prevent people trying to call kunmap_atomic() as if it were kunmap() + * kunmap_atomic() should get the return value of kmap_atomic, not the page. + */ +#define kunmap_atomic(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_atomic(__addr); \ +} while (0) + +#define kunmap_local(__addr) \ +do { \ + BUILD_BUG_ON(__same_type((__addr), struct page *)); \ + __kunmap_local(__addr); \ +} while (0) + +#endif diff --git a/include/linux/highmem.h b/include/linux/highmem.h index 6b27af8fe..ec1edaf12 100644 --- a/include/linux/highmem.h +++ b/include/linux/highmem.h @@ -11,217 +11,137 @@ #include <asm/cacheflush.h> -#ifndef ARCH_HAS_FLUSH_ANON_PAGE -static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) -{ -} -#endif - -#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE -static inline void flush_kernel_dcache_page(struct page *page) -{ -} -static inline void flush_kernel_vmap_range(void *vaddr, int size) -{ -} -static inline void invalidate_kernel_vmap_range(void *vaddr, int size) -{ -} -#endif - -#include <asm/kmap_types.h> - -#ifdef CONFIG_HIGHMEM -extern void *kmap_atomic_high_prot(struct page *page, pgprot_t prot); -extern void kunmap_atomic_high(void *kvaddr); -#include <asm/highmem.h> - -#ifndef ARCH_HAS_KMAP_FLUSH_TLB -static inline void kmap_flush_tlb(unsigned long addr) { } -#endif - -#ifndef kmap_prot -#define kmap_prot PAGE_KERNEL -#endif - -void *kmap_high(struct page *page); -static inline void *kmap(struct page *page) -{ - void *addr; - - might_sleep(); - if (!PageHighMem(page)) - addr = page_address(page); - else - addr = kmap_high(page); - kmap_flush_tlb((unsigned long)addr); - return addr; -} - -void kunmap_high(struct page *page); - -static inline void kunmap(struct page *page) -{ - might_sleep(); - if (!PageHighMem(page)) - return; - kunmap_high(page); -} +#include "highmem-internal.h" -/* - * kmap_atomic/kunmap_atomic is significantly faster than kmap/kunmap because - * no global lock is needed and because the kmap code must perform a global TLB - * invalidation when the kmap pool wraps. +/** + * kmap - Map a page for long term usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping * - * However when holding an atomic kmap it is not legal to sleep, so atomic - * kmaps are appropriate for short, tight code paths only. + * Can only be invoked from preemptible task context because on 32bit + * systems with CONFIG_HIGHMEM enabled this function might sleep. * - * The use of kmap_atomic/kunmap_atomic is discouraged - kmap/kunmap - * gives a more generic (and caching) interface. But kmap_atomic can - * be used in IRQ contexts, so in some (very limited) cases we need - * it. + * For systems with CONFIG_HIGHMEM=n and for pages in the low memory area + * this returns the virtual address of the direct kernel mapping. + * + * The returned virtual address is globally visible and valid up to the + * point where it is unmapped via kunmap(). The pointer can be handed to + * other contexts. + * + * For highmem pages on 32bit systems this can be slow as the mapping space + * is limited and protected by a global lock. In case that there is no + * mapping slot available the function blocks until a slot is released via + * kunmap(). */ -static inline void *kmap_atomic_prot(struct page *page, pgprot_t prot) -{ - preempt_disable(); - pagefault_disable(); - if (!PageHighMem(page)) - return page_address(page); - return kmap_atomic_high_prot(page, prot); -} -#define kmap_atomic(page) kmap_atomic_prot(page, kmap_prot) +static inline void *kmap(struct page *page); -/* declarations for linux/mm/highmem.c */ -unsigned int nr_free_highpages(void); -extern atomic_long_t _totalhigh_pages; -static inline unsigned long totalhigh_pages(void) -{ - return (unsigned long)atomic_long_read(&_totalhigh_pages); -} - -static inline void totalhigh_pages_inc(void) -{ - atomic_long_inc(&_totalhigh_pages); -} - -static inline void totalhigh_pages_dec(void) -{ - atomic_long_dec(&_totalhigh_pages); -} - -static inline void totalhigh_pages_add(long count) -{ - atomic_long_add(count, &_totalhigh_pages); -} - -static inline void totalhigh_pages_set(long val) -{ - atomic_long_set(&_totalhigh_pages, val); -} - -void kmap_flush_unused(void); - -struct page *kmap_to_page(void *addr); - -#else /* CONFIG_HIGHMEM */ +/** + * kunmap - Unmap the virtual address mapped by kmap() + * @addr: Virtual address to be unmapped + * + * Counterpart to kmap(). A NOOP for CONFIG_HIGHMEM=n and for mappings of + * pages in the low memory area. + */ +static inline void kunmap(struct page *page); -static inline unsigned int nr_free_highpages(void) { return 0; } +/** + * kmap_to_page - Get the page for a kmap'ed address + * @addr: The address to look up + * + * Returns: The page which is mapped to @addr. + */ +static inline struct page *kmap_to_page(void *addr); -static inline struct page *kmap_to_page(void *addr) -{ - return virt_to_page(addr); -} +/** + * kmap_flush_unused - Flush all unused kmap mappings in order to + * remove stray mappings + */ +static inline void kmap_flush_unused(void); -static inline unsigned long totalhigh_pages(void) { return 0UL; } +/** + * kmap_local_page - Map a page for temporary usage + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Can be invoked from any context. + * + * Requires careful handling when nesting multiple mappings because the map + * management is stack based. The unmap has to be in the reverse order of + * the map operation: + * + * addr1 = kmap_local_page(page1); + * addr2 = kmap_local_page(page2); + * ... + * kunmap_local(addr2); + * kunmap_local(addr1); + * + * Unmapping addr1 before addr2 is invalid and causes malfunction. + * + * Contrary to kmap() mappings the mapping is only valid in the context of + * the caller and cannot be handed to other contexts. + * + * On CONFIG_HIGHMEM=n kernels and for low memory pages this returns the + * virtual address of the direct mapping. Only real highmem pages are + * temporarily mapped. + * + * While it is significantly faster than kmap() for the higmem case it + * comes with restrictions about the pointer validity. Only use when really + * necessary. + * + * On HIGHMEM enabled systems mapping a highmem page has the side effect of + * disabling migration in order to keep the virtual address stable across + * preemption. No caller of kmap_local_page() can rely on this side effect. + */ +static inline void *kmap_local_page(struct page *page); -static inline void *kmap(struct page *page) -{ - might_sleep(); - return page_address(page); -} +/** + * kmap_atomic - Atomically map a page for temporary usage - Deprecated! + * @page: Pointer to the page to be mapped + * + * Returns: The virtual address of the mapping + * + * Effectively a wrapper around kmap_local_page() which disables pagefaults + * and preemption. + * + * Do not use in new code. Use kmap_local_page() instead. + */ +static inline void *kmap_atomic(struct page *page); -static inline void kunmap_high(struct page *page) -{ -} +/** + * kunmap_atomic - Unmap the virtual address mapped by kmap_atomic() + * @addr: Virtual address to be unmapped + * + * Counterpart to kmap_atomic(). + * + * Effectively a wrapper around kunmap_local() which additionally undoes + * the side effects of kmap_atomic(), i.e. reenabling pagefaults and + * preemption. + */ -static inline void kunmap(struct page *page) -{ -#ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(page_address(page)); -#endif -} +/* Highmem related interfaces for management code */ +static inline unsigned int nr_free_highpages(void); +static inline unsigned long totalhigh_pages(void); -static inline void *kmap_atomic(struct page *page) +#ifndef ARCH_HAS_FLUSH_ANON_PAGE +static inline void flush_anon_page(struct vm_area_struct *vma, struct page *page, unsigned long vmaddr) { - preempt_disable(); - pagefault_disable(); - return page_address(page); } -#define kmap_atomic_prot(page, prot) kmap_atomic(page) - -static inline void kunmap_atomic_high(void *addr) -{ - /* - * Mostly nothing to do in the CONFIG_HIGHMEM=n case as kunmap_atomic() - * handles re-enabling faults + preemption - */ -#ifdef ARCH_HAS_FLUSH_ON_KUNMAP - kunmap_flush_on_unmap(addr); #endif -} - -#define kmap_atomic_pfn(pfn) kmap_atomic(pfn_to_page(pfn)) - -#define kmap_flush_unused() do {} while(0) - -#endif /* CONFIG_HIGHMEM */ -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) - -DECLARE_PER_CPU(int, __kmap_atomic_idx); - -static inline int kmap_atomic_idx_push(void) +#ifndef ARCH_HAS_FLUSH_KERNEL_DCACHE_PAGE +static inline void flush_kernel_dcache_page(struct page *page) { - int idx = __this_cpu_inc_return(__kmap_atomic_idx) - 1; - -#ifdef CONFIG_DEBUG_HIGHMEM - WARN_ON_ONCE(in_irq() && !irqs_disabled()); - BUG_ON(idx >= KM_TYPE_NR); -#endif - return idx; } - -static inline int kmap_atomic_idx(void) +static inline void flush_kernel_vmap_range(void *vaddr, int size) { - return __this_cpu_read(__kmap_atomic_idx) - 1; } - -static inline void kmap_atomic_idx_pop(void) +static inline void invalidate_kernel_vmap_range(void *vaddr, int size) { -#ifdef CONFIG_DEBUG_HIGHMEM - int idx = __this_cpu_dec_return(__kmap_atomic_idx); - - BUG_ON(idx < 0); -#else - __this_cpu_dec(__kmap_atomic_idx); -#endif } - #endif -/* - * Prevent people trying to call kunmap_atomic() as if it were kunmap() - * kunmap_atomic() should get the return value of kmap_atomic, not the page. - */ -#define kunmap_atomic(addr) \ -do { \ - BUILD_BUG_ON(__same_type((addr), struct page *)); \ - kunmap_atomic_high(addr); \ - pagefault_enable(); \ - preempt_enable(); \ -} while (0) - - /* when CONFIG_HIGHMEM is not set these will be plain clear/copy_page */ #ifndef clear_user_highpage static inline void clear_user_highpage(struct page *page, unsigned long vaddr) diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h index 22240a8c3..fc162c252 100644 --- a/include/linux/interrupt.h +++ b/include/linux/interrupt.h @@ -542,7 +542,7 @@ struct softirq_action asmlinkage void do_softirq(void); asmlinkage void __do_softirq(void); -#ifdef __ARCH_HAS_DO_SOFTIRQ +#if defined(__ARCH_HAS_DO_SOFTIRQ) && !defined(CONFIG_PREEMPT_RT) void do_softirq_own_stack(void); #else static inline void do_softirq_own_stack(void) @@ -637,26 +637,21 @@ enum TASKLET_STATE_RUN /* Tasklet is running (SMP only) */ }; -#ifdef CONFIG_SMP +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) static inline int tasklet_trylock(struct tasklet_struct *t) { return !test_and_set_bit(TASKLET_STATE_RUN, &(t)->state); } -static inline void tasklet_unlock(struct tasklet_struct *t) -{ - smp_mb__before_atomic(); - clear_bit(TASKLET_STATE_RUN, &(t)->state); -} +void tasklet_unlock(struct tasklet_struct *t); +void tasklet_unlock_wait(struct tasklet_struct *t); +void tasklet_unlock_spin_wait(struct tasklet_struct *t); -static inline void tasklet_unlock_wait(struct tasklet_struct *t) -{ - while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { barrier(); } -} #else -#define tasklet_trylock(t) 1 -#define tasklet_unlock_wait(t) do { } while (0) -#define tasklet_unlock(t) do { } while (0) +static inline int tasklet_trylock(struct tasklet_struct *t) { return 1; } +static inline void tasklet_unlock(struct tasklet_struct *t) { } +static inline void tasklet_unlock_wait(struct tasklet_struct *t) { } +static inline void tasklet_unlock_spin_wait(struct tasklet_struct *t) { } #endif extern void __tasklet_schedule(struct tasklet_struct *t); @@ -681,6 +676,17 @@ static inline void tasklet_disable_nosync(struct tasklet_struct *t) smp_mb__after_atomic(); } +/* + * Do not use in new code. Disabling tasklets from atomic contexts is + * error prone and should be avoided. + */ +static inline void tasklet_disable_in_atomic(struct tasklet_struct *t) +{ + tasklet_disable_nosync(t); + tasklet_unlock_spin_wait(t); + smp_mb(); +} + static inline void tasklet_disable(struct tasklet_struct *t) { tasklet_disable_nosync(t); diff --git a/include/linux/io-mapping.h b/include/linux/io-mapping.h index c75e4d3d8..4bb8223f2 100644 --- a/include/linux/io-mapping.h +++ b/include/linux/io-mapping.h @@ -60,22 +60,20 @@ io_mapping_fini(struct io_mapping *mapping) iomap_free(mapping->base, mapping->size); } -/* Atomic map/unmap */ +/* Temporary mappings which are only valid in the current context */ static inline void __iomem * -io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset) +io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) { resource_size_t phys_addr; BUG_ON(offset >= mapping->size); phys_addr = mapping->base + offset; - return iomap_atomic_prot_pfn(PHYS_PFN(phys_addr), mapping->prot); + return __iomap_local_pfn_prot(PHYS_PFN(phys_addr), mapping->prot); } -static inline void -io_mapping_unmap_atomic(void __iomem *vaddr) +static inline void io_mapping_unmap_local(void __iomem *vaddr) { - iounmap_atomic(vaddr); + kunmap_local_indexed((void __force *)vaddr); } static inline void __iomem * @@ -97,7 +95,7 @@ io_mapping_unmap(void __iomem *vaddr) iounmap(vaddr); } -#else +#else /* HAVE_ATOMIC_IOMAP */ #include <linux/uaccess.h> @@ -144,25 +142,19 @@ io_mapping_unmap(void __iomem *vaddr) { } -/* Atomic map/unmap */ +/* Temporary mappings which are only valid in the current context */ static inline void __iomem * -io_mapping_map_atomic_wc(struct io_mapping *mapping, - unsigned long offset) +io_mapping_map_local_wc(struct io_mapping *mapping, unsigned long offset) { - preempt_disable(); - pagefault_disable(); return io_mapping_map_wc(mapping, offset, PAGE_SIZE); } -static inline void -io_mapping_unmap_atomic(void __iomem *vaddr) +static inline void io_mapping_unmap_local(void __iomem *vaddr) { io_mapping_unmap(vaddr); - pagefault_enable(); - preempt_enable(); } -#endif /* HAVE_ATOMIC_IOMAP */ +#endif /* !HAVE_ATOMIC_IOMAP */ static inline struct io_mapping * io_mapping_create_wc(resource_size_t base, diff --git a/include/linux/irq_cpustat.h b/include/linux/irq_cpustat.h deleted file mode 100644 index 6e8895cd4..000000000 --- a/include/linux/irq_cpustat.h +++ /dev/null @@ -1,28 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef __irq_cpustat_h -#define __irq_cpustat_h - -/* - * Contains default mappings for irq_cpustat_t, used by almost every - * architecture. Some arch (like s390) have per cpu hardware pages and - * they define their own mappings for irq_stat. - * - * Keith Owens <kaos@ocs.com.au> July 2000. - */ - - -/* - * Simple wrappers reducing source bloat. Define all irq_stat fields - * here, even ones that are arch dependent. That way we get common - * definitions instead of differing sets for each arch. - */ - -#ifndef __ARCH_IRQ_STAT -DECLARE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); /* defined in asm/hardirq.h */ -#define __IRQ_STAT(cpu, member) (per_cpu(irq_stat.member, cpu)) -#endif - -/* arch dependent irq_stat fields */ -#define nmi_count(cpu) __IRQ_STAT((cpu), __nmi_count) /* i386 */ - -#endif /* __irq_cpustat_h */ diff --git a/include/linux/irq_work.h b/include/linux/irq_work.h index ec2a47a81..255d2dfec 100644 --- a/include/linux/irq_work.h +++ b/include/linux/irq_work.h @@ -3,6 +3,7 @@ #define _LINUX_IRQ_WORK_H #include <linux/smp_types.h> +#include <linux/rcuwait.h> /* * An entry can be in one of four states: @@ -16,11 +17,13 @@ struct irq_work { struct __call_single_node node; void (*func)(struct irq_work *); + struct rcuwait irqwait; }; #define __IRQ_WORK_INIT(_func, _flags) (struct irq_work){ \ .node = { .u_flags = (_flags), }, \ .func = (_func), \ + .irqwait = __RCUWAIT_INITIALIZER(irqwait), \ } #define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) @@ -30,10 +33,15 @@ struct irq_work { #define DEFINE_IRQ_WORK(name, _f) \ struct irq_work name = IRQ_WORK_INIT(_f) +#define IRQ_WORK_INIT(_func) __IRQ_WORK_INIT(_func, 0) +#define IRQ_WORK_INIT_LAZY(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_LAZY) +#define IRQ_WORK_INIT_HARD(_func) __IRQ_WORK_INIT(_func, IRQ_WORK_HARD_IRQ) + static inline void init_irq_work(struct irq_work *work, void (*func)(struct irq_work *)) { *work = IRQ_WORK_INIT(func); + rcuwait_init(&work->irqwait); } static inline bool irq_work_is_pending(struct irq_work *work) @@ -46,6 +54,11 @@ static inline bool irq_work_is_busy(struct irq_work *work) return atomic_read(&work->node.a_flags) & IRQ_WORK_BUSY; } +static inline bool irq_work_is_hard(struct irq_work *work) +{ + return atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ; +} + bool irq_work_queue(struct irq_work *work); bool irq_work_queue_on(struct irq_work *work, int cpu); diff --git a/include/linux/irqdesc.h b/include/linux/irqdesc.h index dc1b213ae..9bbcd8cba 100644 --- a/include/linux/irqdesc.h +++ b/include/linux/irqdesc.h @@ -68,6 +68,7 @@ struct irq_desc { unsigned int irqs_unhandled; atomic_t threads_handled; int threads_handled_last; + u64 random_ip; raw_spinlock_t lock; struct cpumask *percpu_enabled; const struct cpumask *percpu_affinity; diff --git a/include/linux/irqflags.h b/include/linux/irqflags.h index fef2d43a7..741aa2008 100644 --- a/include/linux/irqflags.h +++ b/include/linux/irqflags.h @@ -71,14 +71,6 @@ do { \ do { \ __this_cpu_dec(hardirq_context); \ } while (0) -# define lockdep_softirq_enter() \ -do { \ - current->softirq_context++; \ -} while (0) -# define lockdep_softirq_exit() \ -do { \ - current->softirq_context--; \ -} while (0) # define lockdep_hrtimer_enter(__hrtimer) \ ({ \ @@ -140,6 +132,21 @@ do { \ # define lockdep_irq_work_exit(__work) do { } while (0) #endif +#if defined(CONFIG_TRACE_IRQFLAGS) && !defined(CONFIG_PREEMPT_RT) +# define lockdep_softirq_enter() \ +do { \ + current->softirq_context++; \ +} while (0) +# define lockdep_softirq_exit() \ +do { \ + current->softirq_context--; \ +} while (0) + +#else +# define lockdep_softirq_enter() do { } while (0) +# define lockdep_softirq_exit() do { } while (0) +#endif + #if defined(CONFIG_IRQSOFF_TRACER) || \ defined(CONFIG_PREEMPT_TRACER) extern void stop_critical_timings(void); diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 78a0907f0..e6270bfa6 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -220,6 +220,7 @@ static __always_inline void might_resched(void) extern void ___might_sleep(const char *file, int line, int preempt_offset); extern void __might_sleep(const char *file, int line, int preempt_offset); extern void __cant_sleep(const char *file, int line, int preempt_offset); +extern void __cant_migrate(const char *file, int line); /** * might_sleep - annotation for functions that can sleep @@ -235,6 +236,10 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); */ # define might_sleep() \ do { __might_sleep(__FILE__, __LINE__, 0); might_resched(); } while (0) + +# define might_sleep_no_state_check() \ + do { ___might_sleep(__FILE__, __LINE__, 0); } while (0) + /** * cant_sleep - annotation for functions that cannot sleep * @@ -243,6 +248,18 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); # define cant_sleep() \ do { __cant_sleep(__FILE__, __LINE__, 0); } while (0) # define sched_annotate_sleep() (current->task_state_change = 0) + +/** + * cant_migrate - annotation for functions that cannot migrate + * + * Will print a stack trace if executed in code which is migratable + */ +# define cant_migrate() \ + do { \ + if (IS_ENABLED(CONFIG_SMP)) \ + __cant_migrate(__FILE__, __LINE__); \ + } while (0) + /** * non_block_start - annotate the start of section where sleeping is prohibited * @@ -266,7 +283,9 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); static inline void __might_sleep(const char *file, int line, int preempt_offset) { } # define might_sleep() do { might_resched(); } while (0) +# define might_sleep_no_state_check() do { might_resched(); } while (0) # define cant_sleep() do { } while (0) +# define cant_migrate() do { } while (0) # define sched_annotate_sleep() do { } while (0) # define non_block_start() do { } while (0) # define non_block_end() do { } while (0) @@ -274,13 +293,6 @@ extern void __cant_sleep(const char *file, int line, int preempt_offset); #define might_sleep_if(cond) do { if (cond) might_sleep(); } while (0) -#ifndef CONFIG_PREEMPT_RT -# define cant_migrate() cant_sleep() -#else - /* Placeholder for now */ -# define cant_migrate() do { } while (0) -#endif - /** * abs - return absolute value of an argument * @x: the value. If it is unsigned type, it is converted to signed type first. diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 3378bcbe5..86673930c 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -29,6 +29,18 @@ enum kmsg_dump_reason { KMSG_DUMP_MAX }; +/** + * struct kmsg_dumper_iter - iterator for kernel crash message dumper + * @active: Flag that specifies if this is currently dumping + * @cur_seq: Points to the oldest message to dump (private) + * @next_seq: Points after the newest message to dump (private) + */ +struct kmsg_dumper_iter { + bool active; + u64 cur_seq; + u64 next_seq; +}; + /** * struct kmsg_dumper - kernel crash message dumper structure * @list: Entry in the dumper list (private) @@ -39,33 +51,22 @@ enum kmsg_dump_reason { */ struct kmsg_dumper { struct list_head list; - void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); + void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason, + struct kmsg_dumper_iter *iter); enum kmsg_dump_reason max_reason; - bool active; bool registered; - - /* private state of the kmsg iterator */ - u32 cur_idx; - u32 next_idx; - u64 cur_seq; - u64 next_seq; }; #ifdef CONFIG_PRINTK void kmsg_dump(enum kmsg_dump_reason reason); -bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len); - -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, +bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, char *line, size_t size, size_t *len); -bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, - char *buf, size_t size, size_t *len); - -void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper); +bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out); -void kmsg_dump_rewind(struct kmsg_dumper *dumper); +void kmsg_dump_rewind(struct kmsg_dumper_iter *iter); int kmsg_dump_register(struct kmsg_dumper *dumper); @@ -77,30 +78,19 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) { } -static inline bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, - bool syslog, const char *line, - size_t size, size_t *len) -{ - return false; -} - -static inline bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, +static inline bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, const char *line, size_t size, size_t *len) { return false; } -static inline bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, +static inline bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, char *buf, size_t size, size_t *len) { return false; } -static inline void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) -{ -} - -static inline void kmsg_dump_rewind(struct kmsg_dumper *dumper) +static inline void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) { } diff --git a/include/linux/local_lock_internal.h b/include/linux/local_lock_internal.h index 3f02b8186..1b8ae0349 100644 --- a/include/linux/local_lock_internal.h +++ b/include/linux/local_lock_internal.h @@ -7,13 +7,39 @@ #include <linux/lockdep.h> typedef struct { -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifdef CONFIG_PREEMPT_RT + spinlock_t lock; + struct task_struct *owner; + int nestcnt; + +#elif defined(CONFIG_DEBUG_LOCK_ALLOC) struct lockdep_map dep_map; struct task_struct *owner; #endif } local_lock_t; -#ifdef CONFIG_DEBUG_LOCK_ALLOC +#ifdef CONFIG_PREEMPT_RT + +#define INIT_LOCAL_LOCK(lockname) { \ + __SPIN_LOCK_UNLOCKED((lockname).lock), \ + .owner = NULL, \ + .nestcnt = 0, \ + } + +static inline void ___local_lock_init(local_lock_t *l) +{ + l->owner = NULL; + l->nestcnt = 0; +} + +#define __local_lock_init(l) \ +do { \ + spin_lock_init(&(l)->lock); \ + ___local_lock_init(l); \ +} while (0) + +#elif defined(CONFIG_DEBUG_LOCK_ALLOC) + # define LOCAL_LOCK_DEBUG_INIT(lockname) \ .dep_map = { \ .name = #lockname, \ @@ -21,7 +47,33 @@ typedef struct { .lock_type = LD_LOCK_PERCPU, \ }, \ .owner = NULL, +#endif + +#ifdef CONFIG_PREEMPT_RT +static inline void local_lock_acquire(local_lock_t *l) +{ + if (l->owner != current) { + spin_lock(&l->lock); + DEBUG_LOCKS_WARN_ON(l->owner); + DEBUG_LOCKS_WARN_ON(l->nestcnt); + l->owner = current; + } + l->nestcnt++; +} + +static inline void local_lock_release(local_lock_t *l) +{ + DEBUG_LOCKS_WARN_ON(l->nestcnt == 0); + DEBUG_LOCKS_WARN_ON(l->owner != current); + if (--l->nestcnt) + return; + + l->owner = NULL; + spin_unlock(&l->lock); +} + +#elif defined(CONFIG_DEBUG_LOCK_ALLOC) static inline void local_lock_acquire(local_lock_t *l) { lock_map_acquire(&l->dep_map); @@ -47,6 +99,47 @@ static inline void local_lock_release(local_lock_t *l) { } static inline void local_lock_debug_init(local_lock_t *l) { } #endif /* !CONFIG_DEBUG_LOCK_ALLOC */ +#ifdef CONFIG_PREEMPT_RT + +#define __local_lock(lock) \ + do { \ + migrate_disable(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + migrate_enable(); \ + } while (0) + +#define __local_lock_irq(lock) \ + do { \ + migrate_disable(); \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_lock_irqsave(lock, flags) \ + do { \ + migrate_disable(); \ + flags = 0; \ + local_lock_acquire(this_cpu_ptr(lock)); \ + } while (0) + +#define __local_unlock_irq(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + migrate_enable(); \ + } while (0) + +#define __local_unlock_irqrestore(lock, flags) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + migrate_enable(); \ + } while (0) + +#else + #define INIT_LOCAL_LOCK(lockname) { LOCAL_LOCK_DEBUG_INIT(lockname) } #define __local_lock_init(lock) \ @@ -66,6 +159,12 @@ do { \ local_lock_acquire(this_cpu_ptr(lock)); \ } while (0) +#define __local_unlock(lock) \ + do { \ + local_lock_release(this_cpu_ptr(lock)); \ + preempt_enable(); \ + } while (0) + #define __local_lock_irq(lock) \ do { \ local_irq_disable(); \ @@ -78,12 +177,6 @@ do { \ local_lock_acquire(this_cpu_ptr(lock)); \ } while (0) -#define __local_unlock(lock) \ - do { \ - local_lock_release(this_cpu_ptr(lock)); \ - preempt_enable(); \ - } while (0) - #define __local_unlock_irq(lock) \ do { \ local_lock_release(this_cpu_ptr(lock)); \ @@ -95,3 +188,5 @@ do { \ local_lock_release(this_cpu_ptr(lock)); \ local_irq_restore(flags); \ } while (0) + +#endif diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 1c22e294f..41aed4e91 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -13,6 +13,7 @@ #include <linux/completion.h> #include <linux/cpumask.h> #include <linux/uprobes.h> +#include <linux/rcupdate.h> #include <linux/page-flags-layout.h> #include <linux/workqueue.h> #include <linux/seqlock.h> @@ -577,6 +578,9 @@ struct mm_struct { bool tlb_flush_batched; #endif struct uprobes_state uprobes_state; +#ifdef CONFIG_PREEMPT_RT + struct rcu_head delayed_drop; +#endif #ifdef CONFIG_HUGETLB_PAGE atomic_long_t hugetlb_usage; #endif diff --git a/include/linux/mutex.h b/include/linux/mutex.h index 4d671fba3..90923d300 100644 --- a/include/linux/mutex.h +++ b/include/linux/mutex.h @@ -22,6 +22,20 @@ struct ww_acquire_ctx; +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ + , .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_SLEEP, \ + } +#else +# define __DEP_MAP_MUTEX_INITIALIZER(lockname) +#endif + +#ifdef CONFIG_PREEMPT_RT +# include <linux/mutex_rt.h> +#else + /* * Simple, straightforward mutexes with strict semantics: * @@ -68,14 +82,6 @@ struct mutex { struct ww_class; struct ww_acquire_ctx; -struct ww_mutex { - struct mutex base; - struct ww_acquire_ctx *ctx; -#ifdef CONFIG_DEBUG_MUTEXES - struct ww_class *ww_class; -#endif -}; - /* * This is the control structure for tasks blocked on mutex, * which resides on the blocked task's kernel stack: @@ -119,16 +125,6 @@ do { \ __mutex_init((mutex), #mutex, &__key); \ } while (0) -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) \ - , .dep_map = { \ - .name = #lockname, \ - .wait_type_inner = LD_WAIT_SLEEP, \ - } -#else -# define __DEP_MAP_MUTEX_INITIALIZER(lockname) -#endif - #define __MUTEX_INITIALIZER(lockname) \ { .owner = ATOMIC_LONG_INIT(0) \ , .wait_lock = __SPIN_LOCK_UNLOCKED(lockname.wait_lock) \ @@ -224,4 +220,6 @@ enum mutex_trylock_recursive_enum { extern /* __deprecated */ __must_check enum mutex_trylock_recursive_enum mutex_trylock_recursive(struct mutex *lock); +#endif /* !PREEMPT_RT */ + #endif /* __LINUX_MUTEX_H */ diff --git a/include/linux/mutex_rt.h b/include/linux/mutex_rt.h new file mode 100644 index 000000000..f0b2e07cd --- /dev/null +++ b/include/linux/mutex_rt.h @@ -0,0 +1,130 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_MUTEX_RT_H +#define __LINUX_MUTEX_RT_H + +#ifndef __LINUX_MUTEX_H +#error "Please include mutex.h" +#endif + +#include <linux/rtmutex.h> + +/* FIXME: Just for __lockfunc */ +#include <linux/spinlock.h> + +struct mutex { + struct rt_mutex lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __MUTEX_INITIALIZER(mutexname) \ + { \ + .lock = __RT_MUTEX_INITIALIZER(mutexname.lock) \ + __DEP_MAP_MUTEX_INITIALIZER(mutexname) \ + } + +#define DEFINE_MUTEX(mutexname) \ + struct mutex mutexname = __MUTEX_INITIALIZER(mutexname) + +extern void __mutex_do_init(struct mutex *lock, const char *name, struct lock_class_key *key); +extern void __lockfunc _mutex_lock(struct mutex *lock); +extern void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_interruptible(struct mutex *lock); +extern int __lockfunc _mutex_lock_killable(struct mutex *lock); +extern void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass); +extern void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest_lock); +extern int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass); +extern int __lockfunc _mutex_trylock(struct mutex *lock); +extern void __lockfunc _mutex_unlock(struct mutex *lock); + +#define mutex_is_locked(l) rt_mutex_is_locked(&(l)->lock) +#define mutex_lock(l) _mutex_lock(l) +#define mutex_lock_interruptible(l) _mutex_lock_interruptible(l) +#define mutex_lock_killable(l) _mutex_lock_killable(l) +#define mutex_trylock(l) _mutex_trylock(l) +#define mutex_unlock(l) _mutex_unlock(l) +#define mutex_lock_io(l) _mutex_lock_io_nested(l, 0); + +#define __mutex_owner(l) ((l)->lock.owner) + +#ifdef CONFIG_DEBUG_MUTEXES +#define mutex_destroy(l) rt_mutex_destroy(&(l)->lock) +#else +static inline void mutex_destroy(struct mutex *lock) {} +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define mutex_lock_nested(l, s) _mutex_lock_nested(l, s) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible_nested(l, s) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable_nested(l, s) +# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) + +# define mutex_lock_nest_lock(lock, nest_lock) \ +do { \ + typecheck(struct lockdep_map *, &(nest_lock)->dep_map); \ + _mutex_lock_nest_lock(lock, &(nest_lock)->dep_map); \ +} while (0) + +#else +# define mutex_lock_nested(l, s) _mutex_lock(l) +# define mutex_lock_interruptible_nested(l, s) \ + _mutex_lock_interruptible(l) +# define mutex_lock_killable_nested(l, s) \ + _mutex_lock_killable(l) +# define mutex_lock_nest_lock(lock, nest_lock) mutex_lock(lock) +# define mutex_lock_io_nested(l, s) _mutex_lock_io_nested(l, s) +#endif + +# define mutex_init(mutex) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), #mutex, &__key); \ +} while (0) + +# define __mutex_init(mutex, name, key) \ +do { \ + rt_mutex_init(&(mutex)->lock); \ + __mutex_do_init((mutex), name, key); \ +} while (0) + +/** + * These values are chosen such that FAIL and SUCCESS match the + * values of the regular mutex_trylock(). + */ +enum mutex_trylock_recursive_enum { + MUTEX_TRYLOCK_FAILED = 0, + MUTEX_TRYLOCK_SUCCESS = 1, + MUTEX_TRYLOCK_RECURSIVE, +}; +/** + * mutex_trylock_recursive - trylock variant that allows recursive locking + * @lock: mutex to be locked + * + * This function should not be used, _ever_. It is purely for hysterical GEM + * raisins, and once those are gone this will be removed. + * + * Returns: + * MUTEX_TRYLOCK_FAILED - trylock failed, + * MUTEX_TRYLOCK_SUCCESS - lock acquired, + * MUTEX_TRYLOCK_RECURSIVE - we already owned the lock. + */ +int __rt_mutex_owner_current(struct rt_mutex *lock); + +static inline /* __deprecated */ __must_check enum mutex_trylock_recursive_enum +mutex_trylock_recursive(struct mutex *lock) +{ + if (unlikely(__rt_mutex_owner_current(&lock->lock))) + return MUTEX_TRYLOCK_RECURSIVE; + + return mutex_trylock(lock); +} + +extern int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock); + +#endif diff --git a/include/linux/nfs_xdr.h b/include/linux/nfs_xdr.h index 5491ad5f4..cd9e5b3f1 100644 --- a/include/linux/nfs_xdr.h +++ b/include/linux/nfs_xdr.h @@ -1675,7 +1675,7 @@ struct nfs_unlinkdata { struct nfs_removeargs args; struct nfs_removeres res; struct dentry *dentry; - wait_queue_head_t wq; + struct swait_queue_head wq; const struct cred *cred; struct nfs_fattr dir_attr; long timeout; diff --git a/include/linux/notifier.h b/include/linux/notifier.h index 2fb373a5c..723bc2df6 100644 --- a/include/linux/notifier.h +++ b/include/linux/notifier.h @@ -58,7 +58,7 @@ struct notifier_block { }; struct atomic_notifier_head { - spinlock_t lock; + raw_spinlock_t lock; struct notifier_block __rcu *head; }; @@ -78,7 +78,7 @@ struct srcu_notifier_head { }; #define ATOMIC_INIT_NOTIFIER_HEAD(name) do { \ - spin_lock_init(&(name)->lock); \ + raw_spin_lock_init(&(name)->lock); \ (name)->head = NULL; \ } while (0) #define BLOCKING_INIT_NOTIFIER_HEAD(name) do { \ @@ -95,7 +95,7 @@ extern void srcu_init_notifier_head(struct srcu_notifier_head *nh); cleanup_srcu_struct(&(name)->srcu); #define ATOMIC_NOTIFIER_INIT(name) { \ - .lock = __SPIN_LOCK_UNLOCKED(name.lock), \ + .lock = __RAW_SPIN_LOCK_UNLOCKED(name.lock), \ .head = NULL } #define BLOCKING_NOTIFIER_INIT(name) { \ .rwsem = __RWSEM_INITIALIZER((name).rwsem), \ diff --git a/include/linux/pid.h b/include/linux/pid.h index 34afff2dc..514dd026c 100644 --- a/include/linux/pid.h +++ b/include/linux/pid.h @@ -3,6 +3,7 @@ #define _LINUX_PID_H #include <linux/rculist.h> +#include <linux/atomic.h> #include <linux/wait.h> #include <linux/refcount.h> diff --git a/include/linux/preempt.h b/include/linux/preempt.h index 7d9c1c0e1..7b5b2ed55 100644 --- a/include/linux/preempt.h +++ b/include/linux/preempt.h @@ -77,31 +77,37 @@ /* preempt_count() and related functions, depends on PREEMPT_NEED_RESCHED */ #include <asm/preempt.h> +#define nmi_count() (preempt_count() & NMI_MASK) #define hardirq_count() (preempt_count() & HARDIRQ_MASK) -#define softirq_count() (preempt_count() & SOFTIRQ_MASK) -#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \ - | NMI_MASK)) +#ifdef CONFIG_PREEMPT_RT +# define softirq_count() (current->softirq_disable_cnt & SOFTIRQ_MASK) +#else +# define softirq_count() (preempt_count() & SOFTIRQ_MASK) +#endif +#define irq_count() (nmi_count() | hardirq_count() | softirq_count()) /* - * Are we doing bottom half or hardware interrupt processing? + * Macros to retrieve the current execution context: * - * in_irq() - We're in (hard) IRQ context + * in_nmi() - We're in NMI context + * in_hardirq() - We're in hard IRQ context + * in_serving_softirq() - We're in softirq context + * in_task() - We're in task context + */ +#define in_nmi() (nmi_count()) +#define in_hardirq() (hardirq_count()) +#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) +#define in_task() (!(in_nmi() | in_hardirq() | in_serving_softirq())) + +/* + * The following macros are deprecated and should not be used in new code: + * in_irq() - Obsolete version of in_hardirq() * in_softirq() - We have BH disabled, or are processing softirqs * in_interrupt() - We're in NMI,IRQ,SoftIRQ context or have BH disabled - * in_serving_softirq() - We're in softirq context - * in_nmi() - We're in NMI context - * in_task() - We're in task context - * - * Note: due to the BH disabled confusion: in_softirq(),in_interrupt() really - * should not be used in new code. */ #define in_irq() (hardirq_count()) #define in_softirq() (softirq_count()) #define in_interrupt() (irq_count()) -#define in_serving_softirq() (softirq_count() & SOFTIRQ_OFFSET) -#define in_nmi() (preempt_count() & NMI_MASK) -#define in_task() (!(preempt_count() & \ - (NMI_MASK | HARDIRQ_MASK | SOFTIRQ_OFFSET))) /* * The preempt_count offset after preempt_disable(); @@ -115,7 +121,11 @@ /* * The preempt_count offset after spin_lock() */ +#if !defined(CONFIG_PREEMPT_RT) #define PREEMPT_LOCK_OFFSET PREEMPT_DISABLE_OFFSET +#else +#define PREEMPT_LOCK_OFFSET 0 +#endif /* * The preempt_count offset needed for things like: @@ -164,6 +174,20 @@ extern void preempt_count_sub(int val); #define preempt_count_inc() preempt_count_add(1) #define preempt_count_dec() preempt_count_sub(1) +#ifdef CONFIG_PREEMPT_LAZY +#define add_preempt_lazy_count(val) do { preempt_lazy_count() += (val); } while (0) +#define sub_preempt_lazy_count(val) do { preempt_lazy_count() -= (val); } while (0) +#define inc_preempt_lazy_count() add_preempt_lazy_count(1) +#define dec_preempt_lazy_count() sub_preempt_lazy_count(1) +#define preempt_lazy_count() (current_thread_info()->preempt_lazy_count) +#else +#define add_preempt_lazy_count(val) do { } while (0) +#define sub_preempt_lazy_count(val) do { } while (0) +#define inc_preempt_lazy_count() do { } while (0) +#define dec_preempt_lazy_count() do { } while (0) +#define preempt_lazy_count() (0) +#endif + #ifdef CONFIG_PREEMPT_COUNT #define preempt_disable() \ @@ -172,13 +196,25 @@ do { \ barrier(); \ } while (0) +#define preempt_lazy_disable() \ +do { \ + inc_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define sched_preempt_enable_no_resched() \ do { \ barrier(); \ preempt_count_dec(); \ } while (0) -#define preempt_enable_no_resched() sched_preempt_enable_no_resched() +#ifndef CONFIG_PREEMPT_RT +# define preempt_enable_no_resched() sched_preempt_enable_no_resched() +# define preempt_check_resched_rt() barrier(); +#else +# define preempt_enable_no_resched() preempt_enable() +# define preempt_check_resched_rt() preempt_check_resched() +#endif #define preemptible() (preempt_count() == 0 && !irqs_disabled()) @@ -203,6 +239,18 @@ do { \ __preempt_schedule(); \ } while (0) +/* + * open code preempt_check_resched() because it is not exported to modules and + * used by local_unlock() or bpf_enable_instrumentation(). + */ +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ + if (should_resched(0)) \ + __preempt_schedule(); \ +} while (0) + #else /* !CONFIG_PREEMPTION */ #define preempt_enable() \ do { \ @@ -210,6 +258,12 @@ do { \ preempt_count_dec(); \ } while (0) +#define preempt_lazy_enable() \ +do { \ + dec_preempt_lazy_count(); \ + barrier(); \ +} while (0) + #define preempt_enable_notrace() \ do { \ barrier(); \ @@ -248,8 +302,12 @@ do { \ #define preempt_disable_notrace() barrier() #define preempt_enable_no_resched_notrace() barrier() #define preempt_enable_notrace() barrier() +#define preempt_check_resched_rt() barrier() #define preemptible() 0 +#define preempt_lazy_disable() barrier() +#define preempt_lazy_enable() barrier() + #endif /* CONFIG_PREEMPT_COUNT */ #ifdef MODULE @@ -268,10 +326,22 @@ do { \ } while (0) #define preempt_fold_need_resched() \ do { \ - if (tif_need_resched()) \ + if (tif_need_resched_now()) \ set_preempt_need_resched(); \ } while (0) +#ifdef CONFIG_PREEMPT_RT +# define preempt_disable_rt() preempt_disable() +# define preempt_enable_rt() preempt_enable() +# define preempt_disable_nort() barrier() +# define preempt_enable_nort() barrier() +#else +# define preempt_disable_rt() barrier() +# define preempt_enable_rt() barrier() +# define preempt_disable_nort() preempt_disable() +# define preempt_enable_nort() preempt_enable() +#endif + #ifdef CONFIG_PREEMPT_NOTIFIERS struct preempt_notifier; @@ -322,34 +392,78 @@ static inline void preempt_notifier_init(struct preempt_notifier *notifier, #endif -/** - * migrate_disable - Prevent migration of the current task +#ifdef CONFIG_SMP + +/* + * Migrate-Disable and why it is undesired. * - * Maps to preempt_disable() which also disables preemption. Use - * migrate_disable() to annotate that the intent is to prevent migration, - * but not necessarily preemption. + * When a preempted task becomes elegible to run under the ideal model (IOW it + * becomes one of the M highest priority tasks), it might still have to wait + * for the preemptee's migrate_disable() section to complete. Thereby suffering + * a reduction in bandwidth in the exact duration of the migrate_disable() + * section. * - * Can be invoked nested like preempt_disable() and needs the corresponding - * number of migrate_enable() invocations. - */ -static __always_inline void migrate_disable(void) -{ - preempt_disable(); -} - -/** - * migrate_enable - Allow migration of the current task + * Per this argument, the change from preempt_disable() to migrate_disable() + * gets us: + * + * - a higher priority tasks gains reduced wake-up latency; with preempt_disable() + * it would have had to wait for the lower priority task. + * + * - a lower priority tasks; which under preempt_disable() could've instantly + * migrated away when another CPU becomes available, is now constrained + * by the ability to push the higher priority task away, which might itself be + * in a migrate_disable() section, reducing it's available bandwidth. + * + * IOW it trades latency / moves the interference term, but it stays in the + * system, and as long as it remains unbounded, the system is not fully + * deterministic. * - * Counterpart to migrate_disable(). * - * As migrate_disable() can be invoked nested, only the outermost invocation - * reenables migration. + * The reason we have it anyway. + * + * PREEMPT_RT breaks a number of assumptions traditionally held. By forcing a + * number of primitives into becoming preemptible, they would also allow + * migration. This turns out to break a bunch of per-cpu usage. To this end, + * all these primitives employ migirate_disable() to restore this implicit + * assumption. + * + * This is a 'temporary' work-around at best. The correct solution is getting + * rid of the above assumptions and reworking the code to employ explicit + * per-cpu locking or short preempt-disable regions. + * + * The end goal must be to get rid of migrate_disable(), alternatively we need + * a schedulability theory that does not depend on abritrary migration. + * + * + * Notes on the implementation. + * + * The implementation is particularly tricky since existing code patterns + * dictate neither migrate_disable() nor migrate_enable() is allowed to block. + * This means that it cannot use cpus_read_lock() to serialize against hotplug, + * nor can it easily migrate itself into a pending affinity mask change on + * migrate_enable(). + * + * + * Note: even non-work-conserving schedulers like semi-partitioned depends on + * migration, so migrate_disable() is not only a problem for + * work-conserving schedulers. * - * Currently mapped to preempt_enable(). */ -static __always_inline void migrate_enable(void) +extern void migrate_disable(void); +extern void migrate_enable(void); + +#else + +static inline void migrate_disable(void) { - preempt_enable(); + preempt_lazy_disable(); } +static inline void migrate_enable(void) +{ + preempt_lazy_enable(); +} + +#endif /* CONFIG_SMP */ + #endif /* __LINUX_PREEMPT_H */ diff --git a/include/linux/printk.h b/include/linux/printk.h index 7d787f91d..9331b131b 100644 --- a/include/linux/printk.h +++ b/include/linux/printk.h @@ -46,6 +46,12 @@ static inline const char *printk_skip_headers(const char *buffer) #define CONSOLE_EXT_LOG_MAX 8192 +/* + * The maximum size of a record formatted for console printing + * (i.e. with the prefix prepended to every line). + */ +#define CONSOLE_LOG_MAX 4096 + /* printk's without a loglevel use this.. */ #define MESSAGE_LOGLEVEL_DEFAULT CONFIG_MESSAGE_LOGLEVEL_DEFAULT @@ -149,18 +155,6 @@ static inline __printf(1, 2) __cold void early_printk(const char *s, ...) { } #endif -#ifdef CONFIG_PRINTK_NMI -extern void printk_nmi_enter(void); -extern void printk_nmi_exit(void); -extern void printk_nmi_direct_enter(void); -extern void printk_nmi_direct_exit(void); -#else -static inline void printk_nmi_enter(void) { } -static inline void printk_nmi_exit(void) { } -static inline void printk_nmi_direct_enter(void) { } -static inline void printk_nmi_direct_exit(void) { } -#endif /* PRINTK_NMI */ - #ifdef CONFIG_PRINTK extern void printk_safe_enter(void); extern void printk_safe_exit(void); @@ -247,8 +241,6 @@ __printf(1, 2) void dump_stack_set_arch_desc(const char *fmt, ...); void dump_stack_print_info(const char *log_lvl); void show_regs_print_info(const char *log_lvl); extern asmlinkage void dump_stack(void) __cold; -extern void printk_safe_flush(void); -extern void printk_safe_flush_on_panic(void); #if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) extern void zap_locks(void); #else @@ -318,14 +310,6 @@ static inline void dump_stack(void) { } -static inline void printk_safe_flush(void) -{ -} - -static inline void printk_safe_flush_on_panic(void) -{ -} - static inline void zap_locks(void) { } @@ -546,6 +530,8 @@ extern int kptr_restrict; no_printk(KERN_DEBUG pr_fmt(fmt), ##__VA_ARGS__) #endif +bool pr_flush(int timeout_ms, bool reset_on_progress); + /* * ratelimited messages with local ratelimit_state, * no local ratelimit_state used in the !PRINTK case diff --git a/include/linux/random.h b/include/linux/random.h index f45b8be3e..0e41d0527 100644 --- a/include/linux/random.h +++ b/include/linux/random.h @@ -35,7 +35,7 @@ static inline void add_latent_entropy(void) {} extern void add_input_randomness(unsigned int type, unsigned int code, unsigned int value) __latent_entropy; -extern void add_interrupt_randomness(int irq, int irq_flags) __latent_entropy; +extern void add_interrupt_randomness(int irq, int irq_flags, __u64 ip) __latent_entropy; extern void get_random_bytes(void *buf, int nbytes); extern int wait_for_random_bytes(void); diff --git a/include/linux/rbtree.h b/include/linux/rbtree.h index d7db17996..c33b0e16d 100644 --- a/include/linux/rbtree.h +++ b/include/linux/rbtree.h @@ -19,19 +19,9 @@ #include <linux/kernel.h> #include <linux/stddef.h> +#include <linux/rbtree_type.h> #include <linux/rcupdate.h> -struct rb_node { - unsigned long __rb_parent_color; - struct rb_node *rb_right; - struct rb_node *rb_left; -} __attribute__((aligned(sizeof(long)))); - /* The alignment might seem pointless, but allegedly CRIS needs it */ - -struct rb_root { - struct rb_node *rb_node; -}; - #define rb_parent(r) ((struct rb_node *)((r)->__rb_parent_color & ~3)) #define RB_ROOT (struct rb_root) { NULL, } @@ -112,21 +102,6 @@ static inline void rb_link_node_rcu(struct rb_node *node, struct rb_node *parent typeof(*pos), field); 1; }); \ pos = n) -/* - * Leftmost-cached rbtrees. - * - * We do not cache the rightmost node based on footprint - * size vs number of potential users that could benefit - * from O(1) rb_last(). Just not worth it, users that want - * this feature can always implement the logic explicitly. - * Furthermore, users that want to cache both pointers may - * find it a bit asymmetric, but that's ok. - */ -struct rb_root_cached { - struct rb_root rb_root; - struct rb_node *rb_leftmost; -}; - #define RB_ROOT_CACHED (struct rb_root_cached) { {NULL, }, NULL } /* Same as rb_first(), but O(1) */ diff --git a/include/linux/rbtree_type.h b/include/linux/rbtree_type.h new file mode 100644 index 000000000..77a89dd2c --- /dev/null +++ b/include/linux/rbtree_type.h @@ -0,0 +1,31 @@ +/* SPDX-License-Identifier: GPL-2.0-or-later */ +#ifndef _LINUX_RBTREE_TYPE_H +#define _LINUX_RBTREE_TYPE_H + +struct rb_node { + unsigned long __rb_parent_color; + struct rb_node *rb_right; + struct rb_node *rb_left; +} __attribute__((aligned(sizeof(long)))); +/* The alignment might seem pointless, but allegedly CRIS needs it */ + +struct rb_root { + struct rb_node *rb_node; +}; + +/* + * Leftmost-cached rbtrees. + * + * We do not cache the rightmost node based on footprint + * size vs number of potential users that could benefit + * from O(1) rb_last(). Just not worth it, users that want + * this feature can always implement the logic explicitly. + * Furthermore, users that want to cache both pointers may + * find it a bit asymmetric, but that's ok. + */ +struct rb_root_cached { + struct rb_root rb_root; + struct rb_node *rb_leftmost; +}; + +#endif diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h index 095b3b39b..1effcae06 100644 --- a/include/linux/rcupdate.h +++ b/include/linux/rcupdate.h @@ -54,6 +54,11 @@ void __rcu_read_unlock(void); * types of kernel builds, the rcu_read_lock() nesting depth is unknowable. */ #define rcu_preempt_depth() (current->rcu_read_lock_nesting) +#ifndef CONFIG_PREEMPT_RT +#define sched_rcu_preempt_depth() rcu_preempt_depth() +#else +static inline int sched_rcu_preempt_depth(void) { return 0; } +#endif #else /* #ifdef CONFIG_PREEMPT_RCU */ @@ -79,6 +84,8 @@ static inline int rcu_preempt_depth(void) return 0; } +#define sched_rcu_preempt_depth() rcu_preempt_depth() + #endif /* #else #ifdef CONFIG_PREEMPT_RCU */ /* Internal to kernel */ @@ -329,7 +336,8 @@ static inline void rcu_preempt_sleep_check(void) { } #define rcu_sleep_check() \ do { \ rcu_preempt_sleep_check(); \ - RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) \ + RCU_LOCKDEP_WARN(lock_is_held(&rcu_bh_lock_map), \ "Illegal context switch in RCU-bh read-side critical section"); \ RCU_LOCKDEP_WARN(lock_is_held(&rcu_sched_lock_map), \ "Illegal context switch in RCU-sched read-side critical section"); \ diff --git a/include/linux/rtmutex.h b/include/linux/rtmutex.h index 6fd615a0e..b02009f53 100644 --- a/include/linux/rtmutex.h +++ b/include/linux/rtmutex.h @@ -14,11 +14,15 @@ #define __LINUX_RT_MUTEX_H #include <linux/linkage.h> -#include <linux/rbtree.h> -#include <linux/spinlock_types.h> +#include <linux/rbtree_type.h> +#include <linux/spinlock_types_raw.h> extern int max_lock_depth; /* for sysctl */ +#ifdef CONFIG_DEBUG_MUTEXES +#include <linux/debug_locks.h> +#endif + /** * The rt_mutex structure * @@ -31,12 +35,7 @@ struct rt_mutex { raw_spinlock_t wait_lock; struct rb_root_cached waiters; struct task_struct *owner; -#ifdef CONFIG_DEBUG_RT_MUTEXES int save_state; - const char *name, *file; - int line; - void *magic; -#endif #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif @@ -49,6 +48,7 @@ struct hrtimer_sleeper; extern int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len); extern void rt_mutex_debug_check_no_locks_held(struct task_struct *task); + extern void rt_mutex_debug_task_free(struct task_struct *tsk); #else static inline int rt_mutex_debug_check_no_locks_freed(const void *from, unsigned long len) @@ -56,25 +56,15 @@ struct hrtimer_sleeper; return 0; } # define rt_mutex_debug_check_no_locks_held(task) do { } while (0) +# define rt_mutex_debug_task_free(t) do { } while (0) #endif -#ifdef CONFIG_DEBUG_RT_MUTEXES -# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ - , .name = #mutexname, .file = __FILE__, .line = __LINE__ - -# define rt_mutex_init(mutex) \ +#define rt_mutex_init(mutex) \ do { \ static struct lock_class_key __key; \ __rt_mutex_init(mutex, __func__, &__key); \ } while (0) - extern void rt_mutex_debug_task_free(struct task_struct *tsk); -#else -# define __DEBUG_RT_MUTEX_INITIALIZER(mutexname) -# define rt_mutex_init(mutex) __rt_mutex_init(mutex, NULL, NULL) -# define rt_mutex_debug_task_free(t) do { } while (0) -#endif - #ifdef CONFIG_DEBUG_LOCK_ALLOC #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) \ , .dep_map = { .name = #mutexname } @@ -82,12 +72,19 @@ do { \ #define __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) #endif -#define __RT_MUTEX_INITIALIZER(mutexname) \ - { .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ +#define __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(mutexname.wait_lock) \ , .waiters = RB_ROOT_CACHED \ , .owner = NULL \ - __DEBUG_RT_MUTEX_INITIALIZER(mutexname) \ - __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname)} + __DEP_MAP_RT_MUTEX_INITIALIZER(mutexname) + +#define __RT_MUTEX_INITIALIZER(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + , .save_state = 0 } + +#define __RT_MUTEX_INITIALIZER_SAVE_STATE(mutexname) \ + { __RT_MUTEX_INITIALIZER_PLAIN(mutexname) \ + , .save_state = 1 } #define DEFINE_RT_MUTEX(mutexname) \ struct rt_mutex mutexname = __RT_MUTEX_INITIALIZER(mutexname) @@ -115,9 +112,6 @@ extern void rt_mutex_lock(struct rt_mutex *lock); #endif extern int rt_mutex_lock_interruptible(struct rt_mutex *lock); -extern int rt_mutex_timed_lock(struct rt_mutex *lock, - struct hrtimer_sleeper *timeout); - extern int rt_mutex_trylock(struct rt_mutex *lock); extern void rt_mutex_unlock(struct rt_mutex *lock); diff --git a/include/linux/rwlock_rt.h b/include/linux/rwlock_rt.h new file mode 100644 index 000000000..aafdb0a68 --- /dev/null +++ b/include/linux/rwlock_rt.h @@ -0,0 +1,109 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_RWLOCK_RT_H +#define __LINUX_RWLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. Use spinlock.h +#endif + +extern void __lockfunc rt_write_lock(rwlock_t *rwlock); +extern void __lockfunc rt_read_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_trylock(rwlock_t *rwlock); +extern int __lockfunc rt_read_trylock(rwlock_t *rwlock); +extern void __lockfunc rt_write_unlock(rwlock_t *rwlock); +extern void __lockfunc rt_read_unlock(rwlock_t *rwlock); +extern int __lockfunc rt_read_can_lock(rwlock_t *rwlock); +extern int __lockfunc rt_write_can_lock(rwlock_t *rwlock); +extern void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key); + +#define read_can_lock(rwlock) rt_read_can_lock(rwlock) +#define write_can_lock(rwlock) rt_write_can_lock(rwlock) + +#define read_trylock(lock) __cond_lock(lock, rt_read_trylock(lock)) +#define write_trylock(lock) __cond_lock(lock, rt_write_trylock(lock)) + +static inline int __write_trylock_rt_irqsave(rwlock_t *lock, unsigned long *flags) +{ + *flags = 0; + return rt_write_trylock(lock); +} + +#define write_trylock_irqsave(lock, flags) \ + __cond_lock(lock, __write_trylock_rt_irqsave(lock, &(flags))) + +#define read_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + rt_read_lock(lock); \ + flags = 0; \ + } while (0) + +#define write_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + rt_write_lock(lock); \ + flags = 0; \ + } while (0) + +#define read_lock(lock) rt_read_lock(lock) + +#define read_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + rt_read_lock(lock); \ + } while (0) + +#define read_lock_irq(lock) read_lock(lock) + +#define write_lock(lock) rt_write_lock(lock) + +#define write_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + rt_write_lock(lock); \ + } while (0) + +#define write_lock_irq(lock) write_lock(lock) + +#define read_unlock(lock) rt_read_unlock(lock) + +#define read_unlock_bh(lock) \ + do { \ + rt_read_unlock(lock); \ + local_bh_enable(); \ + } while (0) + +#define read_unlock_irq(lock) read_unlock(lock) + +#define write_unlock(lock) rt_write_unlock(lock) + +#define write_unlock_bh(lock) \ + do { \ + rt_write_unlock(lock); \ + local_bh_enable(); \ + } while (0) + +#define write_unlock_irq(lock) write_unlock(lock) + +#define read_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_read_unlock(lock); \ + } while (0) + +#define write_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + rt_write_unlock(lock); \ + } while (0) + +#define rwlock_init(rwl) \ +do { \ + static struct lock_class_key __key; \ + \ + __rt_rwlock_init(rwl, #rwl, &__key); \ +} while (0) + +#endif diff --git a/include/linux/rwlock_types.h b/include/linux/rwlock_types.h index 3bd03e180..0ad226b5d 100644 --- a/include/linux/rwlock_types.h +++ b/include/linux/rwlock_types.h @@ -1,6 +1,10 @@ #ifndef __LINUX_RWLOCK_TYPES_H #define __LINUX_RWLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) +# error "Do not include directly, include spinlock_types.h" +#endif + /* * include/linux/rwlock_types.h - generic rwlock type definitions * and initializers diff --git a/include/linux/rwlock_types_rt.h b/include/linux/rwlock_types_rt.h new file mode 100644 index 000000000..4762391d6 --- /dev/null +++ b/include/linux/rwlock_types_rt.h @@ -0,0 +1,56 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_RWLOCK_TYPES_RT_H +#define __LINUX_RWLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. Include spinlock_types.h instead" +#endif + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RW_DEP_MAP_INIT(lockname) .dep_map = { .name = #lockname } +#else +# define RW_DEP_MAP_INIT(lockname) +#endif + +typedef struct rt_rw_lock rwlock_t; + +#define __RW_LOCK_UNLOCKED(name) __RWLOCK_RT_INITIALIZER(name) + +#define DEFINE_RWLOCK(name) \ + rwlock_t name = __RW_LOCK_UNLOCKED(name) + +/* + * A reader biased implementation primarily for CPU pinning. + * + * Can be selected as general replacement for the single reader RT rwlock + * variant + */ +struct rt_rw_lock { + struct rt_mutex rtmutex; + atomic_t readers; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define READER_BIAS (1U << 31) +#define WRITER_BIAS (1U << 30) + +#define __RWLOCK_RT_INITIALIZER(name) \ +{ \ + .readers = ATOMIC_INIT(READER_BIAS), \ + .rtmutex = __RT_MUTEX_INITIALIZER_SAVE_STATE(name.rtmutex), \ + RW_DEP_MAP_INIT(name) \ +} + +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, + struct lock_class_key *key); + +#define rwlock_biased_rt_init(rwlock) \ + do { \ + static struct lock_class_key __key; \ + \ + __rwlock_biased_rt_init((rwlock), #rwlock, &__key); \ + } while (0) + +#endif diff --git a/include/linux/rwsem-rt.h b/include/linux/rwsem-rt.h new file mode 100644 index 000000000..0ba8aae9a --- /dev/null +++ b/include/linux/rwsem-rt.h @@ -0,0 +1,70 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef _LINUX_RWSEM_RT_H +#define _LINUX_RWSEM_RT_H + +#ifndef _LINUX_RWSEM_H +#error "Include rwsem.h" +#endif + +#include <linux/rtmutex.h> +#include <linux/swait.h> + +#define READER_BIAS (1U << 31) +#define WRITER_BIAS (1U << 30) + +struct rw_semaphore { + atomic_t readers; + struct rt_mutex rtmutex; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +}; + +#define __RWSEM_INITIALIZER(name) \ +{ \ + .readers = ATOMIC_INIT(READER_BIAS), \ + .rtmutex = __RT_MUTEX_INITIALIZER(name.rtmutex), \ + RW_DEP_MAP_INIT(name) \ +} + +#define DECLARE_RWSEM(lockname) \ + struct rw_semaphore lockname = __RWSEM_INITIALIZER(lockname) + +extern void __rwsem_init(struct rw_semaphore *rwsem, const char *name, + struct lock_class_key *key); + +#define __init_rwsem(sem, name, key) \ +do { \ + rt_mutex_init(&(sem)->rtmutex); \ + __rwsem_init((sem), (name), (key)); \ +} while (0) + +#define init_rwsem(sem) \ +do { \ + static struct lock_class_key __key; \ + \ + __init_rwsem((sem), #sem, &__key); \ +} while (0) + +static inline int rwsem_is_locked(struct rw_semaphore *sem) +{ + return atomic_read(&sem->readers) != READER_BIAS; +} + +static inline int rwsem_is_contended(struct rw_semaphore *sem) +{ + return atomic_read(&sem->readers) > 0; +} + +extern void __down_read(struct rw_semaphore *sem); +extern int __down_read_interruptible(struct rw_semaphore *sem); +extern int __down_read_killable(struct rw_semaphore *sem); +extern int __down_read_trylock(struct rw_semaphore *sem); +extern void __down_write(struct rw_semaphore *sem); +extern int __must_check __down_write_killable(struct rw_semaphore *sem); +extern int __down_write_trylock(struct rw_semaphore *sem); +extern void __up_read(struct rw_semaphore *sem); +extern void __up_write(struct rw_semaphore *sem); +extern void __downgrade_write(struct rw_semaphore *sem); + +#endif diff --git a/include/linux/rwsem.h b/include/linux/rwsem.h index 4c715be48..9323af8a9 100644 --- a/include/linux/rwsem.h +++ b/include/linux/rwsem.h @@ -16,6 +16,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/err.h> + +#ifdef CONFIG_PREEMPT_RT +#include <linux/rwsem-rt.h> +#else /* PREEMPT_RT */ + #ifdef CONFIG_RWSEM_SPIN_ON_OWNER #include <linux/osq_lock.h> #endif @@ -119,6 +124,13 @@ static inline int rwsem_is_contended(struct rw_semaphore *sem) return !list_empty(&sem->wait_list); } +#endif /* !PREEMPT_RT */ + +/* + * The functions below are the same for all rwsem implementations including + * the RT specific variant. + */ + /* * lock for reading */ diff --git a/include/linux/sched.h b/include/linux/sched.h index edd236f98..1cc9a0447 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -36,6 +36,7 @@ #include <linux/kcsan.h> #include <linux/thread_bits.h> #include <linux/kabi.h> +#include <asm/kmap_size.h> /* task_struct member predeclarations (sorted alphabetically): */ struct audit_context; @@ -114,12 +115,8 @@ struct io_uring_task; __TASK_TRACED | EXIT_DEAD | EXIT_ZOMBIE | \ TASK_PARKED) -#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0) - #define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0) -#define task_is_stopped_or_traced(task) ((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0) - #ifdef CONFIG_DEBUG_ATOMIC_SLEEP /* @@ -143,6 +140,9 @@ struct io_uring_task; smp_store_mb(current->state, (state_value)); \ } while (0) +#define __set_current_state_no_track(state_value) \ + current->state = (state_value); + #define set_special_state(state_value) \ do { \ unsigned long flags; /* may shadow */ \ @@ -196,6 +196,9 @@ struct io_uring_task; #define set_current_state(state_value) \ smp_store_mb(current->state, (state_value)) +#define __set_current_state_no_track(state_value) \ + __set_current_state(state_value) + /* * set_special_state() should be used for those states when the blocking task * can not use the regular condition based wait-loop. In that case we must @@ -674,6 +677,13 @@ struct wake_q_node { struct task_struct_resvd { }; +struct kmap_ctrl { +#ifdef CONFIG_KMAP_LOCAL + int idx; + pte_t pteval[KM_MAX_IDX]; +#endif +}; + struct task_struct { #ifdef CONFIG_THREAD_INFO_IN_TASK /* @@ -684,6 +694,8 @@ struct task_struct { #endif /* -1 unrunnable, 0 runnable, >0 stopped: */ volatile long state; + /* saved state for "spinlock sleepers" */ + volatile long saved_state; /* * This begins the randomizable portion of task_struct. Only @@ -755,6 +767,11 @@ struct task_struct { int nr_cpus_allowed; const cpumask_t *cpus_ptr; cpumask_t cpus_mask; + void *migration_pending; +#ifdef CONFIG_SMP + unsigned short migration_disabled; +#endif + unsigned short migration_flags; #ifdef CONFIG_PREEMPT_RCU int rcu_read_lock_nesting; @@ -860,6 +877,10 @@ struct task_struct { /* Stalled due to lack of memory */ unsigned in_memstall:1; #endif +#ifdef CONFIG_EVENTFD + /* Recursion prevention for eventfd_signal() */ + unsigned in_eventfd_signal:1; +#endif unsigned long atomic_flags; /* Flags requiring atomic access. */ @@ -1001,11 +1022,16 @@ struct task_struct { /* Signal handlers: */ struct signal_struct *signal; struct sighand_struct __rcu *sighand; + struct sigqueue *sigqueue_cache; sigset_t blocked; sigset_t real_blocked; /* Restored if set_restore_sigmask() was used: */ sigset_t saved_sigmask; struct sigpending pending; +#ifdef CONFIG_PREEMPT_RT + /* TODO: move me into ->restart_block ? */ + struct kernel_siginfo forced_info; +#endif unsigned long sas_ss_sp; size_t sas_ss_size; unsigned int sas_ss_flags; @@ -1032,6 +1058,7 @@ struct task_struct { raw_spinlock_t pi_lock; struct wake_q_node wake_q; + struct wake_q_node wake_q_sleeper; #ifdef CONFIG_RT_MUTEXES /* PI waiters blocked on a rt_mutex held by this task: */ @@ -1059,6 +1086,9 @@ struct task_struct { int softirq_context; int irq_config; #endif +#ifdef CONFIG_PREEMPT_RT + int softirq_disable_cnt; +#endif #ifdef CONFIG_LOCKDEP # define MAX_LOCK_DEPTH 48UL @@ -1344,6 +1374,7 @@ struct task_struct { unsigned int sequential_io; unsigned int sequential_io_avg; #endif + struct kmap_ctrl kmap_ctrl; #ifdef CONFIG_DEBUG_ATOMIC_SLEEP unsigned long task_state_change; #endif @@ -1809,6 +1840,7 @@ extern struct task_struct *find_get_task_by_vpid(pid_t nr); extern int wake_up_state(struct task_struct *tsk, unsigned int state); extern int wake_up_process(struct task_struct *tsk); +extern int wake_up_lock_sleeper(struct task_struct *tsk); extern void wake_up_new_task(struct task_struct *tsk); #ifdef CONFIG_SMP @@ -1899,6 +1931,89 @@ static inline int test_tsk_need_resched(struct task_struct *tsk) return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED)); } +#ifdef CONFIG_PREEMPT_LAZY +static inline void set_tsk_need_resched_lazy(struct task_struct *tsk) +{ + set_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) +{ + clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY); +} + +static inline int test_tsk_need_resched_lazy(struct task_struct *tsk) +{ + return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED_LAZY)); +} + +static inline int need_resched_lazy(void) +{ + return test_thread_flag(TIF_NEED_RESCHED_LAZY); +} + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#else +static inline void clear_tsk_need_resched_lazy(struct task_struct *tsk) { } +static inline int need_resched_lazy(void) { return 0; } + +static inline int need_resched_now(void) +{ + return test_thread_flag(TIF_NEED_RESCHED); +} + +#endif + + +static inline bool __task_is_stopped_or_traced(struct task_struct *task) +{ + if (task->state & (__TASK_STOPPED | __TASK_TRACED)) + return true; +#ifdef CONFIG_PREEMPT_RT + if (task->saved_state & (__TASK_STOPPED | __TASK_TRACED)) + return true; +#endif + return false; +} + +static inline bool task_is_stopped_or_traced(struct task_struct *task) +{ + bool traced_stopped; + +#ifdef CONFIG_PREEMPT_RT + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + traced_stopped = __task_is_stopped_or_traced(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); +#else + traced_stopped = __task_is_stopped_or_traced(task); +#endif + return traced_stopped; +} + +static inline bool task_is_traced(struct task_struct *task) +{ + bool traced = false; + + if (task->state & __TASK_TRACED) + return true; +#ifdef CONFIG_PREEMPT_RT + /* in case the task is sleeping on tasklist_lock */ + raw_spin_lock_irq(&task->pi_lock); + if (task->state & __TASK_TRACED) + traced = true; + else if (task->saved_state & __TASK_TRACED) + traced = true; + raw_spin_unlock_irq(&task->pi_lock); +#endif + return traced; +} + /* * cond_resched() and cond_resched_lock(): latency reduction via * explicit rescheduling in places that are safe. The return diff --git a/include/linux/sched/hotplug.h b/include/linux/sched/hotplug.h index 9a62ffdd2..412cdaba3 100644 --- a/include/linux/sched/hotplug.h +++ b/include/linux/sched/hotplug.h @@ -11,8 +11,10 @@ extern int sched_cpu_activate(unsigned int cpu); extern int sched_cpu_deactivate(unsigned int cpu); #ifdef CONFIG_HOTPLUG_CPU +extern int sched_cpu_wait_empty(unsigned int cpu); extern int sched_cpu_dying(unsigned int cpu); #else +# define sched_cpu_wait_empty NULL # define sched_cpu_dying NULL #endif diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index dc1f4dcd9..9796cc213 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -49,6 +49,17 @@ static inline void mmdrop(struct mm_struct *mm) __mmdrop(mm); } +#ifdef CONFIG_PREEMPT_RT +extern void __mmdrop_delayed(struct rcu_head *rhp); +static inline void mmdrop_delayed(struct mm_struct *mm) +{ + if (atomic_dec_and_test(&mm->mm_count)) + call_rcu(&mm->delayed_drop, __mmdrop_delayed); +} +#else +# define mmdrop_delayed(mm) mmdrop(mm) +#endif + /** * mmget() - Pin the address space associated with a &struct mm_struct. * @mm: The address space to pin. diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h index e5af028c0..994c25640 100644 --- a/include/linux/sched/rt.h +++ b/include/linux/sched/rt.h @@ -39,20 +39,12 @@ static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *p) } extern void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task); extern void rt_mutex_adjust_pi(struct task_struct *p); -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return tsk->pi_blocked_on != NULL; -} #else static inline struct task_struct *rt_mutex_get_top_task(struct task_struct *task) { return NULL; } # define rt_mutex_adjust_pi(p) do { } while (0) -static inline bool tsk_is_pi_blocked(struct task_struct *tsk) -{ - return false; -} #endif extern void normalize_rt_tasks(void); diff --git a/include/linux/sched/wake_q.h b/include/linux/sched/wake_q.h index 26a2013ac..6e2dff721 100644 --- a/include/linux/sched/wake_q.h +++ b/include/linux/sched/wake_q.h @@ -58,6 +58,17 @@ static inline bool wake_q_empty(struct wake_q_head *head) extern void wake_q_add(struct wake_q_head *head, struct task_struct *task); extern void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task); -extern void wake_up_q(struct wake_q_head *head); +extern void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task); +extern void __wake_up_q(struct wake_q_head *head, bool sleeper); + +static inline void wake_up_q(struct wake_q_head *head) +{ + __wake_up_q(head, false); +} + +static inline void wake_up_q_sleeper(struct wake_q_head *head) +{ + __wake_up_q(head, true); +} #endif /* _LINUX_SCHED_WAKE_Q_H */ diff --git a/include/linux/serial_8250.h b/include/linux/serial_8250.h index 2b70f736b..68d756373 100644 --- a/include/linux/serial_8250.h +++ b/include/linux/serial_8250.h @@ -7,6 +7,7 @@ #ifndef _LINUX_SERIAL_8250_H #define _LINUX_SERIAL_8250_H +#include <linux/atomic.h> #include <linux/serial_core.h> #include <linux/serial_reg.h> #include <linux/platform_device.h> @@ -125,6 +126,8 @@ struct uart_8250_port { #define MSR_SAVE_FLAGS UART_MSR_ANY_DELTA unsigned char msr_saved_flags; + atomic_t console_printing; + struct uart_8250_dma *dma; const struct uart_8250_ops *ops; @@ -180,6 +183,8 @@ void serial8250_init_port(struct uart_8250_port *up); void serial8250_set_defaults(struct uart_8250_port *up); void serial8250_console_write(struct uart_8250_port *up, const char *s, unsigned int count); +void serial8250_console_write_atomic(struct uart_8250_port *up, const char *s, + unsigned int count); int serial8250_console_setup(struct uart_port *port, char *options, bool probe); int serial8250_console_exit(struct uart_port *port); diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index a5a5d1d4d..0470d1582 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -31,7 +31,7 @@ struct shmem_sb_info { struct percpu_counter used_blocks; /* How many are allocated */ unsigned long max_inodes; /* How many inodes are allowed */ unsigned long free_inodes; /* How many are left for allocation */ - spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ + raw_spinlock_t stat_lock; /* Serialize shmem_sb_info changes */ umode_t mode; /* Mount mode for root directory */ unsigned char huge; /* Whether to try for hugepages */ kuid_t uid; /* Mount uid for root directory */ diff --git a/include/linux/signal.h b/include/linux/signal.h index b256f9c65..ebf6c515a 100644 --- a/include/linux/signal.h +++ b/include/linux/signal.h @@ -265,6 +265,7 @@ static inline void init_sigpending(struct sigpending *sig) } extern void flush_sigqueue(struct sigpending *queue); +extern void flush_task_sigqueue(struct task_struct *tsk); /* Test if 'sig' is valid signal. Use this instead of testing _NSIG directly */ static inline int valid_signal(unsigned long sig) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index e5f61bdd4..ddcab976a 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -298,6 +298,7 @@ struct sk_buff_head { __u32 qlen; spinlock_t lock; + raw_spinlock_t raw_lock; }; struct sk_buff; @@ -1913,6 +1914,12 @@ static inline void skb_queue_head_init(struct sk_buff_head *list) __skb_queue_head_init(list); } +static inline void skb_queue_head_init_raw(struct sk_buff_head *list) +{ + raw_spin_lock_init(&list->raw_lock); + __skb_queue_head_init(list); +} + static inline void skb_queue_head_init_class(struct sk_buff_head *list, struct lock_class_key *class) { diff --git a/include/linux/smp.h b/include/linux/smp.h index 84a0b4828..8348fa412 100644 --- a/include/linux/smp.h +++ b/include/linux/smp.h @@ -260,6 +260,9 @@ static inline int get_boot_cpu_id(void) #define get_cpu() ({ preempt_disable(); __smp_processor_id(); }) #define put_cpu() preempt_enable() +#define get_cpu_light() ({ migrate_disable(); __smp_processor_id(); }) +#define put_cpu_light() migrate_enable() + /* * Callback to arch code if there's nosmp or maxcpus=0 on the * boot command line: diff --git a/include/linux/spinlock.h b/include/linux/spinlock.h index 79897841a..c3c70291b 100644 --- a/include/linux/spinlock.h +++ b/include/linux/spinlock.h @@ -309,7 +309,11 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) }) /* Include rwlock functions */ -#include <linux/rwlock.h> +#ifdef CONFIG_PREEMPT_RT +# include <linux/rwlock_rt.h> +#else +# include <linux/rwlock.h> +#endif /* * Pull the _spin_*()/_read_*()/_write_*() functions/declarations: @@ -320,6 +324,10 @@ static inline void do_raw_spin_unlock(raw_spinlock_t *lock) __releases(lock) # include <linux/spinlock_api_up.h> #endif +#ifdef CONFIG_PREEMPT_RT +# include <linux/spinlock_rt.h> +#else /* PREEMPT_RT */ + /* * Map the spin_lock functions to the raw variants for PREEMPT_RT=n */ @@ -454,6 +462,8 @@ static __always_inline int spin_is_contended(spinlock_t *lock) #define assert_spin_locked(lock) assert_raw_spin_locked(&(lock)->rlock) +#endif /* !PREEMPT_RT */ + /* * Pull the atomic_t declaration: * (asm-mips/atomic.h needs above definitions) diff --git a/include/linux/spinlock_api_smp.h b/include/linux/spinlock_api_smp.h index 19a9be9d9..da38149f2 100644 --- a/include/linux/spinlock_api_smp.h +++ b/include/linux/spinlock_api_smp.h @@ -187,6 +187,8 @@ static inline int __raw_spin_trylock_bh(raw_spinlock_t *lock) return 0; } -#include <linux/rwlock_api_smp.h> +#ifndef CONFIG_PREEMPT_RT +# include <linux/rwlock_api_smp.h> +#endif #endif /* __LINUX_SPINLOCK_API_SMP_H */ diff --git a/include/linux/spinlock_rt.h b/include/linux/spinlock_rt.h new file mode 100644 index 000000000..3085132ea --- /dev/null +++ b/include/linux/spinlock_rt.h @@ -0,0 +1,155 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_SPINLOCK_RT_H +#define __LINUX_SPINLOCK_RT_H + +#ifndef __LINUX_SPINLOCK_H +#error Do not include directly. Use spinlock.h +#endif + +#include <linux/bug.h> + +extern void +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key); + +#define spin_lock_init(slock) \ +do { \ + static struct lock_class_key __key; \ + \ + rt_mutex_init(&(slock)->lock); \ + __rt_spin_lock_init(slock, #slock, &__key); \ +} while (0) + +extern void __lockfunc rt_spin_lock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass); +extern void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, struct lockdep_map *nest_lock); +extern void __lockfunc rt_spin_unlock(spinlock_t *lock); +extern void __lockfunc rt_spin_lock_unlock(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock_irqsave(spinlock_t *lock, unsigned long *flags); +extern int __lockfunc rt_spin_trylock_bh(spinlock_t *lock); +extern int __lockfunc rt_spin_trylock(spinlock_t *lock); +extern int atomic_dec_and_spin_lock(atomic_t *atomic, spinlock_t *lock); + +/* + * lockdep-less calls, for derived types like rwlock: + * (for trylock they can use rt_mutex_trylock() directly. + * Migrate disable handling must be done at the call site. + */ +extern void __lockfunc __rt_spin_lock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_trylock(struct rt_mutex *lock); +extern void __lockfunc __rt_spin_unlock(struct rt_mutex *lock); + +#define spin_lock(lock) rt_spin_lock(lock) + +#define spin_lock_bh(lock) \ + do { \ + local_bh_disable(); \ + rt_spin_lock(lock); \ + } while (0) + +#define spin_lock_irq(lock) spin_lock(lock) + +#define spin_do_trylock(lock) __cond_lock(lock, rt_spin_trylock(lock)) + +#define spin_trylock(lock) \ +({ \ + int __locked; \ + __locked = spin_do_trylock(lock); \ + __locked; \ +}) + +#ifdef CONFIG_LOCKDEP +# define spin_lock_nested(lock, subclass) \ + do { \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) + +#define spin_lock_bh_nested(lock, subclass) \ + do { \ + local_bh_disable(); \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) + +# define spin_lock_nest_lock(lock, subclass) \ + do { \ + typecheck(struct lockdep_map *, &(subclass)->dep_map); \ + rt_spin_lock_nest_lock(lock, &(subclass)->dep_map); \ + } while (0) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + rt_spin_lock_nested(lock, subclass); \ + } while (0) +#else +# define spin_lock_nested(lock, subclass) spin_lock(((void)(subclass), (lock))) +# define spin_lock_nest_lock(lock, subclass) spin_lock(((void)(subclass), (lock))) +# define spin_lock_bh_nested(lock, subclass) spin_lock_bh(((void)(subclass), (lock))) + +# define spin_lock_irqsave_nested(lock, flags, subclass) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(((void)(subclass), (lock))); \ + } while (0) +#endif + +#define spin_lock_irqsave(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + flags = 0; \ + spin_lock(lock); \ + } while (0) + +#define spin_unlock(lock) rt_spin_unlock(lock) + +#define spin_unlock_bh(lock) \ + do { \ + rt_spin_unlock(lock); \ + local_bh_enable(); \ + } while (0) + +#define spin_unlock_irq(lock) spin_unlock(lock) + +#define spin_unlock_irqrestore(lock, flags) \ + do { \ + typecheck(unsigned long, flags); \ + (void) flags; \ + spin_unlock(lock); \ + } while (0) + +#define spin_trylock_bh(lock) __cond_lock(lock, rt_spin_trylock_bh(lock)) +#define spin_trylock_irq(lock) spin_trylock(lock) + +#define spin_trylock_irqsave(lock, flags) \ +({ \ + int __locked; \ + \ + typecheck(unsigned long, flags); \ + flags = 0; \ + __locked = spin_trylock(lock); \ + __locked; \ +}) + +#ifdef CONFIG_GENERIC_LOCKBREAK +# define spin_is_contended(lock) ((lock)->break_lock) +#else +# define spin_is_contended(lock) (((void)(lock), 0)) +#endif + +static inline int spin_can_lock(spinlock_t *lock) +{ + return !rt_mutex_is_locked(&lock->lock); +} + +static inline int spin_is_locked(spinlock_t *lock) +{ + return rt_mutex_is_locked(&lock->lock); +} + +static inline void assert_spin_locked(spinlock_t *lock) +{ + BUG_ON(!spin_is_locked(lock)); +} + +#endif diff --git a/include/linux/spinlock_types.h b/include/linux/spinlock_types.h index b981caafe..8d896d3e1 100644 --- a/include/linux/spinlock_types.h +++ b/include/linux/spinlock_types.h @@ -9,93 +9,15 @@ * Released under the General Public License (GPL). */ -#if defined(CONFIG_SMP) -# include <asm/spinlock_types.h> -#else -# include <linux/spinlock_types_up.h> -#endif - -#include <linux/lockdep_types.h> - -typedef struct raw_spinlock { - arch_spinlock_t raw_lock; -#ifdef CONFIG_DEBUG_SPINLOCK - unsigned int magic, owner_cpu; - void *owner; -#endif -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map dep_map; -#endif -} raw_spinlock_t; - -#define SPINLOCK_MAGIC 0xdead4ead - -#define SPINLOCK_OWNER_INIT ((void *)-1L) - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define RAW_SPIN_DEP_MAP_INIT(lockname) \ - .dep_map = { \ - .name = #lockname, \ - .wait_type_inner = LD_WAIT_SPIN, \ - } -# define SPIN_DEP_MAP_INIT(lockname) \ - .dep_map = { \ - .name = #lockname, \ - .wait_type_inner = LD_WAIT_CONFIG, \ - } -#else -# define RAW_SPIN_DEP_MAP_INIT(lockname) -# define SPIN_DEP_MAP_INIT(lockname) -#endif +#include <linux/spinlock_types_raw.h> -#ifdef CONFIG_DEBUG_SPINLOCK -# define SPIN_DEBUG_INIT(lockname) \ - .magic = SPINLOCK_MAGIC, \ - .owner_cpu = -1, \ - .owner = SPINLOCK_OWNER_INIT, +#ifndef CONFIG_PREEMPT_RT +# include <linux/spinlock_types_nort.h> +# include <linux/rwlock_types.h> #else -# define SPIN_DEBUG_INIT(lockname) +# include <linux/rtmutex.h> +# include <linux/spinlock_types_rt.h> +# include <linux/rwlock_types_rt.h> #endif -#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ - { \ - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ - SPIN_DEBUG_INIT(lockname) \ - RAW_SPIN_DEP_MAP_INIT(lockname) } - -#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ - (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) - -typedef struct spinlock { - union { - struct raw_spinlock rlock; - -#ifdef CONFIG_DEBUG_LOCK_ALLOC -# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) - struct { - u8 __padding[LOCK_PADSIZE]; - struct lockdep_map dep_map; - }; -#endif - }; -} spinlock_t; - -#define ___SPIN_LOCK_INITIALIZER(lockname) \ - { \ - .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ - SPIN_DEBUG_INIT(lockname) \ - SPIN_DEP_MAP_INIT(lockname) } - -#define __SPIN_LOCK_INITIALIZER(lockname) \ - { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } } - -#define __SPIN_LOCK_UNLOCKED(lockname) \ - (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname) - -#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) - -#include <linux/rwlock_types.h> - #endif /* __LINUX_SPINLOCK_TYPES_H */ diff --git a/include/linux/spinlock_types_nort.h b/include/linux/spinlock_types_nort.h new file mode 100644 index 000000000..e4549f0dd --- /dev/null +++ b/include/linux/spinlock_types_nort.h @@ -0,0 +1,39 @@ +#ifndef __LINUX_SPINLOCK_TYPES_NORT_H +#define __LINUX_SPINLOCK_TYPES_NORT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. Include spinlock_types.h instead" +#endif + +/* + * The non RT version maps spinlocks to raw_spinlocks + */ +typedef struct spinlock { + union { + struct raw_spinlock rlock; + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define LOCK_PADSIZE (offsetof(struct raw_spinlock, dep_map)) + struct { + u8 __padding[LOCK_PADSIZE]; + struct lockdep_map dep_map; + }; +#endif + }; +} spinlock_t; + +#define ___SPIN_LOCK_INITIALIZER(lockname) \ +{ \ + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ + SPIN_DEBUG_INIT(lockname) \ + SPIN_DEP_MAP_INIT(lockname) } + +#define __SPIN_LOCK_INITIALIZER(lockname) \ + { { .rlock = ___SPIN_LOCK_INITIALIZER(lockname) } } + +#define __SPIN_LOCK_UNLOCKED(lockname) \ + (spinlock_t) __SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_SPINLOCK(x) spinlock_t x = __SPIN_LOCK_UNLOCKED(x) + +#endif diff --git a/include/linux/spinlock_types_raw.h b/include/linux/spinlock_types_raw.h new file mode 100644 index 000000000..1d4a180e9 --- /dev/null +++ b/include/linux/spinlock_types_raw.h @@ -0,0 +1,65 @@ +#ifndef __LINUX_SPINLOCK_TYPES_RAW_H +#define __LINUX_SPINLOCK_TYPES_RAW_H + +#include <linux/types.h> + +#if defined(CONFIG_SMP) +# include <asm/spinlock_types.h> +#else +# include <linux/spinlock_types_up.h> +#endif + +#include <linux/lockdep_types.h> + +typedef struct raw_spinlock { + arch_spinlock_t raw_lock; +#ifdef CONFIG_DEBUG_SPINLOCK + unsigned int magic, owner_cpu; + void *owner; +#endif +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} raw_spinlock_t; + +#define SPINLOCK_MAGIC 0xdead4ead + +#define SPINLOCK_OWNER_INIT ((void *)-1L) + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +# define RAW_SPIN_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_SPIN, \ + } +# define SPIN_DEP_MAP_INIT(lockname) \ + .dep_map = { \ + .name = #lockname, \ + .wait_type_inner = LD_WAIT_CONFIG, \ + } +#else +# define RAW_SPIN_DEP_MAP_INIT(lockname) +# define SPIN_DEP_MAP_INIT(lockname) +#endif + +#ifdef CONFIG_DEBUG_SPINLOCK +# define SPIN_DEBUG_INIT(lockname) \ + .magic = SPINLOCK_MAGIC, \ + .owner_cpu = -1, \ + .owner = SPINLOCK_OWNER_INIT, +#else +# define SPIN_DEBUG_INIT(lockname) +#endif + +#define __RAW_SPIN_LOCK_INITIALIZER(lockname) \ +{ \ + .raw_lock = __ARCH_SPIN_LOCK_UNLOCKED, \ + SPIN_DEBUG_INIT(lockname) \ + RAW_SPIN_DEP_MAP_INIT(lockname) } + +#define __RAW_SPIN_LOCK_UNLOCKED(lockname) \ + (raw_spinlock_t) __RAW_SPIN_LOCK_INITIALIZER(lockname) + +#define DEFINE_RAW_SPINLOCK(x) raw_spinlock_t x = __RAW_SPIN_LOCK_UNLOCKED(x) + +#endif diff --git a/include/linux/spinlock_types_rt.h b/include/linux/spinlock_types_rt.h new file mode 100644 index 000000000..446da786e --- /dev/null +++ b/include/linux/spinlock_types_rt.h @@ -0,0 +1,38 @@ +// SPDX-License-Identifier: GPL-2.0-only +#ifndef __LINUX_SPINLOCK_TYPES_RT_H +#define __LINUX_SPINLOCK_TYPES_RT_H + +#ifndef __LINUX_SPINLOCK_TYPES_H +#error "Do not include directly. Include spinlock_types.h instead" +#endif + +#include <linux/cache.h> + +/* + * PREEMPT_RT: spinlocks - an RT mutex plus lock-break field: + */ +typedef struct spinlock { + struct rt_mutex lock; + unsigned int break_lock; +#ifdef CONFIG_DEBUG_LOCK_ALLOC + struct lockdep_map dep_map; +#endif +} spinlock_t; + +#define __RT_SPIN_INITIALIZER(name) \ + { \ + .wait_lock = __RAW_SPIN_LOCK_UNLOCKED(name.wait_lock), \ + .save_state = 1, \ + } +/* +.wait_list = PLIST_HEAD_INIT_RAW((name).lock.wait_list, (name).lock.wait_lock) +*/ + +#define __SPIN_LOCK_UNLOCKED(name) \ + { .lock = __RT_SPIN_INITIALIZER(name.lock), \ + SPIN_DEP_MAP_INIT(name) } + +#define DEFINE_SPINLOCK(name) \ + spinlock_t name = __SPIN_LOCK_UNLOCKED(name) + +#endif diff --git a/include/linux/spinlock_types_up.h b/include/linux/spinlock_types_up.h index c09b6407a..d9b371fa1 100644 --- a/include/linux/spinlock_types_up.h +++ b/include/linux/spinlock_types_up.h @@ -1,7 +1,7 @@ #ifndef __LINUX_SPINLOCK_TYPES_UP_H #define __LINUX_SPINLOCK_TYPES_UP_H -#ifndef __LINUX_SPINLOCK_TYPES_H +#if !defined(__LINUX_SPINLOCK_TYPES_H) && !defined(__LINUX_RT_MUTEX_H) # error "please don't include this file directly" #endif diff --git a/include/linux/stop_machine.h b/include/linux/stop_machine.h index 08ec8e2fd..8eac0050b 100644 --- a/include/linux/stop_machine.h +++ b/include/linux/stop_machine.h @@ -25,6 +25,7 @@ typedef int (*cpu_stop_fn_t)(void *arg); struct cpu_stop_work { struct list_head list; /* cpu_stopper->works */ cpu_stop_fn_t fn; + unsigned long caller; void *arg; struct cpu_stop_done *done; KABI_RESERVE(1) @@ -38,6 +39,8 @@ void stop_machine_park(int cpu); void stop_machine_unpark(int cpu); void stop_machine_yield(const struct cpumask *cpumask); +extern void print_stop_info(const char *log_lvl, struct task_struct *task); + #else /* CONFIG_SMP */ #include <linux/workqueue.h> @@ -82,6 +85,8 @@ static inline bool stop_one_cpu_nowait(unsigned int cpu, return false; } +static inline void print_stop_info(const char *log_lvl, struct task_struct *task) { } + #endif /* CONFIG_SMP */ /* diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h index 19f76d87f..1c89c7d9f 100644 --- a/include/linux/thread_info.h +++ b/include/linux/thread_info.h @@ -36,7 +36,17 @@ static inline long set_restart_fn(struct restart_block *restart, #define THREADINFO_GFP (GFP_KERNEL_ACCOUNT | __GFP_ZERO) -#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#ifdef CONFIG_PREEMPT_LAZY +#define tif_need_resched() (test_thread_flag(TIF_NEED_RESCHED) || \ + test_thread_flag(TIF_NEED_RESCHED_LAZY)) +#define tif_need_resched_now() (test_thread_flag(TIF_NEED_RESCHED)) +#define tif_need_resched_lazy() test_thread_flag(TIF_NEED_RESCHED_LAZY)) + +#else +#define tif_need_resched() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_now() test_thread_flag(TIF_NEED_RESCHED) +#define tif_need_resched_lazy() 0 +#endif #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES static inline int arch_within_stack_frames(const void * const stack, diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h index 409385b25..4a0f567d6 100644 --- a/include/linux/trace_events.h +++ b/include/linux/trace_events.h @@ -67,6 +67,8 @@ struct trace_entry { unsigned char flags; unsigned char preempt_count; int pid; + unsigned char migrate_disable; + unsigned char preempt_lazy_count; }; #define TRACE_EVENT_TYPE_MAX \ @@ -152,17 +154,66 @@ static inline void tracing_generic_entry_update(struct trace_entry *entry, unsigned short type, unsigned int trace_ctx) { - struct task_struct *tsk = current; - entry->preempt_count = trace_ctx & 0xff; - entry->pid = (tsk) ? tsk->pid : 0; + entry->migrate_disable = (trace_ctx >> 8) & 0xff; + entry->preempt_lazy_count = (trace_ctx >> 16) & 0xff; + entry->pid = current->pid; entry->type = type; - entry->flags = trace_ctx >> 16; + entry->flags = trace_ctx >> 24; } -unsigned int tracing_gen_ctx_flags(unsigned long irqflags); -unsigned int tracing_gen_ctx(void); -unsigned int tracing_gen_ctx_dec(void); +unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); + +enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, + TRACE_FLAG_IRQS_NOSUPPORT = 0x02, + TRACE_FLAG_NEED_RESCHED = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, + TRACE_FLAG_NMI = 0x40, + TRACE_FLAG_NEED_RESCHED_LAZY = 0x80, +}; + +#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT +static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) +{ + unsigned int irq_status = irqs_disabled_flags(irqflags) ? + TRACE_FLAG_IRQS_OFF : 0; + return tracing_gen_ctx_irq_test(irq_status); +} +static inline unsigned int tracing_gen_ctx(void) +{ + unsigned long irqflags; + + local_save_flags(irqflags); + return tracing_gen_ctx_flags(irqflags); +} +#else + +static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) +{ + return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); +} +static inline unsigned int tracing_gen_ctx(void) +{ + return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); +} +#endif + +static inline unsigned int tracing_gen_ctx_dec(void) +{ + unsigned int trace_ctx; + + trace_ctx = tracing_gen_ctx(); + /* + * Subtract one from the preeption counter if preemption is enabled, + * see trace_event_buffer_reserve()for details. + */ + if (IS_ENABLED(CONFIG_PREEMPTION)) + trace_ctx--; + return trace_ctx; +} struct trace_event_file; diff --git a/include/linux/u64_stats_sync.h b/include/linux/u64_stats_sync.h index e81856c0b..66eb968a0 100644 --- a/include/linux/u64_stats_sync.h +++ b/include/linux/u64_stats_sync.h @@ -66,7 +66,7 @@ #include <linux/seqlock.h> struct u64_stats_sync { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG==32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) seqcount_t seq; #endif }; @@ -115,7 +115,7 @@ static inline void u64_stats_inc(u64_stats_t *p) } #endif -#if BITS_PER_LONG == 32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) #define u64_stats_init(syncp) seqcount_init(&(syncp)->seq) #else static inline void u64_stats_init(struct u64_stats_sync *syncp) @@ -125,15 +125,19 @@ static inline void u64_stats_init(struct u64_stats_sync *syncp) static inline void u64_stats_update_begin(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); write_seqcount_begin(&syncp->seq); #endif } static inline void u64_stats_update_end(struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); #endif } @@ -142,8 +146,11 @@ u64_stats_update_begin_irqsave(struct u64_stats_sync *syncp) { unsigned long flags = 0; -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) - local_irq_save(flags); +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_disable(); + else + local_irq_save(flags); write_seqcount_begin(&syncp->seq); #endif return flags; @@ -153,15 +160,18 @@ static inline void u64_stats_update_end_irqrestore(struct u64_stats_sync *syncp, unsigned long flags) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) write_seqcount_end(&syncp->seq); - local_irq_restore(flags); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + preempt_enable(); + else + local_irq_restore(flags); #endif } static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_begin(&syncp->seq); #else return 0; @@ -170,7 +180,7 @@ static inline unsigned int __u64_stats_fetch_begin(const struct u64_stats_sync * static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -179,7 +189,7 @@ static inline unsigned int u64_stats_fetch_begin(const struct u64_stats_sync *sy static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT)) return read_seqcount_retry(&syncp->seq, start); #else return false; @@ -189,7 +199,7 @@ static inline bool __u64_stats_fetch_retry(const struct u64_stats_sync *syncp, static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && (!defined(CONFIG_SMP) && !defined(CONFIG_PREEMPT_RT)) preempt_enable(); #endif return __u64_stats_fetch_retry(syncp, start); @@ -203,7 +213,9 @@ static inline bool u64_stats_fetch_retry(const struct u64_stats_sync *syncp, */ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync *syncp) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_disable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_disable(); #endif return __u64_stats_fetch_begin(syncp); @@ -212,7 +224,9 @@ static inline unsigned int u64_stats_fetch_begin_irq(const struct u64_stats_sync static inline bool u64_stats_fetch_retry_irq(const struct u64_stats_sync *syncp, unsigned int start) { -#if BITS_PER_LONG==32 && !defined(CONFIG_SMP) +#if BITS_PER_LONG == 32 && defined(CONFIG_PREEMPT_RT) + preempt_enable(); +#elif BITS_PER_LONG == 32 && !defined(CONFIG_SMP) local_irq_enable(); #endif return __u64_stats_fetch_retry(syncp, start); diff --git a/include/linux/vmstat.h b/include/linux/vmstat.h index 322dcbfcc..9a3a10ea3 100644 --- a/include/linux/vmstat.h +++ b/include/linux/vmstat.h @@ -63,7 +63,9 @@ DECLARE_PER_CPU(struct vm_event_state, vm_event_states); */ static inline void __count_vm_event(enum vm_event_item item) { + preempt_disable_rt(); raw_cpu_inc(vm_event_states.event[item]); + preempt_enable_rt(); } static inline void count_vm_event(enum vm_event_item item) @@ -73,7 +75,9 @@ static inline void count_vm_event(enum vm_event_item item) static inline void __count_vm_events(enum vm_event_item item, long delta) { + preempt_disable_rt(); raw_cpu_add(vm_event_states.event[item], delta); + preempt_enable_rt(); } static inline void count_vm_events(enum vm_event_item item, long delta) diff --git a/include/linux/vtime.h b/include/linux/vtime.h index 2cdeca062..041d6524d 100644 --- a/include/linux/vtime.h +++ b/include/linux/vtime.h @@ -83,36 +83,46 @@ static inline void vtime_init_idle(struct task_struct *tsk, int cpu) { } #endif #ifdef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE -extern void vtime_account_irq_enter(struct task_struct *tsk); -static inline void vtime_account_irq_exit(struct task_struct *tsk) -{ - /* On hard|softirq exit we always account to hard|softirq cputime */ - vtime_account_kernel(tsk); -} +extern void vtime_account_irq(struct task_struct *tsk, unsigned int offset); +extern void vtime_account_softirq(struct task_struct *tsk); +extern void vtime_account_hardirq(struct task_struct *tsk); extern void vtime_flush(struct task_struct *tsk); #else /* !CONFIG_VIRT_CPU_ACCOUNTING_NATIVE */ -static inline void vtime_account_irq_enter(struct task_struct *tsk) { } -static inline void vtime_account_irq_exit(struct task_struct *tsk) { } +static inline void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { } +static inline void vtime_account_softirq(struct task_struct *tsk) { } +static inline void vtime_account_hardirq(struct task_struct *tsk) { } static inline void vtime_flush(struct task_struct *tsk) { } #endif #ifdef CONFIG_IRQ_TIME_ACCOUNTING -extern void irqtime_account_irq(struct task_struct *tsk); +extern void irqtime_account_irq(struct task_struct *tsk, unsigned int offset); #else -static inline void irqtime_account_irq(struct task_struct *tsk) { } +static inline void irqtime_account_irq(struct task_struct *tsk, unsigned int offset) { } #endif -static inline void account_irq_enter_time(struct task_struct *tsk) +static inline void account_softirq_enter(struct task_struct *tsk) +{ + vtime_account_irq(tsk, SOFTIRQ_OFFSET); + irqtime_account_irq(tsk, SOFTIRQ_OFFSET); +} + +static inline void account_softirq_exit(struct task_struct *tsk) +{ + vtime_account_softirq(tsk); + irqtime_account_irq(tsk, 0); +} + +static inline void account_hardirq_enter(struct task_struct *tsk) { - vtime_account_irq_enter(tsk); - irqtime_account_irq(tsk); + vtime_account_irq(tsk, HARDIRQ_OFFSET); + irqtime_account_irq(tsk, HARDIRQ_OFFSET); } -static inline void account_irq_exit_time(struct task_struct *tsk) +static inline void account_hardirq_exit(struct task_struct *tsk) { - vtime_account_irq_exit(tsk); - irqtime_account_irq(tsk); + vtime_account_hardirq(tsk); + irqtime_account_irq(tsk, 0); } #endif /* _LINUX_KERNEL_VTIME_H */ diff --git a/include/linux/wait.h b/include/linux/wait.h index 9b8b08331..33001b534 100644 --- a/include/linux/wait.h +++ b/include/linux/wait.h @@ -10,6 +10,7 @@ #include <asm/current.h> #include <uapi/linux/wait.h> +#include <linux/atomic.h> typedef struct wait_queue_entry wait_queue_entry_t; diff --git a/include/linux/ww_mutex.h b/include/linux/ww_mutex.h index 6ecf2a022..3145de598 100644 --- a/include/linux/ww_mutex.h +++ b/include/linux/ww_mutex.h @@ -28,6 +28,14 @@ struct ww_class { unsigned int is_wait_die; }; +struct ww_mutex { + struct mutex base; + struct ww_acquire_ctx *ctx; +#ifdef CONFIG_DEBUG_MUTEXES + struct ww_class *ww_class; +#endif +}; + struct ww_acquire_ctx { struct task_struct *task; unsigned long stamp; diff --git a/include/net/gen_stats.h b/include/net/gen_stats.h index 1424e02ce..163f8415e 100644 --- a/include/net/gen_stats.h +++ b/include/net/gen_stats.h @@ -6,6 +6,7 @@ #include <linux/socket.h> #include <linux/rtnetlink.h> #include <linux/pkt_sched.h> +#include <net/net_seq_lock.h> /* Note: this used to be in include/uapi/linux/gen_stats.h */ struct gnet_stats_basic_packed { @@ -42,15 +43,15 @@ int gnet_stats_start_copy_compat(struct sk_buff *skb, int type, spinlock_t *lock, struct gnet_dump *d, int padattr); -int gnet_stats_copy_basic(const seqcount_t *running, +int gnet_stats_copy_basic(net_seqlock_t *running, struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); -void __gnet_stats_copy_basic(const seqcount_t *running, +void __gnet_stats_copy_basic(net_seqlock_t *running, struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); -int gnet_stats_copy_basic_hw(const seqcount_t *running, +int gnet_stats_copy_basic_hw(net_seqlock_t *running, struct gnet_dump *d, struct gnet_stats_basic_cpu __percpu *cpu, struct gnet_stats_basic_packed *b); @@ -70,13 +71,13 @@ int gen_new_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **rate_est, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + net_seqlock_t *running, struct nlattr *opt); void gen_kill_estimator(struct net_rate_estimator __rcu **ptr); int gen_replace_estimator(struct gnet_stats_basic_packed *bstats, struct gnet_stats_basic_cpu __percpu *cpu_bstats, struct net_rate_estimator __rcu **ptr, spinlock_t *lock, - seqcount_t *running, struct nlattr *opt); + net_seqlock_t *running, struct nlattr *opt); bool gen_estimator_active(struct net_rate_estimator __rcu **ptr); bool gen_estimator_read(struct net_rate_estimator __rcu **ptr, struct gnet_stats_rate_est64 *sample); diff --git a/include/net/net_seq_lock.h b/include/net/net_seq_lock.h new file mode 100644 index 000000000..67710bace --- /dev/null +++ b/include/net/net_seq_lock.h @@ -0,0 +1,15 @@ +#ifndef __NET_NET_SEQ_LOCK_H__ +#define __NET_NET_SEQ_LOCK_H__ + +#ifdef CONFIG_PREEMPT_RT +# define net_seqlock_t seqlock_t +# define net_seq_begin(__r) read_seqbegin(__r) +# define net_seq_retry(__r, __s) read_seqretry(__r, __s) + +#else +# define net_seqlock_t seqcount_t +# define net_seq_begin(__r) read_seqcount_begin(__r) +# define net_seq_retry(__r, __s) read_seqcount_retry(__r, __s) +#endif + +#endif diff --git a/include/net/netns/xfrm.h b/include/net/netns/xfrm.h index 9144e0f09..464d14b2a 100644 --- a/include/net/netns/xfrm.h +++ b/include/net/netns/xfrm.h @@ -74,7 +74,7 @@ struct netns_xfrm { struct dst_ops xfrm6_dst_ops; #endif spinlock_t xfrm_state_lock; - seqcount_t xfrm_state_hash_generation; + seqcount_spinlock_t xfrm_state_hash_generation; seqcount_spinlock_t xfrm_policy_hash_generation; spinlock_t xfrm_policy_lock; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index 3696246d3..36d01c65d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -10,6 +10,7 @@ #include <linux/percpu.h> #include <linux/dynamic_queue_limits.h> #include <linux/list.h> +#include <net/net_seq_lock.h> #include <linux/refcount.h> #include <linux/workqueue.h> #include <linux/mutex.h> @@ -102,7 +103,7 @@ struct Qdisc { struct sk_buff_head gso_skb ____cacheline_aligned_in_smp; struct qdisc_skb_head q; struct gnet_stats_basic_packed bstats; - seqcount_t running; + net_seqlock_t running; struct gnet_stats_queue qstats; unsigned long state; struct Qdisc *next_sched; @@ -146,7 +147,11 @@ static inline bool qdisc_is_running(struct Qdisc *qdisc) { if (qdisc->flags & TCQ_F_NOLOCK) return spin_is_locked(&qdisc->seqlock); +#ifdef CONFIG_PREEMPT_RT + return spin_is_locked(&qdisc->running.lock) ? true : false; +#else return (raw_read_seqcount(&qdisc->running) & 1) ? true : false; +#endif } static inline bool qdisc_is_percpu_stats(const struct Qdisc *q) @@ -207,17 +212,35 @@ static inline bool qdisc_run_begin(struct Qdisc *qdisc) } else if (qdisc_is_running(qdisc)) { return false; } +#ifdef CONFIG_PREEMPT_RT + if (spin_trylock(&qdisc->running.lock)) { + seqcount_t *s = &qdisc->running.seqcount.seqcount; + /* + * Variant of write_seqcount_t_begin() telling lockdep that a + * trylock was attempted. + */ + raw_write_seqcount_t_begin(s); + seqcount_acquire(&s->dep_map, 0, 1, _RET_IP_); + return true; + } + return false; +#else /* Variant of write_seqcount_begin() telling lockdep a trylock * was attempted. */ raw_write_seqcount_begin(&qdisc->running); seqcount_acquire(&qdisc->running.dep_map, 0, 1, _RET_IP_); return true; +#endif } static inline void qdisc_run_end(struct Qdisc *qdisc) { +#ifdef CONFIG_PREEMPT_RT + write_sequnlock(&qdisc->running); +#else write_seqcount_end(&qdisc->running); +#endif if (qdisc->flags & TCQ_F_NOLOCK) { spin_unlock(&qdisc->seqlock); @@ -605,7 +628,7 @@ static inline spinlock_t *qdisc_root_sleeping_lock(const struct Qdisc *qdisc) return qdisc_lock(root); } -static inline seqcount_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) +static inline net_seqlock_t *qdisc_root_sleeping_running(const struct Qdisc *qdisc) { struct Qdisc *root = qdisc_root_sleeping(qdisc); diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h index c96a4337a..e48f584ab 100644 --- a/include/trace/events/sched.h +++ b/include/trace/events/sched.h @@ -650,6 +650,18 @@ DECLARE_TRACE(sched_update_nr_running_tp, TP_PROTO(struct rq *rq, int change), TP_ARGS(rq, change)); +DECLARE_TRACE(sched_migrate_disable_tp, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + +DECLARE_TRACE(sched_migrate_enable_tp, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + +DECLARE_TRACE(sched_migrate_pull_tp, + TP_PROTO(struct task_struct *p), + TP_ARGS(p)); + #endif /* _TRACE_SCHED_H */ /* This part must be outside protection */ diff --git a/init/Kconfig b/init/Kconfig index 17533f1f1..c10f40c46 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -861,7 +861,7 @@ config NUMA_BALANCING bool "Memory placement aware NUMA scheduler" depends on ARCH_SUPPORTS_NUMA_BALANCING depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY - depends on SMP && NUMA && MIGRATION + depends on SMP && NUMA && MIGRATION && !PREEMPT_RT help This option adds support for automatic NUMA aware memory/task placement. The mechanism is quite primitive and is based on migrating memory when @@ -975,6 +975,7 @@ config CFS_BANDWIDTH config RT_GROUP_SCHED bool "Group scheduling for SCHED_RR/FIFO" depends on CGROUP_SCHED + depends on !PREEMPT_RT default n help This feature lets you explicitly allocate real CPU bandwidth @@ -1937,6 +1938,7 @@ choice config SLAB bool "SLAB" + depends on !PREEMPT_RT select HAVE_HARDENED_USERCOPY_ALLOCATOR help The regular slab allocator that is established and known to work @@ -1957,6 +1959,7 @@ config SLUB config SLOB depends on EXPERT bool "SLOB (Simple Allocator)" + depends on !PREEMPT_RT help SLOB replaces the stock allocator with a drastically simpler allocator. SLOB is generally more space efficient but @@ -2023,7 +2026,7 @@ config SHUFFLE_PAGE_ALLOCATOR config SLUB_CPU_PARTIAL default y - depends on SLUB && SMP + depends on SLUB && SMP && !PREEMPT_RT bool "SLUB per cpu partial cache" help Per cpu partial caches accelerate objects allocation and freeing diff --git a/kernel/Kconfig.locks b/kernel/Kconfig.locks index 3de8fd118..4198f0273 100644 --- a/kernel/Kconfig.locks +++ b/kernel/Kconfig.locks @@ -251,7 +251,7 @@ config ARCH_USE_QUEUED_RWLOCKS config QUEUED_RWLOCKS def_bool y if ARCH_USE_QUEUED_RWLOCKS - depends on SMP + depends on SMP && !PREEMPT_RT config ARCH_HAS_MMIOWB bool diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt index 416017301..90837a6cb 100644 --- a/kernel/Kconfig.preempt +++ b/kernel/Kconfig.preempt @@ -1,5 +1,11 @@ # SPDX-License-Identifier: GPL-2.0-only +config HAVE_PREEMPT_LAZY + bool + +config PREEMPT_LAZY + def_bool y if HAVE_PREEMPT_LAZY && PREEMPT_RT + choice prompt "Preemption Model" default PREEMPT_NONE @@ -60,6 +66,7 @@ config PREEMPT_RT bool "Fully Preemptible Kernel (Real-Time)" depends on EXPERT && ARCH_SUPPORTS_RT select PREEMPTION + select RT_MUTEXES help This option turns the kernel into a real-time kernel by replacing various locking primitives (spinlocks, rwlocks, etc.) with diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 01966adce..bab193bde 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -351,7 +351,7 @@ void cpuset_read_unlock(void) percpu_up_read(&cpuset_rwsem); } -static DEFINE_SPINLOCK(callback_lock); +static DEFINE_RAW_SPINLOCK(callback_lock); static struct workqueue_struct *cpuset_migrate_mm_wq; @@ -1290,7 +1290,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, * Newly added CPUs will be removed from effective_cpus and * newly deleted ones will be added back to effective_cpus. */ - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); if (adding) { cpumask_or(parent->subparts_cpus, parent->subparts_cpus, tmp->addmask); @@ -1312,7 +1312,7 @@ static int update_parent_subparts_cpumask(struct cpuset *cpuset, int cmd, if (cpuset->partition_root_state != new_prs) cpuset->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); return cmd == partcmd_update; } @@ -1415,7 +1415,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) continue; rcu_read_unlock(); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cpumask_copy(cp->effective_cpus, tmp->new_cpus); if (cp->nr_subparts_cpus && (new_prs != PRS_ENABLED)) { @@ -1449,7 +1449,7 @@ static void update_cpumasks_hier(struct cpuset *cs, struct tmpmasks *tmp) if (new_prs != cp->partition_root_state) cp->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && !cpumask_equal(cp->cpus_allowed, cp->effective_cpus)); @@ -1567,7 +1567,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, return -EINVAL; } - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, trialcs->cpus_allowed); /* @@ -1578,7 +1578,7 @@ static int update_cpumask(struct cpuset *cs, struct cpuset *trialcs, cs->cpus_allowed); cs->nr_subparts_cpus = cpumask_weight(cs->subparts_cpus); } - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); update_cpumasks_hier(cs, &tmp); @@ -1772,9 +1772,9 @@ static void update_nodemasks_hier(struct cpuset *cs, nodemask_t *new_mems) continue; rcu_read_unlock(); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cp->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); WARN_ON(!is_in_v2_mode() && !nodes_equal(cp->mems_allowed, cp->effective_mems)); @@ -1842,9 +1842,9 @@ static int update_nodemask(struct cpuset *cs, struct cpuset *trialcs, if (retval < 0) goto done; - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cs->mems_allowed = trialcs->mems_allowed; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); /* use trialcs->mems_allowed as a temp variable */ update_nodemasks_hier(cs, &trialcs->mems_allowed); @@ -1935,9 +1935,9 @@ static int update_flag(cpuset_flagbits_t bit, struct cpuset *cs, spread_flag_changed = ((is_spread_slab(cs) != is_spread_slab(trialcs)) || (is_spread_page(cs) != is_spread_page(trialcs))); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cs->flags = trialcs->flags; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); if (!cpumask_empty(trialcs->cpus_allowed) && balance_flag_changed) rebuild_sched_domains_locked(); @@ -2028,9 +2028,9 @@ static int update_prstate(struct cpuset *cs, int new_prs) rebuild_sched_domains_locked(); out: if (!err) { - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cs->partition_root_state = new_prs; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); } free_cpumasks(NULL, &tmpmask); @@ -2448,7 +2448,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) cpuset_filetype_t type = seq_cft(sf)->private; int ret = 0; - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); switch (type) { case FILE_CPULIST: @@ -2470,7 +2470,7 @@ static int cpuset_common_seq_show(struct seq_file *sf, void *v) ret = -EINVAL; } - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); return ret; } @@ -2783,14 +2783,14 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) cpuset_inc(); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(cs->effective_cpus, parent->effective_cpus); cs->effective_mems = parent->effective_mems; cs->use_parent_ecpus = true; parent->child_ecpus_count++; } - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); if (!test_bit(CGRP_CPUSET_CLONE_CHILDREN, &css->cgroup->flags)) goto out_unlock; @@ -2817,12 +2817,12 @@ static int cpuset_css_online(struct cgroup_subsys_state *css) } rcu_read_unlock(); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cs->mems_allowed = parent->mems_allowed; cs->effective_mems = parent->mems_allowed; cpumask_copy(cs->cpus_allowed, parent->cpus_allowed); cpumask_copy(cs->effective_cpus, parent->cpus_allowed); - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); out_unlock: percpu_up_write(&cpuset_rwsem); put_online_cpus(); @@ -2878,7 +2878,7 @@ static void cpuset_css_free(struct cgroup_subsys_state *css) static void cpuset_bind(struct cgroup_subsys_state *root_css) { percpu_down_write(&cpuset_rwsem); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); if (is_in_v2_mode()) { cpumask_copy(top_cpuset.cpus_allowed, cpu_possible_mask); @@ -2889,7 +2889,7 @@ static void cpuset_bind(struct cgroup_subsys_state *root_css) top_cpuset.mems_allowed = top_cpuset.effective_mems; } - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); percpu_up_write(&cpuset_rwsem); } @@ -2986,12 +2986,12 @@ hotplug_update_tasks_legacy(struct cpuset *cs, { bool is_empty; - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cpumask_copy(cs->cpus_allowed, new_cpus); cpumask_copy(cs->effective_cpus, new_cpus); cs->mems_allowed = *new_mems; cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); /* * Don't call update_tasks_cpumask() if the cpuset becomes empty, @@ -3028,10 +3028,10 @@ hotplug_update_tasks(struct cpuset *cs, if (nodes_empty(*new_mems)) *new_mems = parent_cs(cs)->effective_mems; - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cpumask_copy(cs->effective_cpus, new_cpus); cs->effective_mems = *new_mems; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); if (cpus_updated) update_tasks_cpumask(cs); @@ -3098,10 +3098,10 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) if (is_partition_root(cs) && (cpumask_empty(&new_cpus) || (parent->partition_root_state == PRS_ERROR))) { if (cs->nr_subparts_cpus) { - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cs->nr_subparts_cpus = 0; cpumask_clear(cs->subparts_cpus); - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); compute_effective_cpumask(&new_cpus, cs, parent); } @@ -3115,9 +3115,9 @@ static void cpuset_hotplug_update_tasks(struct cpuset *cs, struct tmpmasks *tmp) cpumask_empty(&new_cpus)) { update_parent_subparts_cpumask(cs, partcmd_disable, NULL, tmp); - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); cs->partition_root_state = PRS_ERROR; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); } cpuset_force_rebuild(); } @@ -3197,7 +3197,7 @@ static void cpuset_hotplug_workfn(struct work_struct *work) /* synchronize cpus_allowed to cpu_active_mask */ if (cpus_updated) { - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); if (!on_dfl) cpumask_copy(top_cpuset.cpus_allowed, &new_cpus); /* @@ -3217,17 +3217,17 @@ static void cpuset_hotplug_workfn(struct work_struct *work) } } cpumask_copy(top_cpuset.effective_cpus, &new_cpus); - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); /* we don't mess with cpumasks of tasks in top_cpuset */ } /* synchronize mems_allowed to N_MEMORY */ if (mems_updated) { - spin_lock_irq(&callback_lock); + raw_spin_lock_irq(&callback_lock); if (!on_dfl) top_cpuset.mems_allowed = new_mems; top_cpuset.effective_mems = new_mems; - spin_unlock_irq(&callback_lock); + raw_spin_unlock_irq(&callback_lock); update_tasks_nodemask(&top_cpuset); } @@ -3328,11 +3328,11 @@ void cpuset_cpus_allowed(struct task_struct *tsk, struct cpumask *pmask) { unsigned long flags; - spin_lock_irqsave(&callback_lock, flags); + raw_spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_cpus(task_cs(tsk), pmask); rcu_read_unlock(); - spin_unlock_irqrestore(&callback_lock, flags); + raw_spin_unlock_irqrestore(&callback_lock, flags); } /** @@ -3393,11 +3393,11 @@ nodemask_t cpuset_mems_allowed(struct task_struct *tsk) nodemask_t mask; unsigned long flags; - spin_lock_irqsave(&callback_lock, flags); + raw_spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); guarantee_online_mems(task_cs(tsk), &mask); rcu_read_unlock(); - spin_unlock_irqrestore(&callback_lock, flags); + raw_spin_unlock_irqrestore(&callback_lock, flags); return mask; } @@ -3489,14 +3489,14 @@ bool __cpuset_node_allowed(int node, gfp_t gfp_mask) return true; /* Not hardwall and node outside mems_allowed: scan up cpusets */ - spin_lock_irqsave(&callback_lock, flags); + raw_spin_lock_irqsave(&callback_lock, flags); rcu_read_lock(); cs = nearest_hardwall_ancestor(task_cs(current)); allowed = node_isset(node, cs->mems_allowed); rcu_read_unlock(); - spin_unlock_irqrestore(&callback_lock, flags); + raw_spin_unlock_irqrestore(&callback_lock, flags); return allowed; } diff --git a/kernel/cgroup/rstat.c b/kernel/cgroup/rstat.c index d2ae14d0b..7b3bea56d 100644 --- a/kernel/cgroup/rstat.c +++ b/kernel/cgroup/rstat.c @@ -156,8 +156,9 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) raw_spinlock_t *cpu_lock = per_cpu_ptr(&cgroup_rstat_cpu_lock, cpu); struct cgroup *pos = NULL; + unsigned long flags; - raw_spin_lock(cpu_lock); + raw_spin_lock_irqsave(cpu_lock, flags); while ((pos = cgroup_rstat_cpu_pop_updated(pos, cgrp, cpu))) { struct cgroup_subsys_state *css; @@ -169,7 +170,7 @@ static void cgroup_rstat_flush_locked(struct cgroup *cgrp, bool may_sleep) css->ss->css_rstat_flush(css, cpu); rcu_read_unlock(); } - raw_spin_unlock(cpu_lock); + raw_spin_unlock_irqrestore(cpu_lock, flags); /* if @may_sleep, play nice and yield if necessary */ if (may_sleep && (need_resched() || diff --git a/kernel/cpu.c b/kernel/cpu.c index c06ced18f..10b6287af 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -1662,7 +1662,7 @@ static struct cpuhp_step cpuhp_hp_states[] = { .name = "ap:online", }, /* - * Handled on controll processor until the plugged processor manages + * Handled on control processor until the plugged processor manages * this itself. */ [CPUHP_TEARDOWN_CPU] = { @@ -1671,6 +1671,13 @@ static struct cpuhp_step cpuhp_hp_states[] = { .teardown.single = takedown_cpu, .cant_stop = true, }, + + [CPUHP_AP_SCHED_WAIT_EMPTY] = { + .name = "sched:waitempty", + .startup.single = NULL, + .teardown.single = sched_cpu_wait_empty, + }, + /* Handle smpboot threads park/unpark */ [CPUHP_AP_SMPBOOT_THREADS] = { .name = "smpboot/threads:online", diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 930ac1b25..dbf1d126a 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const char **argv) int adjust = 0; int n = 0; int skip = 0; - struct kmsg_dumper dumper = { .active = 1 }; + struct kmsg_dumper_iter iter = { .active = 1 }; size_t len; char buf[201]; @@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - kmsg_dump_rewind_nolock(&dumper); - while (kmsg_dump_get_line_nolock(&dumper, 1, NULL, 0, NULL)) + kmsg_dump_rewind(&iter); + while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL)) n++; if (lines < 0) { @@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const char **argv) if (skip >= n || skip < 0) return 0; - kmsg_dump_rewind_nolock(&dumper); - while (kmsg_dump_get_line_nolock(&dumper, 1, buf, sizeof(buf), &len)) { + kmsg_dump_rewind(&iter); + while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) { if (skip) { skip--; continue; diff --git a/kernel/entry/common.c b/kernel/entry/common.c index 18a29ca01..f9ff81391 100644 --- a/kernel/entry/common.c +++ b/kernel/entry/common.c @@ -2,6 +2,7 @@ #include <linux/context_tracking.h> #include <linux/entry-common.h> +#include <linux/highmem.h> #include <linux/livepatch.h> #include <linux/audit.h> @@ -148,9 +149,17 @@ static unsigned long exit_to_user_mode_loop(struct pt_regs *regs, local_irq_enable_exit_to_user(ti_work); - if (ti_work & _TIF_NEED_RESCHED) + if (ti_work & _TIF_NEED_RESCHED_MASK) schedule(); +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (unlikely(current->forced_info.si_signo)) { + struct task_struct *t = current; + force_sig_info(&t->forced_info); + t->forced_info.si_signo = 0; + } +#endif + if (ti_work & _TIF_UPROBE) uprobe_notify_resume(regs); @@ -206,6 +215,7 @@ static void exit_to_user_mode_prepare(struct pt_regs *regs) /* Ensure that the address limit is intact and no locks are held */ addr_limit_user_check(); + kmap_assert_nomap(); lockdep_assert_irqs_disabled(); lockdep_sys_exit(); } @@ -365,7 +375,7 @@ void irqentry_exit_cond_resched(void) rcu_irq_exit_check_preempt(); if (IS_ENABLED(CONFIG_DEBUG_ENTRY)) WARN_ON_ONCE(!on_thread_stack()); - if (need_resched()) + if (should_resched(0)) preempt_schedule_irq(); } } diff --git a/kernel/exit.c b/kernel/exit.c index d13d67fc5..f5933bd07 100644 --- a/kernel/exit.c +++ b/kernel/exit.c @@ -152,7 +152,7 @@ static void __exit_signal(struct task_struct *tsk) * Do this under ->siglock, we can race with another thread * doing sigqueue_free() if we have SIGQUEUE_PREALLOC signals. */ - flush_sigqueue(&tsk->pending); + flush_task_sigqueue(tsk); tsk->sighand = NULL; spin_unlock(&sighand->siglock); diff --git a/kernel/fork.c b/kernel/fork.c index bf27ee90a..f264e23f6 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -42,6 +42,7 @@ #include <linux/mmu_notifier.h> #include <linux/fs.h> #include <linux/mm.h> +#include <linux/kprobes.h> #include <linux/vmacache.h> #include <linux/nsproxy.h> #include <linux/capability.h> @@ -290,7 +291,7 @@ static inline void free_thread_stack(struct task_struct *tsk) return; } - vfree_atomic(tsk->stack); + vfree(tsk->stack); return; } #endif @@ -691,6 +692,19 @@ void __mmdrop(struct mm_struct *mm) } EXPORT_SYMBOL_GPL(__mmdrop); +#ifdef CONFIG_PREEMPT_RT +/* + * RCU callback for delayed mm drop. Not strictly rcu, but we don't + * want another facility to make this work. + */ +void __mmdrop_delayed(struct rcu_head *rhp) +{ + struct mm_struct *mm = container_of(rhp, struct mm_struct, delayed_drop); + + __mmdrop(mm); +} +#endif + static void mmdrop_async_fn(struct work_struct *work) { struct mm_struct *mm; @@ -732,6 +746,15 @@ void __put_task_struct(struct task_struct *tsk) WARN_ON(refcount_read(&tsk->usage)); WARN_ON(tsk == current); + /* + * Remove function-return probe instances associated with this + * task and put them back on the free list. + */ + kprobe_flush_task(tsk); + + /* Task is done with its stack. */ + put_task_stack(tsk); + io_uring_free(tsk); cgroup_free(tsk); task_numa_free(tsk, true); @@ -929,10 +952,12 @@ static struct task_struct *dup_task_struct(struct task_struct *orig, int node) tsk->splice_pipe = NULL; tsk->task_frag.page = NULL; tsk->wake_q.next = NULL; + tsk->wake_q_sleeper.next = NULL; account_kernel_stack(tsk, 1); kcov_task_init(tsk); + kmap_local_fork(tsk); #ifdef CONFIG_FAULT_INJECTION tsk->fail_nth = 0; @@ -2028,6 +2053,7 @@ static __latent_entropy struct task_struct *copy_process( spin_lock_init(&p->alloc_lock); init_sigpending(&p->pending); + p->sigqueue_cache = NULL; p->utime = p->stime = p->gtime = 0; #ifdef CONFIG_ARCH_HAS_SCALED_CPUTIME diff --git a/kernel/futex.c b/kernel/futex.c index 98a6e1b80..b2b275bc1 100644 --- a/kernel/futex.c +++ b/kernel/futex.c @@ -1498,6 +1498,7 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ struct task_struct *new_owner; bool postunlock = false; DEFINE_WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_sleeper_q); int ret = 0; new_owner = rt_mutex_next_owner(&pi_state->pi_mutex); @@ -1547,14 +1548,15 @@ static int wake_futex_pi(u32 __user *uaddr, u32 uval, struct futex_pi_state *pi_ * not fail. */ pi_state_update_owner(pi_state, new_owner); - postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q); + postunlock = __rt_mutex_futex_unlock(&pi_state->pi_mutex, &wake_q, + &wake_sleeper_q); } out_unlock: raw_spin_unlock_irq(&pi_state->pi_mutex.wait_lock); if (postunlock) - rt_mutex_postunlock(&wake_q); + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); return ret; } @@ -2155,6 +2157,16 @@ static int futex_requeue(u32 __user *uaddr1, unsigned int flags, */ requeue_pi_wake_futex(this, &key2, hb2); continue; + } else if (ret == -EAGAIN) { + /* + * Waiter was woken by timeout or + * signal and has set pi_blocked_on to + * PI_WAKEUP_INPROGRESS before we + * tried to enqueue it on the rtmutex. + */ + this->pi_state = NULL; + put_pi_state(pi_state); + continue; } else if (ret) { /* * rt_mutex_start_proxy_lock() detected a @@ -2847,7 +2859,7 @@ static int futex_lock_pi(u32 __user *uaddr, unsigned int flags, goto no_block; } - rt_mutex_init_waiter(&rt_waiter); + rt_mutex_init_waiter(&rt_waiter, false); /* * On PREEMPT_RT_FULL, when hb->lock becomes an rt_mutex, we must not @@ -3172,7 +3184,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, { struct hrtimer_sleeper timeout, *to; struct rt_mutex_waiter rt_waiter; - struct futex_hash_bucket *hb; + struct futex_hash_bucket *hb, *hb2; union futex_key key2 = FUTEX_KEY_INIT; struct futex_q q = futex_q_init; int res, ret; @@ -3193,7 +3205,7 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * The waiter is allocated on our stack, manipulated by the requeue * code while we sleep on uaddr. */ - rt_mutex_init_waiter(&rt_waiter); + rt_mutex_init_waiter(&rt_waiter, false); ret = get_futex_key(uaddr2, flags & FLAGS_SHARED, &key2, FUTEX_WRITE); if (unlikely(ret != 0)) @@ -3224,20 +3236,55 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, /* Queue the futex_q, drop the hb lock, wait for wakeup. */ futex_wait_queue_me(hb, &q, to); - spin_lock(&hb->lock); - ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); - spin_unlock(&hb->lock); - if (ret) - goto out; + /* + * On RT we must avoid races with requeue and trying to block + * on two mutexes (hb->lock and uaddr2's rtmutex) by + * serializing access to pi_blocked_on with pi_lock. + */ + raw_spin_lock_irq(¤t->pi_lock); + if (current->pi_blocked_on) { + /* + * We have been requeued or are in the process of + * being requeued. + */ + raw_spin_unlock_irq(¤t->pi_lock); + } else { + /* + * Setting pi_blocked_on to PI_WAKEUP_INPROGRESS + * prevents a concurrent requeue from moving us to the + * uaddr2 rtmutex. After that we can safely acquire + * (and possibly block on) hb->lock. + */ + current->pi_blocked_on = PI_WAKEUP_INPROGRESS; + raw_spin_unlock_irq(¤t->pi_lock); + + spin_lock(&hb->lock); + + /* + * Clean up pi_blocked_on. We might leak it otherwise + * when we succeeded with the hb->lock in the fast + * path. + */ + raw_spin_lock_irq(¤t->pi_lock); + current->pi_blocked_on = NULL; + raw_spin_unlock_irq(¤t->pi_lock); + + ret = handle_early_requeue_pi_wakeup(hb, &q, &key2, to); + spin_unlock(&hb->lock); + if (ret) + goto out; + } /* - * In order for us to be here, we know our q.key == key2, and since - * we took the hb->lock above, we also know that futex_requeue() has - * completed and we no longer have to concern ourselves with a wakeup - * race with the atomic proxy lock acquisition by the requeue code. The - * futex_requeue dropped our key1 reference and incremented our key2 - * reference count. + * In order to be here, we have either been requeued, are in + * the process of being requeued, or requeue successfully + * acquired uaddr2 on our behalf. If pi_blocked_on was + * non-null above, we may be racing with a requeue. Do not + * rely on q->lock_ptr to be hb2->lock until after blocking on + * hb->lock or hb2->lock. The futex_requeue dropped our key1 + * reference and incremented our key2 reference count. */ + hb2 = hash_futex(&key2); /* Check if the requeue code acquired the second futex for us. */ if (!q.rt_waiter) { @@ -3246,14 +3293,15 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, * did a lock-steal - fix up the PI-state in that case. */ if (q.pi_state && (q.pi_state->owner != current)) { - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); ret = fixup_pi_state_owner(uaddr2, &q, current); /* * Drop the reference to the pi state which * the requeue_pi() code acquired for us. */ put_pi_state(q.pi_state); - spin_unlock(q.lock_ptr); + spin_unlock(&hb2->lock); /* * Adjust the return value. It's either -EFAULT or * success (1) but the caller expects 0 for success. @@ -3272,7 +3320,8 @@ static int futex_wait_requeue_pi(u32 __user *uaddr, unsigned int flags, pi_mutex = &q.pi_state->pi_mutex; ret = rt_mutex_wait_proxy_lock(pi_mutex, to, &rt_waiter); - spin_lock(q.lock_ptr); + spin_lock(&hb2->lock); + BUG_ON(&hb2->lock != q.lock_ptr); if (ret && !rt_mutex_cleanup_proxy_lock(pi_mutex, &rt_waiter)) ret = 0; diff --git a/kernel/irq/handle.c b/kernel/irq/handle.c index 762a928e1..7929fcdb7 100644 --- a/kernel/irq/handle.c +++ b/kernel/irq/handle.c @@ -192,10 +192,16 @@ irqreturn_t handle_irq_event_percpu(struct irq_desc *desc) { irqreturn_t retval; unsigned int flags = 0; + struct pt_regs *regs = get_irq_regs(); + u64 ip = regs ? instruction_pointer(regs) : 0; retval = __handle_irq_event_percpu(desc, &flags); - add_interrupt_randomness(desc->irq_data.irq, flags); +#ifdef CONFIG_PREEMPT_RT + desc->random_ip = ip; +#else + add_interrupt_randomness(desc->irq_data.irq, flags, ip); +#endif if (!noirqdebug) note_interrupt(desc, retval); diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c index d3033e1f9..4f7885934 100644 --- a/kernel/irq/manage.c +++ b/kernel/irq/manage.c @@ -1260,6 +1260,8 @@ static int irq_thread(void *data) irqreturn_t (*handler_fn)(struct irq_desc *desc, struct irqaction *action); + sched_set_fifo(current); + if (force_irqthreads && test_bit(IRQTF_FORCED_THREAD, &action->thread_flags)) handler_fn = irq_forced_thread_fn; @@ -1280,6 +1282,12 @@ static int irq_thread(void *data) if (action_ret == IRQ_WAKE_THREAD) irq_wake_secondary(desc, action); + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + migrate_disable(); + add_interrupt_randomness(action->irq, 0, + desc->random_ip ^ (unsigned long) action); + migrate_enable(); + } wake_threads_waitq(desc); } @@ -1425,8 +1433,6 @@ setup_irq_thread(struct irqaction *new, unsigned int irq, bool secondary) if (IS_ERR(t)) return PTR_ERR(t); - sched_set_fifo(t); - /* * We keep the reference to the task struct even if * the thread dies to avoid that the interrupt code @@ -2823,7 +2829,7 @@ EXPORT_SYMBOL_GPL(irq_get_irqchip_state); * This call sets the internal irqchip state of an interrupt, * depending on the value of @which. * - * This function should be called with preemption disabled if the + * This function should be called with migration disabled if the * interrupt controller has per-cpu registers. */ int irq_set_irqchip_state(unsigned int irq, enum irqchip_irq_state which, diff --git a/kernel/irq/spurious.c b/kernel/irq/spurious.c index f865e5f4d..dc7311dd7 100644 --- a/kernel/irq/spurious.c +++ b/kernel/irq/spurious.c @@ -443,6 +443,10 @@ MODULE_PARM_DESC(noirqdebug, "Disable irq lockup detection when true"); static int __init irqfixup_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + pr_warn("irqfixup boot option not supported w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 1; printk(KERN_WARNING "Misrouted IRQ fixup support enabled.\n"); printk(KERN_WARNING "This may impact system performance.\n"); @@ -455,6 +459,10 @@ module_param(irqfixup, int, 0644); static int __init irqpoll_setup(char *str) { +#ifdef CONFIG_PREEMPT_RT + pr_warn("irqpoll boot option not supported w/ CONFIG_PREEMPT_RT\n"); + return 1; +#endif irqfixup = 2; printk(KERN_WARNING "Misrouted IRQ fixup and polling support " "enabled\n"); diff --git a/kernel/irq_work.c b/kernel/irq_work.c index fbff25adb..711bd5e87 100644 --- a/kernel/irq_work.c +++ b/kernel/irq_work.c @@ -18,11 +18,37 @@ #include <linux/cpu.h> #include <linux/notifier.h> #include <linux/smp.h> +#include <linux/smpboot.h> +#include <linux/interrupt.h> #include <asm/processor.h> static DEFINE_PER_CPU(struct llist_head, raised_list); static DEFINE_PER_CPU(struct llist_head, lazy_list); +static DEFINE_PER_CPU(struct task_struct *, irq_workd); + +static void wake_irq_workd(void) +{ + struct task_struct *tsk = __this_cpu_read(irq_workd); + + if (!llist_empty(this_cpu_ptr(&lazy_list)) && tsk) + wake_up_process(tsk); +} + +#ifdef CONFIG_SMP +static void irq_work_wake(struct irq_work *entry) +{ + wake_irq_workd(); +} + +static DEFINE_PER_CPU(struct irq_work, irq_work_wakeup) = + IRQ_WORK_INIT_HARD(irq_work_wake); +#endif + +static int irq_workd_should_run(unsigned int cpu) +{ + return !llist_empty(this_cpu_ptr(&lazy_list)); +} /* * Claim the entry so that no one else will poke at it. @@ -52,15 +78,30 @@ void __weak arch_irq_work_raise(void) /* Enqueue on current CPU, work must already be claimed and preempt disabled */ static void __irq_work_queue_local(struct irq_work *work) { + struct llist_head *list; + bool rt_lazy_work = false; + bool lazy_work = false; + int work_flags; + + work_flags = atomic_read(&work->node.a_flags); + if (work_flags & IRQ_WORK_LAZY) + lazy_work = true; + else if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(work_flags & IRQ_WORK_HARD_IRQ)) + rt_lazy_work = true; + + if (lazy_work || rt_lazy_work) + list = this_cpu_ptr(&lazy_list); + else + list = this_cpu_ptr(&raised_list); + + if (!llist_add(&work->node.llist, list)) + return; + + /* If the work is "lazy", handle it from next tick if any */ - if (atomic_read(&work->node.a_flags) & IRQ_WORK_LAZY) { - if (llist_add(&work->node.llist, this_cpu_ptr(&lazy_list)) && - tick_nohz_tick_stopped()) - arch_irq_work_raise(); - } else { - if (llist_add(&work->node.llist, this_cpu_ptr(&raised_list))) - arch_irq_work_raise(); - } + if (!lazy_work || tick_nohz_tick_stopped()) + arch_irq_work_raise(); } /* Enqueue the irq work @work on the current CPU */ @@ -102,10 +143,28 @@ bool irq_work_queue_on(struct irq_work *work, int cpu) if (cpu != smp_processor_id()) { /* Arch remote IPI send/receive backend aren't NMI safe */ WARN_ON_ONCE(in_nmi()); + + /* + * On PREEMPT_RT the items which are not marked as + * IRQ_WORK_HARD_IRQ are added to the lazy list and a HARD work + * item is used on the remote CPU to wake the thread. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && + !(atomic_read(&work->node.a_flags) & IRQ_WORK_HARD_IRQ)) { + + if (!llist_add(&work->node.llist, &per_cpu(lazy_list, cpu))) + goto out; + + work = &per_cpu(irq_work_wakeup, cpu); + if (!irq_work_claim(work)) + goto out; + } + __smp_call_single_queue(cpu, &work->node.llist); } else { __irq_work_queue_local(work); } +out: preempt_enable(); return true; @@ -120,9 +179,8 @@ bool irq_work_needs_cpu(void) raised = this_cpu_ptr(&raised_list); lazy = this_cpu_ptr(&lazy_list); - if (llist_empty(raised) || arch_irq_work_has_interrupt()) - if (llist_empty(lazy)) - return false; + if (llist_empty(raised) && llist_empty(lazy)) + return false; /* All work should have been flushed before going offline */ WARN_ON_ONCE(cpu_is_offline(smp_processor_id())); @@ -153,6 +211,10 @@ void irq_work_single(void *arg) */ flags &= ~IRQ_WORK_PENDING; (void)atomic_cmpxchg(&work->node.a_flags, flags, flags & ~IRQ_WORK_BUSY); + + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) + rcuwait_wake_up(&work->irqwait); } static void irq_work_run_list(struct llist_head *list) @@ -160,7 +222,12 @@ static void irq_work_run_list(struct llist_head *list) struct irq_work *work, *tmp; struct llist_node *llnode; - BUG_ON(!irqs_disabled()); + /* + * On PREEMPT_RT IRQ-work which is not marked as HARD will be processed + * in a per-CPU thread in preemptible context. Only the items which are + * marked as IRQ_WORK_HARD_IRQ will be processed in hardirq context. + */ + BUG_ON(!irqs_disabled() && !IS_ENABLED(CONFIG_PREEMPT_RT)); if (llist_empty(list)) return; @@ -177,7 +244,10 @@ static void irq_work_run_list(struct llist_head *list) void irq_work_run(void) { irq_work_run_list(this_cpu_ptr(&raised_list)); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } EXPORT_SYMBOL_GPL(irq_work_run); @@ -187,7 +257,11 @@ void irq_work_tick(void) if (!llist_empty(raised) && !arch_irq_work_has_interrupt()) irq_work_run_list(raised); - irq_work_run_list(this_cpu_ptr(&lazy_list)); + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) + irq_work_run_list(this_cpu_ptr(&lazy_list)); + else + wake_irq_workd(); } /* @@ -197,8 +271,42 @@ void irq_work_tick(void) void irq_work_sync(struct irq_work *work) { lockdep_assert_irqs_enabled(); + might_sleep(); + + if ((IS_ENABLED(CONFIG_PREEMPT_RT) && !irq_work_is_hard(work)) || + !arch_irq_work_has_interrupt()) { + rcuwait_wait_event(&work->irqwait, !irq_work_is_busy(work), + TASK_UNINTERRUPTIBLE); + return; + } while (irq_work_is_busy(work)) cpu_relax(); } EXPORT_SYMBOL_GPL(irq_work_sync); + +static void run_irq_workd(unsigned int cpu) +{ + irq_work_run_list(this_cpu_ptr(&lazy_list)); +} + +static void irq_workd_setup(unsigned int cpu) +{ + sched_set_fifo_low(current); +} + +static struct smp_hotplug_thread irqwork_threads = { + .store = &irq_workd, + .setup = irq_workd_setup, + .thread_should_run = irq_workd_should_run, + .thread_fn = run_irq_workd, + .thread_comm = "irq_work/%u", +}; + +static __init int irq_work_init_threads(void) +{ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) + BUG_ON(smpboot_register_percpu_thread(&irqwork_threads)); + return 0; +} +early_initcall(irq_work_init_threads); \ No newline at end of file diff --git a/kernel/kexec_core.c b/kernel/kexec_core.c index b9a6f4658..c26219f34 100644 --- a/kernel/kexec_core.c +++ b/kernel/kexec_core.c @@ -984,7 +984,6 @@ void crash_kexec(struct pt_regs *regs) old_cpu = atomic_cmpxchg(&panic_cpu, PANIC_CPU_INVALID, this_cpu); if (old_cpu == PANIC_CPU_INVALID) { /* This is the 1st CPU which comes here, so go ahead. */ - printk_safe_flush_on_panic(); __crash_kexec(regs); /* diff --git a/kernel/ksysfs.c b/kernel/ksysfs.c index 35859da8b..dfff31ed6 100644 --- a/kernel/ksysfs.c +++ b/kernel/ksysfs.c @@ -138,6 +138,15 @@ KERNEL_ATTR_RO(vmcoreinfo); #endif /* CONFIG_CRASH_CORE */ +#if defined(CONFIG_PREEMPT_RT) +static ssize_t realtime_show(struct kobject *kobj, + struct kobj_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", 1); +} +KERNEL_ATTR_RO(realtime); +#endif + /* whether file capabilities are enabled */ static ssize_t fscaps_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -228,6 +237,9 @@ static struct attribute * kernel_attrs[] = { #ifndef CONFIG_TINY_RCU &rcu_expedited_attr.attr, &rcu_normal_attr.attr, +#endif +#ifdef CONFIG_PREEMPT_RT + &realtime_attr.attr, #endif NULL }; diff --git a/kernel/kthread.c b/kernel/kthread.c index 508fe5278..3ce6a31db 100644 --- a/kernel/kthread.c +++ b/kernel/kthread.c @@ -264,6 +264,7 @@ EXPORT_SYMBOL_GPL(kthread_parkme); static int kthread(void *_create) { + static const struct sched_param param = { .sched_priority = 0 }; /* Copy data: it's on kthread's stack */ struct kthread_create_info *create = _create; int (*threadfn)(void *data) = create->threadfn; @@ -294,6 +295,13 @@ static int kthread(void *_create) init_completion(&self->parked); current->vfork_done = &self->exited; + /* + * The new thread inherited kthreadd's priority and CPU mask. Reset + * back to default in case they have been changed. + */ + sched_setscheduler_nocheck(current, SCHED_NORMAL, ¶m); + set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_KTHREAD)); + /* OK, tell user we're spawned, wait for stop or wakeup */ __set_current_state(TASK_UNINTERRUPTIBLE); create->result = current; @@ -391,7 +399,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), } task = create->result; if (!IS_ERR(task)) { - static const struct sched_param param = { .sched_priority = 0 }; char name[TASK_COMM_LEN]; /* @@ -400,13 +407,6 @@ struct task_struct *__kthread_create_on_node(int (*threadfn)(void *data), */ vsnprintf(name, sizeof(name), namefmt, args); set_task_comm(task, name); - /* - * root may have changed our (kthreadd's) priority or CPU mask. - * The kernel thread should not inherit these properties. - */ - sched_setscheduler_nocheck(task, SCHED_NORMAL, ¶m); - set_cpus_allowed_ptr(task, - housekeeping_cpumask(HK_FLAG_KTHREAD)); } kfree(create); return task; diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 6d11cfb9b..c7fbf737e 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -3,7 +3,7 @@ # and is generally not a function of system call inputs. KCOV_INSTRUMENT := n -obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o +obj-y += semaphore.o rwsem.o percpu-rwsem.o # Avoid recursion lockdep -> KCSAN -> ... -> lockdep. KCSAN_SANITIZE_lockdep.o := n @@ -15,19 +15,23 @@ CFLAGS_REMOVE_mutex-debug.o = $(CC_FLAGS_FTRACE) CFLAGS_REMOVE_rtmutex-debug.o = $(CC_FLAGS_FTRACE) endif -obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o obj-$(CONFIG_LOCKDEP) += lockdep.o ifeq ($(CONFIG_PROC_FS),y) obj-$(CONFIG_LOCKDEP) += lockdep_proc.o endif obj-$(CONFIG_SMP) += spinlock.o -obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o obj-$(CONFIG_PROVE_LOCKING) += spinlock.o obj-$(CONFIG_QUEUED_SPINLOCKS) += qspinlock.o obj-$(CONFIG_RT_MUTEXES) += rtmutex.o obj-$(CONFIG_DEBUG_RT_MUTEXES) += rtmutex-debug.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o +ifneq ($(CONFIG_PREEMPT_RT),y) +obj-y += mutex.o +obj-$(CONFIG_LOCK_SPIN_ON_OWNER) += osq_lock.o +obj-$(CONFIG_DEBUG_MUTEXES) += mutex-debug.o +endif +obj-$(CONFIG_PREEMPT_RT) += mutex-rt.o rwsem-rt.o rwlock-rt.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o obj-$(CONFIG_WW_MUTEX_SELFTEST) += test-ww_mutex.o diff --git a/kernel/locking/lockdep.c b/kernel/locking/lockdep.c index 1f6a2f122..cfd0164f8 100644 --- a/kernel/locking/lockdep.c +++ b/kernel/locking/lockdep.c @@ -5410,6 +5410,7 @@ static noinstr void check_flags(unsigned long flags) } } +#ifndef CONFIG_PREEMPT_RT /* * We dont accurately track softirq state in e.g. * hardirq contexts (such as on 4KSTACKS), so only @@ -5424,6 +5425,7 @@ static noinstr void check_flags(unsigned long flags) DEBUG_LOCKS_WARN_ON(!current->softirqs_enabled); } } +#endif if (!debug_locks) print_irqtrace_events(current); diff --git a/kernel/locking/mutex-rt.c b/kernel/locking/mutex-rt.c new file mode 100644 index 000000000..2b849e6b9 --- /dev/null +++ b/kernel/locking/mutex-rt.c @@ -0,0 +1,224 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Real-Time Preemption Support + * + * started by Ingo Molnar: + * + * Copyright (C) 2004-2006 Red Hat, Inc., Ingo Molnar <mingo@redhat.com> + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * + * historic credit for proving that Linux spinlocks can be implemented via + * RT-aware mutexes goes to many people: The Pmutex project (Dirk Grambow + * and others) who prototyped it on 2.4 and did lots of comparative + * research and analysis; TimeSys, for proving that you can implement a + * fully preemptible kernel via the use of IRQ threading and mutexes; + * Bill Huey for persuasively arguing on lkml that the mutex model is the + * right one; and to MontaVista, who ported pmutexes to 2.6. + * + * This code is a from-scratch implementation and is not based on pmutexes, + * but the idea of converting spinlocks to mutexes is used here too. + * + * lock debugging, locking tree, deadlock detection: + * + * Copyright (C) 2004, LynuxWorks, Inc., Igor Manyilov, Bill Huey + * Released under the General Public License (GPL). + * + * Includes portions of the generic R/W semaphore implementation from: + * + * Copyright (c) 2001 David Howells (dhowells@redhat.com). + * - Derived partially from idea by Andrea Arcangeli <andrea@suse.de> + * - Derived also from comments by Linus + * + * Pending ownership of locks and ownership stealing: + * + * Copyright (C) 2005, Kihon Technologies Inc., Steven Rostedt + * + * (also by Steven Rostedt) + * - Converted single pi_lock to individual task locks. + * + * By Esben Nielsen: + * Doing priority inheritance with help of the scheduler. + * + * Copyright (C) 2006, Timesys Corp., Thomas Gleixner <tglx@timesys.com> + * - major rework based on Esben Nielsens initial patch + * - replaced thread_info references by task_struct refs + * - removed task->pending_owner dependency + * - BKL drop/reacquire for semaphore style locks to avoid deadlocks + * in the scheduler return path as discussed with Steven Rostedt + * + * Copyright (C) 2006, Kihon Technologies Inc. + * Steven Rostedt <rostedt@goodmis.org> + * - debugged and patched Thomas Gleixner's rework. + * - added back the cmpxchg to the rework. + * - turned atomic require back on for SMP. + */ + +#include <linux/spinlock.h> +#include <linux/rtmutex.h> +#include <linux/sched.h> +#include <linux/delay.h> +#include <linux/module.h> +#include <linux/kallsyms.h> +#include <linux/syscalls.h> +#include <linux/interrupt.h> +#include <linux/plist.h> +#include <linux/fs.h> +#include <linux/futex.h> +#include <linux/hrtimer.h> +#include <linux/blkdev.h> + +#include "rtmutex_common.h" + +/* + * struct mutex functions + */ +void __mutex_do_init(struct mutex *mutex, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)mutex, sizeof(*mutex)); + lockdep_init_map(&mutex->dep_map, name, key, 0); +#endif + mutex->lock.save_state = 0; +} +EXPORT_SYMBOL(__mutex_do_init); + +static int _mutex_lock_blk_flush(struct mutex *lock, int state) +{ + /* + * Flush blk before ->pi_blocked_on is set. At schedule() time it is too + * late if one of the callbacks needs to acquire a sleeping lock. + */ + if (blk_needs_flush_plug(current)) + blk_schedule_flush_plug(current); + return __rt_mutex_lock_state(&lock->lock, state); +} + +void __lockfunc _mutex_lock(struct mutex *lock) +{ + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(_mutex_lock); + +void __lockfunc _mutex_lock_io_nested(struct mutex *lock, int subclass) +{ + int token; + + token = io_schedule_prepare(); + + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + __rt_mutex_lock_state(&lock->lock, TASK_UNINTERRUPTIBLE); + + io_schedule_finish(token); +} +EXPORT_SYMBOL_GPL(_mutex_lock_io_nested); + +int __lockfunc _mutex_lock_interruptible(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible); + +int __lockfunc _mutex_lock_killable(struct mutex *lock) +{ + int ret; + + mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); + ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable); + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc _mutex_lock_nested(struct mutex *lock, int subclass) +{ + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(_mutex_lock_nested); + +void __lockfunc _mutex_lock_nest_lock(struct mutex *lock, struct lockdep_map *nest) +{ + mutex_acquire_nest(&lock->dep_map, 0, 0, nest, _RET_IP_); + _mutex_lock_blk_flush(lock, TASK_UNINTERRUPTIBLE); +} +EXPORT_SYMBOL(_mutex_lock_nest_lock); + +int __lockfunc _mutex_lock_interruptible_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire_nest(&lock->dep_map, subclass, 0, NULL, _RET_IP_); + ret = _mutex_lock_blk_flush(lock, TASK_INTERRUPTIBLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_interruptible_nested); + +int __lockfunc _mutex_lock_killable_nested(struct mutex *lock, int subclass) +{ + int ret; + + mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + ret = _mutex_lock_blk_flush(lock, TASK_KILLABLE); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} +EXPORT_SYMBOL(_mutex_lock_killable_nested); +#endif + +int __lockfunc _mutex_trylock(struct mutex *lock) +{ + int ret = __rt_mutex_trylock(&lock->lock); + + if (ret) + mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); + + return ret; +} +EXPORT_SYMBOL(_mutex_trylock); + +void __lockfunc _mutex_unlock(struct mutex *lock) +{ + mutex_release(&lock->dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->lock); +} +EXPORT_SYMBOL(_mutex_unlock); + +/** + * atomic_dec_and_mutex_lock - return holding mutex if we dec to 0 + * @cnt: the atomic which we are to dec + * @lock: the mutex to return holding if we dec to 0 + * + * return true and hold lock if we dec to 0, return false otherwise + */ +int atomic_dec_and_mutex_lock(atomic_t *cnt, struct mutex *lock) +{ + /* dec if we can't possibly hit 0 */ + if (atomic_add_unless(cnt, -1, 1)) + return 0; + /* we might hit 0, so take the lock */ + mutex_lock(lock); + if (!atomic_dec_and_test(cnt)) { + /* when we actually did the dec, we didn't hit 0 */ + mutex_unlock(lock); + return 0; + } + /* we hit 0, and we hold the lock */ + return 1; +} +EXPORT_SYMBOL(atomic_dec_and_mutex_lock); diff --git a/kernel/locking/rtmutex-debug.c b/kernel/locking/rtmutex-debug.c index 36e69100e..fb1501003 100644 --- a/kernel/locking/rtmutex-debug.c +++ b/kernel/locking/rtmutex-debug.c @@ -32,110 +32,12 @@ #include "rtmutex_common.h" -static void printk_task(struct task_struct *p) -{ - if (p) - printk("%16s:%5d [%p, %3d]", p->comm, task_pid_nr(p), p, p->prio); - else - printk("<none>"); -} - -static void printk_lock(struct rt_mutex *lock, int print_owner) -{ - if (lock->name) - printk(" [%p] {%s}\n", - lock, lock->name); - else - printk(" [%p] {%s:%d}\n", - lock, lock->file, lock->line); - - if (print_owner && rt_mutex_owner(lock)) { - printk(".. ->owner: %p\n", lock->owner); - printk(".. held by: "); - printk_task(rt_mutex_owner(lock)); - printk("\n"); - } -} - void rt_mutex_debug_task_free(struct task_struct *task) { DEBUG_LOCKS_WARN_ON(!RB_EMPTY_ROOT(&task->pi_waiters.rb_root)); DEBUG_LOCKS_WARN_ON(task->pi_blocked_on); } -/* - * We fill out the fields in the waiter to store the information about - * the deadlock. We print when we return. act_waiter can be NULL in - * case of a remove waiter operation. - */ -void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *act_waiter, - struct rt_mutex *lock) -{ - struct task_struct *task; - - if (!debug_locks || chwalk == RT_MUTEX_FULL_CHAINWALK || !act_waiter) - return; - - task = rt_mutex_owner(act_waiter->lock); - if (task && task != current) { - act_waiter->deadlock_task_pid = get_pid(task_pid(task)); - act_waiter->deadlock_lock = lock; - } -} - -void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter) -{ - struct task_struct *task; - - if (!waiter->deadlock_lock || !debug_locks) - return; - - rcu_read_lock(); - task = pid_task(waiter->deadlock_task_pid, PIDTYPE_PID); - if (!task) { - rcu_read_unlock(); - return; - } - - if (!debug_locks_off()) { - rcu_read_unlock(); - return; - } - - pr_warn("\n"); - pr_warn("============================================\n"); - pr_warn("WARNING: circular locking deadlock detected!\n"); - pr_warn("%s\n", print_tainted()); - pr_warn("--------------------------------------------\n"); - printk("%s/%d is deadlocking current task %s/%d\n\n", - task->comm, task_pid_nr(task), - current->comm, task_pid_nr(current)); - - printk("\n1) %s/%d is trying to acquire this lock:\n", - current->comm, task_pid_nr(current)); - printk_lock(waiter->lock, 1); - - printk("\n2) %s/%d is blocked on this lock:\n", - task->comm, task_pid_nr(task)); - printk_lock(waiter->deadlock_lock, 1); - - debug_show_held_locks(current); - debug_show_held_locks(task); - - printk("\n%s/%d's [blocked] stackdump:\n\n", - task->comm, task_pid_nr(task)); - show_stack(task, NULL, KERN_DEFAULT); - printk("\n%s/%d's [current] stackdump:\n\n", - current->comm, task_pid_nr(current)); - dump_stack(); - debug_show_all_locks(); - rcu_read_unlock(); - - printk("[ turning off deadlock detection." - "Please report this trace. ]\n\n"); -} - void debug_rt_mutex_lock(struct rt_mutex *lock) { } @@ -158,12 +60,10 @@ void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock) void debug_rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) { memset(waiter, 0x11, sizeof(*waiter)); - waiter->deadlock_task_pid = NULL; } void debug_rt_mutex_free_waiter(struct rt_mutex_waiter *waiter) { - put_pid(waiter->deadlock_task_pid); memset(waiter, 0x22, sizeof(*waiter)); } @@ -173,10 +73,8 @@ void debug_rt_mutex_init(struct rt_mutex *lock, const char *name, struct lock_cl * Make sure we are not reinitializing a held lock: */ debug_check_no_locks_freed((void *)lock, sizeof(*lock)); - lock->name = name; #ifdef CONFIG_DEBUG_LOCK_ALLOC lockdep_init_map(&lock->dep_map, name, key, 0); #endif } - diff --git a/kernel/locking/rtmutex-debug.h b/kernel/locking/rtmutex-debug.h index fc549713b..659e93e25 100644 --- a/kernel/locking/rtmutex-debug.h +++ b/kernel/locking/rtmutex-debug.h @@ -18,20 +18,9 @@ extern void debug_rt_mutex_unlock(struct rt_mutex *lock); extern void debug_rt_mutex_proxy_lock(struct rt_mutex *lock, struct task_struct *powner); extern void debug_rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void debug_rt_mutex_deadlock(enum rtmutex_chainwalk chwalk, - struct rt_mutex_waiter *waiter, - struct rt_mutex *lock); -extern void debug_rt_mutex_print_deadlock(struct rt_mutex_waiter *waiter); -# define debug_rt_mutex_reset_waiter(w) \ - do { (w)->deadlock_lock = NULL; } while (0) static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *waiter, enum rtmutex_chainwalk walk) { return (waiter != NULL); } - -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - debug_rt_mutex_print_deadlock(w); -} diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c index 2f8cd616d..4ea87d6c9 100644 --- a/kernel/locking/rtmutex.c +++ b/kernel/locking/rtmutex.c @@ -8,6 +8,11 @@ * Copyright (C) 2005-2006 Timesys Corp., Thomas Gleixner <tglx@timesys.com> * Copyright (C) 2005 Kihon Technologies Inc., Steven Rostedt * Copyright (C) 2006 Esben Nielsen + * Adaptive Spinlocks: + * Copyright (C) 2008 Novell, Inc., Gregory Haskins, Sven Dietrich, + * and Peter Morreale, + * Adaptive Spinlocks simplification: + * Copyright (C) 2008 Red Hat, Inc., Steven Rostedt <srostedt@redhat.com> * * See Documentation/locking/rt-mutex-design.rst for details. */ @@ -19,6 +24,7 @@ #include <linux/sched/wake_q.h> #include <linux/sched/debug.h> #include <linux/timer.h> +#include <linux/ww_mutex.h> #include "rtmutex_common.h" @@ -136,6 +142,12 @@ static void fixup_rt_mutex_waiters(struct rt_mutex *lock) WRITE_ONCE(*p, owner & ~RT_MUTEX_HAS_WAITERS); } +static int rt_mutex_real_waiter(struct rt_mutex_waiter *waiter) +{ + return waiter && waiter != PI_WAKEUP_INPROGRESS && + waiter != PI_REQUEUE_INPROGRESS; +} + /* * We can speed up the acquire/release, if there's no debugging state to be * set up. @@ -227,7 +239,7 @@ static inline bool unlock_rt_mutex_safe(struct rt_mutex *lock, * Only use with rt_mutex_waiter_{less,equal}() */ #define task_to_waiter(p) \ - &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline } + &(struct rt_mutex_waiter){ .prio = (p)->prio, .deadline = (p)->dl.deadline, .task = (p) } static inline int rt_mutex_waiter_less(struct rt_mutex_waiter *left, @@ -267,6 +279,27 @@ rt_mutex_waiter_equal(struct rt_mutex_waiter *left, return 1; } +#define STEAL_NORMAL 0 +#define STEAL_LATERAL 1 + +static inline int +rt_mutex_steal(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, int mode) +{ + struct rt_mutex_waiter *top_waiter = rt_mutex_top_waiter(lock); + + if (waiter == top_waiter || rt_mutex_waiter_less(waiter, top_waiter)) + return 1; + + /* + * Note that RT tasks are excluded from lateral-steals + * to prevent the introduction of an unbounded latency. + */ + if (mode == STEAL_NORMAL || rt_task(waiter->task)) + return 0; + + return rt_mutex_waiter_equal(waiter, top_waiter); +} + static void rt_mutex_enqueue(struct rt_mutex *lock, struct rt_mutex_waiter *waiter) { @@ -371,6 +404,14 @@ static bool rt_mutex_cond_detect_deadlock(struct rt_mutex_waiter *waiter, return debug_rt_mutex_detect_deadlock(waiter, chwalk); } +static void rt_mutex_wake_waiter(struct rt_mutex_waiter *waiter) +{ + if (waiter->savestate) + wake_up_lock_sleeper(waiter->task); + else + wake_up_process(waiter->task); +} + /* * Max number of times we'll walk the boosting chain: */ @@ -378,7 +419,8 @@ int max_lock_depth = 1024; static inline struct rt_mutex *task_blocked_on_lock(struct task_struct *p) { - return p->pi_blocked_on ? p->pi_blocked_on->lock : NULL; + return rt_mutex_real_waiter(p->pi_blocked_on) ? + p->pi_blocked_on->lock : NULL; } /* @@ -514,7 +556,7 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * reached or the state of the chain has changed while we * dropped the locks. */ - if (!waiter) + if (!rt_mutex_real_waiter(waiter)) goto out_unlock_pi; /* @@ -597,7 +639,6 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * walk, we detected a deadlock. */ if (lock == orig_lock || rt_mutex_owner(lock) == top_task) { - debug_rt_mutex_deadlock(chwalk, orig_waiter, lock); raw_spin_unlock(&lock->wait_lock); ret = -EDEADLK; goto out_unlock_pi; @@ -694,13 +735,16 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * follow here. This is the end of the chain we are walking. */ if (!rt_mutex_owner(lock)) { + struct rt_mutex_waiter *lock_top_waiter; + /* * If the requeue [7] above changed the top waiter, * then we need to wake the new top waiter up to try * to get the lock. */ - if (prerequeue_top_waiter != rt_mutex_top_waiter(lock)) - wake_up_process(rt_mutex_top_waiter(lock)->task); + lock_top_waiter = rt_mutex_top_waiter(lock); + if (prerequeue_top_waiter != lock_top_waiter) + rt_mutex_wake_waiter(lock_top_waiter); raw_spin_unlock_irq(&lock->wait_lock); return 0; } @@ -801,9 +845,11 @@ static int rt_mutex_adjust_prio_chain(struct task_struct *task, * @task: The task which wants to acquire the lock * @waiter: The waiter that is queued to the lock's wait tree if the * callsite called task_blocked_on_lock(), otherwise NULL + * @mode: Lock steal mode (STEAL_NORMAL, STEAL_LATERAL) */ -static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, - struct rt_mutex_waiter *waiter) +static int __try_to_take_rt_mutex(struct rt_mutex *lock, + struct task_struct *task, + struct rt_mutex_waiter *waiter, int mode) { lockdep_assert_held(&lock->wait_lock); @@ -839,12 +885,11 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (waiter) { /* - * If waiter is not the highest priority waiter of - * @lock, give up. + * If waiter is not the highest priority waiter of @lock, + * or its peer when lateral steal is allowed, give up. */ - if (waiter != rt_mutex_top_waiter(lock)) + if (!rt_mutex_steal(lock, waiter, mode)) return 0; - /* * We can acquire the lock. Remove the waiter from the * lock waiters tree. @@ -862,14 +907,12 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, */ if (rt_mutex_has_waiters(lock)) { /* - * If @task->prio is greater than or equal to - * the top waiter priority (kernel view), - * @task lost. + * If @task->prio is greater than the top waiter + * priority (kernel view), or equal to it when a + * lateral steal is forbidden, @task lost. */ - if (!rt_mutex_waiter_less(task_to_waiter(task), - rt_mutex_top_waiter(lock))) + if (!rt_mutex_steal(lock, task_to_waiter(task), mode)) return 0; - /* * The current top waiter stays enqueued. We * don't have to change anything in the lock @@ -916,6 +959,329 @@ static int try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, return 1; } +#ifdef CONFIG_PREEMPT_RT +/* + * preemptible spin_lock functions: + */ +static inline void rt_spin_lock_fastlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + might_sleep_no_state_check(); + + if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) + return; + else + slowfn(lock); +} + +static inline void rt_spin_lock_fastunlock(struct rt_mutex *lock, + void (*slowfn)(struct rt_mutex *lock)) +{ + if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) + return; + else + slowfn(lock); +} +#ifdef CONFIG_SMP +/* + * Note that owner is a speculative pointer and dereferencing relies + * on rcu_read_lock() and the check against the lock owner. + */ +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *owner) +{ + int res = 0; + + rcu_read_lock(); + for (;;) { + if (owner != rt_mutex_owner(lock)) + break; + /* + * Ensure that owner->on_cpu is dereferenced _after_ + * checking the above to be valid. + */ + barrier(); + if (!owner->on_cpu) { + res = 1; + break; + } + cpu_relax(); + } + rcu_read_unlock(); + return res; +} +#else +static int adaptive_wait(struct rt_mutex *lock, + struct task_struct *orig_owner) +{ + return 1; +} +#endif + +static int task_blocks_on_rt_mutex(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + struct task_struct *task, + enum rtmutex_chainwalk chwalk); +/* + * Slow path lock function spin_lock style: this variant is very + * careful not to miss any non-lock wakeups. + * + * We store the current state under p->pi_lock in p->saved_state and + * the try_to_wake_up() code handles this accordingly. + */ +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + unsigned long flags) +{ + struct task_struct *lock_owner, *self = current; + struct rt_mutex_waiter *top_waiter; + int ret; + + if (__try_to_take_rt_mutex(lock, self, NULL, STEAL_LATERAL)) + return; + + BUG_ON(rt_mutex_owner(lock) == self); + + /* + * We save whatever state the task is in and we'll restore it + * after acquiring the lock taking real wakeups into account + * as well. We are serialized via pi_lock against wakeups. See + * try_to_wake_up(). + */ + raw_spin_lock(&self->pi_lock); + self->saved_state = self->state; + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); + raw_spin_unlock(&self->pi_lock); + + ret = task_blocks_on_rt_mutex(lock, waiter, self, RT_MUTEX_MIN_CHAINWALK); + BUG_ON(ret); + + for (;;) { + /* Try to acquire the lock again. */ + if (__try_to_take_rt_mutex(lock, self, waiter, STEAL_LATERAL)) + break; + + top_waiter = rt_mutex_top_waiter(lock); + lock_owner = rt_mutex_owner(lock); + + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (top_waiter != waiter || adaptive_wait(lock, lock_owner)) + preempt_schedule_lock(); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + + raw_spin_lock(&self->pi_lock); + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); + raw_spin_unlock(&self->pi_lock); + } + + /* + * Restore the task state to current->saved_state. We set it + * to the original state above and the try_to_wake_up() code + * has possibly updated it when a real (non-rtmutex) wakeup + * happened while we were blocked. Clear saved_state so + * try_to_wakeup() does not get confused. + */ + raw_spin_lock(&self->pi_lock); + __set_current_state_no_track(self->saved_state); + self->saved_state = TASK_RUNNING; + raw_spin_unlock(&self->pi_lock); + + /* + * try_to_take_rt_mutex() sets the waiter bit + * unconditionally. We might have to fix that up: + */ + fixup_rt_mutex_waiters(lock); + + BUG_ON(rt_mutex_has_waiters(lock) && waiter == rt_mutex_top_waiter(lock)); + BUG_ON(!RB_EMPTY_NODE(&waiter->tree_entry)); +} + +static void noinline __sched rt_spin_lock_slowlock(struct rt_mutex *lock) +{ + struct rt_mutex_waiter waiter; + unsigned long flags; + + rt_mutex_init_waiter(&waiter, true); + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + rt_spin_lock_slowlock_locked(lock, &waiter, flags); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + debug_rt_mutex_free_waiter(&waiter); +} + +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, + struct wake_q_head *wake_q, + struct wake_q_head *wq_sleeper); +/* + * Slow path to release a rt_mutex spin_lock style + */ +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock) +{ + unsigned long flags; + DEFINE_WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_sleeper_q); + bool postunlock; + + raw_spin_lock_irqsave(&lock->wait_lock, flags); + postunlock = __rt_mutex_unlock_common(lock, &wake_q, &wake_sleeper_q); + raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + + if (postunlock) + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); +} + +void __lockfunc rt_spin_lock(spinlock_t *lock) +{ + spin_acquire(&lock->dep_map, 0, 0, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_spin_lock); + +void __lockfunc __rt_spin_lock(struct rt_mutex *lock) +{ + rt_spin_lock_fastlock(lock, rt_spin_lock_slowlock); +} + +#ifdef CONFIG_DEBUG_LOCK_ALLOC +void __lockfunc rt_spin_lock_nested(spinlock_t *lock, int subclass) +{ + spin_acquire(&lock->dep_map, subclass, 0, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_spin_lock_nested); + +void __lockfunc rt_spin_lock_nest_lock(spinlock_t *lock, + struct lockdep_map *nest_lock) +{ + spin_acquire_nest(&lock->dep_map, 0, 0, nest_lock, _RET_IP_); + rt_spin_lock_fastlock(&lock->lock, rt_spin_lock_slowlock); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_spin_lock_nest_lock); +#endif + +void __lockfunc rt_spin_unlock(spinlock_t *lock) +{ + /* NOTE: we always pass in '1' for nested, for simplicity */ + spin_release(&lock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + rt_spin_lock_fastunlock(&lock->lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(rt_spin_unlock); + +void __lockfunc __rt_spin_unlock(struct rt_mutex *lock) +{ + rt_spin_lock_fastunlock(lock, rt_spin_lock_slowunlock); +} +EXPORT_SYMBOL(__rt_spin_unlock); + +/* + * Wait for the lock to get unlocked: instead of polling for an unlock + * (like raw spinlocks do), we lock and unlock, to force the kernel to + * schedule if there's contention: + */ +void __lockfunc rt_spin_lock_unlock(spinlock_t *lock) +{ + spin_lock(lock); + spin_unlock(lock); +} +EXPORT_SYMBOL(rt_spin_lock_unlock); + +int __lockfunc rt_spin_trylock(spinlock_t *lock) +{ + int ret; + + ret = __rt_mutex_trylock(&lock->lock); + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock); + +int __lockfunc rt_spin_trylock_bh(spinlock_t *lock) +{ + int ret; + + local_bh_disable(); + ret = __rt_mutex_trylock(&lock->lock); + if (ret) { + spin_acquire(&lock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } else { + local_bh_enable(); + } + return ret; +} +EXPORT_SYMBOL(rt_spin_trylock_bh); + +void +__rt_spin_lock_init(spinlock_t *lock, const char *name, struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held lock: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif +} +EXPORT_SYMBOL(__rt_spin_lock_init); + +#endif /* PREEMPT_RT */ + +#ifdef CONFIG_PREEMPT_RT + static inline int __sched +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); + struct ww_acquire_ctx *hold_ctx = READ_ONCE(ww->ctx); + + if (!hold_ctx) + return 0; + + if (unlikely(ctx == hold_ctx)) + return -EALREADY; + + if (ctx->stamp - hold_ctx->stamp <= LONG_MAX && + (ctx->stamp != hold_ctx->stamp || ctx > hold_ctx)) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(ctx->contending_lock); + ctx->contending_lock = ww; +#endif + return -EDEADLK; + } + + return 0; +} +#else + static inline int __sched +__mutex_lock_check_stamp(struct rt_mutex *lock, struct ww_acquire_ctx *ctx) +{ + BUG(); + return 0; +} + +#endif + +static inline int +try_to_take_rt_mutex(struct rt_mutex *lock, struct task_struct *task, + struct rt_mutex_waiter *waiter) +{ + return __try_to_take_rt_mutex(lock, task, waiter, STEAL_NORMAL); +} + /* * Task blocks on lock. * @@ -948,6 +1314,22 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, return -EDEADLK; raw_spin_lock(&task->pi_lock); + /* + * In the case of futex requeue PI, this will be a proxy + * lock. The task will wake unaware that it is enqueueed on + * this lock. Avoid blocking on two locks and corrupting + * pi_blocked_on via the PI_WAKEUP_INPROGRESS + * flag. futex_wait_requeue_pi() sets this when it wakes up + * before requeue (due to a signal or timeout). Do not enqueue + * the task if PI_WAKEUP_INPROGRESS is set. + */ + if (task != current && task->pi_blocked_on == PI_WAKEUP_INPROGRESS) { + raw_spin_unlock(&task->pi_lock); + return -EAGAIN; + } + + BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)); + waiter->task = task; waiter->lock = lock; waiter->prio = task->prio; @@ -971,7 +1353,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, rt_mutex_enqueue_pi(owner, waiter); rt_mutex_adjust_prio(owner); - if (owner->pi_blocked_on) + if (rt_mutex_real_waiter(owner->pi_blocked_on)) chain_walk = 1; } else if (rt_mutex_cond_detect_deadlock(waiter, chwalk)) { chain_walk = 1; @@ -1013,6 +1395,7 @@ static int task_blocks_on_rt_mutex(struct rt_mutex *lock, * Called with lock->wait_lock held and interrupts disabled. */ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q, struct rt_mutex *lock) { struct rt_mutex_waiter *waiter; @@ -1052,7 +1435,10 @@ static void mark_wakeup_next_waiter(struct wake_q_head *wake_q, * Pairs with preempt_enable() in rt_mutex_postunlock(); */ preempt_disable(); - wake_q_add(wake_q, waiter->task); + if (waiter->savestate) + wake_q_add_sleeper(wake_sleeper_q, waiter->task); + else + wake_q_add(wake_q, waiter->task); raw_spin_unlock(¤t->pi_lock); } @@ -1067,7 +1453,7 @@ static void remove_waiter(struct rt_mutex *lock, { bool is_top_waiter = (waiter == rt_mutex_top_waiter(lock)); struct task_struct *owner = rt_mutex_owner(lock); - struct rt_mutex *next_lock; + struct rt_mutex *next_lock = NULL; lockdep_assert_held(&lock->wait_lock); @@ -1093,7 +1479,8 @@ static void remove_waiter(struct rt_mutex *lock, rt_mutex_adjust_prio(owner); /* Store the lock on which owner is blocked or NULL */ - next_lock = task_blocked_on_lock(owner); + if (rt_mutex_real_waiter(owner->pi_blocked_on)) + next_lock = task_blocked_on_lock(owner); raw_spin_unlock(&owner->pi_lock); @@ -1129,26 +1516,28 @@ void rt_mutex_adjust_pi(struct task_struct *task) raw_spin_lock_irqsave(&task->pi_lock, flags); waiter = task->pi_blocked_on; - if (!waiter || rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { + if (!rt_mutex_real_waiter(waiter) || + rt_mutex_waiter_equal(waiter, task_to_waiter(task))) { raw_spin_unlock_irqrestore(&task->pi_lock, flags); return; } next_lock = waiter->lock; - raw_spin_unlock_irqrestore(&task->pi_lock, flags); /* gets dropped in rt_mutex_adjust_prio_chain()! */ get_task_struct(task); + raw_spin_unlock_irqrestore(&task->pi_lock, flags); rt_mutex_adjust_prio_chain(task, RT_MUTEX_MIN_CHAINWALK, NULL, next_lock, NULL, task); } -void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) +void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savestate) { debug_rt_mutex_init_waiter(waiter); RB_CLEAR_NODE(&waiter->pi_tree_entry); RB_CLEAR_NODE(&waiter->tree_entry); waiter->task = NULL; + waiter->savestate = savestate; } /** @@ -1164,7 +1553,8 @@ void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter) static int __sched __rt_mutex_slowlock(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - struct rt_mutex_waiter *waiter) + struct rt_mutex_waiter *waiter, + struct ww_acquire_ctx *ww_ctx) { int ret = 0; @@ -1173,24 +1563,23 @@ __rt_mutex_slowlock(struct rt_mutex *lock, int state, if (try_to_take_rt_mutex(lock, current, waiter)) break; - /* - * TASK_INTERRUPTIBLE checks for signals and - * timeout. Ignored otherwise. - */ - if (likely(state == TASK_INTERRUPTIBLE)) { - /* Signal pending? */ - if (signal_pending(current)) - ret = -EINTR; - if (timeout && !timeout->task) - ret = -ETIMEDOUT; + if (timeout && !timeout->task) { + ret = -ETIMEDOUT; + break; + } + if (signal_pending_state(state, current)) { + ret = -EINTR; + break; + } + + if (ww_ctx && ww_ctx->acquired > 0) { + ret = __mutex_lock_check_stamp(lock, ww_ctx); if (ret) break; } raw_spin_unlock_irq(&lock->wait_lock); - debug_rt_mutex_print_deadlock(waiter); - schedule(); raw_spin_lock_irq(&lock->wait_lock); @@ -1211,43 +1600,110 @@ static void rt_mutex_handle_deadlock(int res, int detect_deadlock, if (res != -EDEADLOCK || detect_deadlock) return; - /* - * Yell lowdly and stop the task right here. - */ - rt_mutex_print_deadlock(w); while (1) { set_current_state(TASK_INTERRUPTIBLE); schedule(); } } -/* - * Slow path lock function: - */ -static int __sched -rt_mutex_slowlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk) +static __always_inline void ww_mutex_lock_acquired(struct ww_mutex *ww, + struct ww_acquire_ctx *ww_ctx) { - struct rt_mutex_waiter waiter; - unsigned long flags; - int ret = 0; +#ifdef CONFIG_DEBUG_MUTEXES + /* + * If this WARN_ON triggers, you used ww_mutex_lock to acquire, + * but released with a normal mutex_unlock in this call. + * + * This should never happen, always use ww_mutex_unlock. + */ + DEBUG_LOCKS_WARN_ON(ww->ctx); + + /* + * Not quite done after calling ww_acquire_done() ? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->done_acquire); - rt_mutex_init_waiter(&waiter); + if (ww_ctx->contending_lock) { + /* + * After -EDEADLK you tried to + * acquire a different ww_mutex? Bad! + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->contending_lock != ww); + + /* + * You called ww_mutex_lock after receiving -EDEADLK, + * but 'forgot' to unlock everything else first? + */ + DEBUG_LOCKS_WARN_ON(ww_ctx->acquired > 0); + ww_ctx->contending_lock = NULL; + } /* - * Technically we could use raw_spin_[un]lock_irq() here, but this can - * be called in early boot if the cmpxchg() fast path is disabled - * (debug, no architecture support). In this case we will acquire the - * rtmutex with lock->wait_lock held. But we cannot unconditionally - * enable interrupts in that early boot case. So we need to use the - * irqsave/restore variants. + * Naughty, using a different class will lead to undefined behavior! */ - raw_spin_lock_irqsave(&lock->wait_lock, flags); + DEBUG_LOCKS_WARN_ON(ww_ctx->ww_class != ww->ww_class); +#endif + ww_ctx->acquired++; +} + +#ifdef CONFIG_PREEMPT_RT +static void ww_mutex_account_lock(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + struct ww_mutex *ww = container_of(lock, struct ww_mutex, base.lock); + struct rt_mutex_waiter *waiter, *n; + + /* + * This branch gets optimized out for the common case, + * and is only important for ww_mutex_lock. + */ + ww_mutex_lock_acquired(ww, ww_ctx); + ww->ctx = ww_ctx; + + /* + * Give any possible sleeping processes the chance to wake up, + * so they can recheck if they have to back off. + */ + rbtree_postorder_for_each_entry_safe(waiter, n, &lock->waiters.rb_root, + tree_entry) { + /* XXX debug rt mutex waiter wakeup */ + + BUG_ON(waiter->lock != lock); + rt_mutex_wake_waiter(waiter); + } +} + +#else + +static void ww_mutex_account_lock(struct rt_mutex *lock, + struct ww_acquire_ctx *ww_ctx) +{ + BUG(); +} +#endif + +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx, + struct rt_mutex_waiter *waiter) +{ + int ret; + +#ifdef CONFIG_PREEMPT_RT + if (ww_ctx) { + struct ww_mutex *ww; + + ww = container_of(lock, struct ww_mutex, base.lock); + if (unlikely(ww_ctx == READ_ONCE(ww->ctx))) + return -EALREADY; + } +#endif /* Try to acquire the lock again: */ if (try_to_take_rt_mutex(lock, current, NULL)) { - raw_spin_unlock_irqrestore(&lock->wait_lock, flags); + if (ww_ctx) + ww_mutex_account_lock(lock, ww_ctx); return 0; } @@ -1257,16 +1713,26 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, if (unlikely(timeout)) hrtimer_start_expires(&timeout->timer, HRTIMER_MODE_ABS); - ret = task_blocks_on_rt_mutex(lock, &waiter, current, chwalk); + ret = task_blocks_on_rt_mutex(lock, waiter, current, chwalk); - if (likely(!ret)) + if (likely(!ret)) { /* sleep on the mutex */ - ret = __rt_mutex_slowlock(lock, state, timeout, &waiter); + ret = __rt_mutex_slowlock(lock, state, timeout, waiter, + ww_ctx); + } else if (ww_ctx) { + /* ww_mutex received EDEADLK, let it become EALREADY */ + ret = __mutex_lock_check_stamp(lock, ww_ctx); + BUG_ON(!ret); + } if (unlikely(ret)) { __set_current_state(TASK_RUNNING); - remove_waiter(lock, &waiter); - rt_mutex_handle_deadlock(ret, chwalk, &waiter); + remove_waiter(lock, waiter); + /* ww_mutex wants to report EDEADLK/EALREADY, let it */ + if (!ww_ctx) + rt_mutex_handle_deadlock(ret, chwalk, waiter); + } else if (ww_ctx) { + ww_mutex_account_lock(lock, ww_ctx); } /* @@ -1274,6 +1740,36 @@ rt_mutex_slowlock(struct rt_mutex *lock, int state, * unconditionally. We might have to fix that up. */ fixup_rt_mutex_waiters(lock); + return ret; +} + +/* + * Slow path lock function: + */ +static int __sched +rt_mutex_slowlock(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx) +{ + struct rt_mutex_waiter waiter; + unsigned long flags; + int ret = 0; + + rt_mutex_init_waiter(&waiter, false); + + /* + * Technically we could use raw_spin_[un]lock_irq() here, but this can + * be called in early boot if the cmpxchg() fast path is disabled + * (debug, no architecture support). In this case we will acquire the + * rtmutex with lock->wait_lock held. But we cannot unconditionally + * enable interrupts in that early boot case. So we need to use the + * irqsave/restore variants. + */ + raw_spin_lock_irqsave(&lock->wait_lock, flags); + + ret = rt_mutex_slowlock_locked(lock, state, timeout, chwalk, ww_ctx, + &waiter); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); @@ -1334,7 +1830,8 @@ static inline int rt_mutex_slowtrylock(struct rt_mutex *lock) * Return whether the current task needs to call rt_mutex_postunlock(). */ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) + struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q) { unsigned long flags; @@ -1388,7 +1885,7 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, * * Queue the next waiter for wakeup once we release the wait_lock. */ - mark_wakeup_next_waiter(wake_q, lock); + mark_wakeup_next_waiter(wake_q, wake_sleeper_q, lock); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); return true; /* call rt_mutex_postunlock() */ @@ -1402,29 +1899,16 @@ static bool __sched rt_mutex_slowunlock(struct rt_mutex *lock, */ static inline int rt_mutex_fastlock(struct rt_mutex *lock, int state, + struct ww_acquire_ctx *ww_ctx, int (*slowfn)(struct rt_mutex *lock, int state, struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx)) { if (likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) return 0; - return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK); -} - -static inline int -rt_mutex_timed_fastlock(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk, - int (*slowfn)(struct rt_mutex *lock, int state, - struct hrtimer_sleeper *timeout, - enum rtmutex_chainwalk chwalk)) -{ - if (chwalk == RT_MUTEX_MIN_CHAINWALK && - likely(rt_mutex_cmpxchg_acquire(lock, NULL, current))) - return 0; - - return slowfn(lock, state, timeout, chwalk); + return slowfn(lock, state, NULL, RT_MUTEX_MIN_CHAINWALK, ww_ctx); } static inline int @@ -1440,9 +1924,11 @@ rt_mutex_fasttrylock(struct rt_mutex *lock, /* * Performs the wakeup of the the top-waiter and re-enables preemption. */ -void rt_mutex_postunlock(struct wake_q_head *wake_q) +void rt_mutex_postunlock(struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q) { wake_up_q(wake_q); + wake_up_q_sleeper(wake_sleeper_q); /* Pairs with preempt_disable() in rt_mutex_slowunlock() */ preempt_enable(); @@ -1451,23 +1937,46 @@ void rt_mutex_postunlock(struct wake_q_head *wake_q) static inline void rt_mutex_fastunlock(struct rt_mutex *lock, bool (*slowfn)(struct rt_mutex *lock, - struct wake_q_head *wqh)) + struct wake_q_head *wqh, + struct wake_q_head *wq_sleeper)) { DEFINE_WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_sleeper_q); if (likely(rt_mutex_cmpxchg_release(lock, current, NULL))) return; - if (slowfn(lock, &wake_q)) - rt_mutex_postunlock(&wake_q); + if (slowfn(lock, &wake_q, &wake_sleeper_q)) + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); } -static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +int __sched __rt_mutex_lock_state(struct rt_mutex *lock, int state) { might_sleep(); + return rt_mutex_fastlock(lock, state, NULL, rt_mutex_slowlock); +} + +/** + * rt_mutex_lock_state - lock a rt_mutex with a given state + * + * @lock: The rt_mutex to be locked + * @state: The state to set when blocking on the rt_mutex + */ +static inline int __sched rt_mutex_lock_state(struct rt_mutex *lock, + unsigned int subclass, int state) +{ + int ret; mutex_acquire(&lock->dep_map, subclass, 0, _RET_IP_); - rt_mutex_fastlock(lock, TASK_UNINTERRUPTIBLE, rt_mutex_slowlock); + ret = __rt_mutex_lock_state(lock, state); + if (ret) + mutex_release(&lock->dep_map, _RET_IP_); + return ret; +} + +static inline void __rt_mutex_lock(struct rt_mutex *lock, unsigned int subclass) +{ + rt_mutex_lock_state(lock, subclass, TASK_UNINTERRUPTIBLE); } #ifdef CONFIG_DEBUG_LOCK_ALLOC @@ -1508,16 +2017,7 @@ EXPORT_SYMBOL_GPL(rt_mutex_lock); */ int __sched rt_mutex_lock_interruptible(struct rt_mutex *lock) { - int ret; - - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_fastlock(lock, TASK_INTERRUPTIBLE, rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); - - return ret; + return rt_mutex_lock_state(lock, 0, TASK_INTERRUPTIBLE); } EXPORT_SYMBOL_GPL(rt_mutex_lock_interruptible); @@ -1534,36 +2034,17 @@ int __sched __rt_mutex_futex_trylock(struct rt_mutex *lock) return __rt_mutex_slowtrylock(lock); } -/** - * rt_mutex_timed_lock - lock a rt_mutex interruptible - * the timeout structure is provided - * by the caller - * - * @lock: the rt_mutex to be locked - * @timeout: timeout structure or NULL (no timeout) - * - * Returns: - * 0 on success - * -EINTR when interrupted by a signal - * -ETIMEDOUT when the timeout expired - */ -int -rt_mutex_timed_lock(struct rt_mutex *lock, struct hrtimer_sleeper *timeout) +int __sched __rt_mutex_trylock(struct rt_mutex *lock) { - int ret; - - might_sleep(); - - mutex_acquire(&lock->dep_map, 0, 0, _RET_IP_); - ret = rt_mutex_timed_fastlock(lock, TASK_INTERRUPTIBLE, timeout, - RT_MUTEX_MIN_CHAINWALK, - rt_mutex_slowlock); - if (ret) - mutex_release(&lock->dep_map, _RET_IP_); +#ifdef CONFIG_PREEMPT_RT + if (WARN_ON_ONCE(in_irq() || in_nmi())) +#else + if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) +#endif + return 0; - return ret; + return rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); } -EXPORT_SYMBOL_GPL(rt_mutex_timed_lock); /** * rt_mutex_trylock - try to lock a rt_mutex @@ -1580,10 +2061,7 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) { int ret; - if (WARN_ON_ONCE(in_irq() || in_nmi() || in_serving_softirq())) - return 0; - - ret = rt_mutex_fasttrylock(lock, rt_mutex_slowtrylock); + ret = __rt_mutex_trylock(lock); if (ret) mutex_acquire(&lock->dep_map, 0, 1, _RET_IP_); @@ -1591,6 +2069,11 @@ int __sched rt_mutex_trylock(struct rt_mutex *lock) } EXPORT_SYMBOL_GPL(rt_mutex_trylock); +void __sched __rt_mutex_unlock(struct rt_mutex *lock) +{ + rt_mutex_fastunlock(lock, rt_mutex_slowunlock); +} + /** * rt_mutex_unlock - unlock a rt_mutex * @@ -1599,16 +2082,13 @@ EXPORT_SYMBOL_GPL(rt_mutex_trylock); void __sched rt_mutex_unlock(struct rt_mutex *lock) { mutex_release(&lock->dep_map, _RET_IP_); - rt_mutex_fastunlock(lock, rt_mutex_slowunlock); + __rt_mutex_unlock(lock); } EXPORT_SYMBOL_GPL(rt_mutex_unlock); -/** - * Futex variant, that since futex variants do not use the fast-path, can be - * simple and will not need to retry. - */ -bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wake_q) +static bool __sched __rt_mutex_unlock_common(struct rt_mutex *lock, + struct wake_q_head *wake_q, + struct wake_q_head *wq_sleeper) { lockdep_assert_held(&lock->wait_lock); @@ -1625,23 +2105,35 @@ bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, * avoid inversion prior to the wakeup. preempt_disable() * therein pairs with rt_mutex_postunlock(). */ - mark_wakeup_next_waiter(wake_q, lock); + mark_wakeup_next_waiter(wake_q, wq_sleeper, lock); return true; /* call postunlock() */ } +/** + * Futex variant, that since futex variants do not use the fast-path, can be + * simple and will not need to retry. + */ +bool __sched __rt_mutex_futex_unlock(struct rt_mutex *lock, + struct wake_q_head *wake_q, + struct wake_q_head *wq_sleeper) +{ + return __rt_mutex_unlock_common(lock, wake_q, wq_sleeper); +} + void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) { DEFINE_WAKE_Q(wake_q); + DEFINE_WAKE_Q(wake_sleeper_q); unsigned long flags; bool postunlock; raw_spin_lock_irqsave(&lock->wait_lock, flags); - postunlock = __rt_mutex_futex_unlock(lock, &wake_q); + postunlock = __rt_mutex_futex_unlock(lock, &wake_q, &wake_sleeper_q); raw_spin_unlock_irqrestore(&lock->wait_lock, flags); if (postunlock) - rt_mutex_postunlock(&wake_q); + rt_mutex_postunlock(&wake_q, &wake_sleeper_q); } /** @@ -1655,9 +2147,6 @@ void __sched rt_mutex_futex_unlock(struct rt_mutex *lock) void rt_mutex_destroy(struct rt_mutex *lock) { WARN_ON(rt_mutex_is_locked(lock)); -#ifdef CONFIG_DEBUG_RT_MUTEXES - lock->magic = NULL; -#endif } EXPORT_SYMBOL_GPL(rt_mutex_destroy); @@ -1680,7 +2169,7 @@ void __rt_mutex_init(struct rt_mutex *lock, const char *name, if (name && key) debug_rt_mutex_init(lock, name, key); } -EXPORT_SYMBOL_GPL(__rt_mutex_init); +EXPORT_SYMBOL(__rt_mutex_init); /** * rt_mutex_init_proxy_locked - initialize and lock a rt_mutex on behalf of a @@ -1700,6 +2189,14 @@ void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner) { __rt_mutex_init(lock, NULL, NULL); +#ifdef CONFIG_DEBUG_SPINLOCK + /* + * get another key class for the wait_lock. LOCK_PI and UNLOCK_PI is + * holding the ->wait_lock of the proxy_lock while unlocking a sleeping + * lock. + */ + raw_spin_lock_init(&lock->wait_lock); +#endif debug_rt_mutex_proxy_lock(lock, proxy_owner); rt_mutex_set_owner(lock, proxy_owner); } @@ -1722,6 +2219,26 @@ void rt_mutex_proxy_unlock(struct rt_mutex *lock) rt_mutex_set_owner(lock, NULL); } +static void fixup_rt_mutex_blocked(struct rt_mutex *lock) +{ + struct task_struct *tsk = current; + /* + * RT has a problem here when the wait got interrupted by a timeout + * or a signal. task->pi_blocked_on is still set. The task must + * acquire the hash bucket lock when returning from this function. + * + * If the hash bucket lock is contended then the + * BUG_ON(rt_mutex_real_waiter(task->pi_blocked_on)) in + * task_blocks_on_rt_mutex() will trigger. This can be avoided by + * clearing task->pi_blocked_on which removes the task from the + * boosting chain of the rtmutex. That's correct because the task + * is not longer blocked on it. + */ + raw_spin_lock(&tsk->pi_lock); + tsk->pi_blocked_on = NULL; + raw_spin_unlock(&tsk->pi_lock); +} + /** * __rt_mutex_start_proxy_lock() - Start lock acquisition for another task * @lock: the rt_mutex to take @@ -1752,6 +2269,34 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, if (try_to_take_rt_mutex(lock, task, NULL)) return 1; +#ifdef CONFIG_PREEMPT_RT + /* + * In PREEMPT_RT there's an added race. + * If the task, that we are about to requeue, times out, + * it can set the PI_WAKEUP_INPROGRESS. This tells the requeue + * to skip this task. But right after the task sets + * its pi_blocked_on to PI_WAKEUP_INPROGRESS it can then + * block on the spin_lock(&hb->lock), which in RT is an rtmutex. + * This will replace the PI_WAKEUP_INPROGRESS with the actual + * lock that it blocks on. We *must not* place this task + * on this proxy lock in that case. + * + * To prevent this race, we first take the task's pi_lock + * and check if it has updated its pi_blocked_on. If it has, + * we assume that it woke up and we return -EAGAIN. + * Otherwise, we set the task's pi_blocked_on to + * PI_REQUEUE_INPROGRESS, so that if the task is waking up + * it will know that we are in the process of requeuing it. + */ + raw_spin_lock(&task->pi_lock); + if (task->pi_blocked_on) { + raw_spin_unlock(&task->pi_lock); + return -EAGAIN; + } + task->pi_blocked_on = PI_REQUEUE_INPROGRESS; + raw_spin_unlock(&task->pi_lock); +#endif + /* We enforce deadlock detection for futexes */ ret = task_blocks_on_rt_mutex(lock, waiter, task, RT_MUTEX_FULL_CHAINWALK); @@ -1766,7 +2311,8 @@ int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, ret = 0; } - debug_rt_mutex_print_deadlock(waiter); + if (ret) + fixup_rt_mutex_blocked(lock); return ret; } @@ -1851,12 +2397,15 @@ int rt_mutex_wait_proxy_lock(struct rt_mutex *lock, raw_spin_lock_irq(&lock->wait_lock); /* sleep on the mutex */ set_current_state(TASK_INTERRUPTIBLE); - ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter); + ret = __rt_mutex_slowlock(lock, TASK_INTERRUPTIBLE, to, waiter, NULL); /* * try_to_take_rt_mutex() sets the waiter bit unconditionally. We might * have to fix that up. */ fixup_rt_mutex_waiters(lock); + if (ret) + fixup_rt_mutex_blocked(lock); + raw_spin_unlock_irq(&lock->wait_lock); return ret; @@ -1918,3 +2467,97 @@ bool rt_mutex_cleanup_proxy_lock(struct rt_mutex *lock, return cleanup; } + +static inline int +ww_mutex_deadlock_injection(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ +#ifdef CONFIG_DEBUG_WW_MUTEX_SLOWPATH + unsigned int tmp; + + if (ctx->deadlock_inject_countdown-- == 0) { + tmp = ctx->deadlock_inject_interval; + if (tmp > UINT_MAX/4) + tmp = UINT_MAX; + else + tmp = tmp*2 + tmp + tmp/2; + + ctx->deadlock_inject_interval = tmp; + ctx->deadlock_inject_countdown = tmp; + ctx->contending_lock = lock; + + ww_mutex_unlock(lock); + + return -EDEADLK; + } +#endif + + return 0; +} + +#ifdef CONFIG_PREEMPT_RT +int __sched +ww_mutex_lock_interruptible(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + mutex_acquire_nest(&lock->base.dep_map, 0, 0, + ctx ? &ctx->dep_map : NULL, _RET_IP_); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_INTERRUPTIBLE, NULL, 0, + ctx); + if (ret) + mutex_release(&lock->base.dep_map, _RET_IP_); + else if (!ret && ctx && ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(ww_mutex_lock_interruptible); + +int __sched +ww_mutex_lock(struct ww_mutex *lock, struct ww_acquire_ctx *ctx) +{ + int ret; + + might_sleep(); + + mutex_acquire_nest(&lock->base.dep_map, 0, 0, + ctx ? &ctx->dep_map : NULL, _RET_IP_); + ret = rt_mutex_slowlock(&lock->base.lock, TASK_UNINTERRUPTIBLE, NULL, 0, + ctx); + if (ret) + mutex_release(&lock->base.dep_map, _RET_IP_); + else if (!ret && ctx && ctx->acquired > 1) + return ww_mutex_deadlock_injection(lock, ctx); + + return ret; +} +EXPORT_SYMBOL_GPL(ww_mutex_lock); + +void __sched ww_mutex_unlock(struct ww_mutex *lock) +{ + /* + * The unlocking fastpath is the 0->1 transition from 'locked' + * into 'unlocked' state: + */ + if (lock->ctx) { +#ifdef CONFIG_DEBUG_MUTEXES + DEBUG_LOCKS_WARN_ON(!lock->ctx->acquired); +#endif + if (lock->ctx->acquired > 0) + lock->ctx->acquired--; + lock->ctx = NULL; + } + + mutex_release(&lock->base.dep_map, _RET_IP_); + __rt_mutex_unlock(&lock->base.lock); +} +EXPORT_SYMBOL(ww_mutex_unlock); + +int __rt_mutex_owner_current(struct rt_mutex *lock) +{ + return rt_mutex_owner(lock) == current; +} +EXPORT_SYMBOL(__rt_mutex_owner_current); +#endif diff --git a/kernel/locking/rtmutex.h b/kernel/locking/rtmutex.h index 732f96abf..338ccd291 100644 --- a/kernel/locking/rtmutex.h +++ b/kernel/locking/rtmutex.h @@ -19,15 +19,8 @@ #define debug_rt_mutex_proxy_unlock(l) do { } while (0) #define debug_rt_mutex_unlock(l) do { } while (0) #define debug_rt_mutex_init(m, n, k) do { } while (0) -#define debug_rt_mutex_deadlock(d, a ,l) do { } while (0) -#define debug_rt_mutex_print_deadlock(w) do { } while (0) #define debug_rt_mutex_reset_waiter(w) do { } while (0) -static inline void rt_mutex_print_deadlock(struct rt_mutex_waiter *w) -{ - WARN(1, "rtmutex deadlock detected\n"); -} - static inline bool debug_rt_mutex_detect_deadlock(struct rt_mutex_waiter *w, enum rtmutex_chainwalk walk) { diff --git a/kernel/locking/rtmutex_common.h b/kernel/locking/rtmutex_common.h index ca6fb4890..248a7d915 100644 --- a/kernel/locking/rtmutex_common.h +++ b/kernel/locking/rtmutex_common.h @@ -15,6 +15,7 @@ #include <linux/rtmutex.h> #include <linux/sched/wake_q.h> +#include <linux/sched/debug.h> /* * This is the control structure for tasks blocked on a rt_mutex, @@ -29,12 +30,8 @@ struct rt_mutex_waiter { struct rb_node pi_tree_entry; struct task_struct *task; struct rt_mutex *lock; -#ifdef CONFIG_DEBUG_RT_MUTEXES - unsigned long ip; - struct pid *deadlock_task_pid; - struct rt_mutex *deadlock_lock; -#endif int prio; + bool savestate; u64 deadline; }; @@ -130,11 +127,14 @@ enum rtmutex_chainwalk { /* * PI-futex support (proxy locking functions, etc.): */ +#define PI_WAKEUP_INPROGRESS ((struct rt_mutex_waiter *) 1) +#define PI_REQUEUE_INPROGRESS ((struct rt_mutex_waiter *) 2) + extern struct task_struct *rt_mutex_next_owner(struct rt_mutex *lock); extern void rt_mutex_init_proxy_locked(struct rt_mutex *lock, struct task_struct *proxy_owner); extern void rt_mutex_proxy_unlock(struct rt_mutex *lock); -extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter); +extern void rt_mutex_init_waiter(struct rt_mutex_waiter *waiter, bool savetate); extern int __rt_mutex_start_proxy_lock(struct rt_mutex *lock, struct rt_mutex_waiter *waiter, struct task_struct *task); @@ -152,9 +152,27 @@ extern int __rt_mutex_futex_trylock(struct rt_mutex *l); extern void rt_mutex_futex_unlock(struct rt_mutex *lock); extern bool __rt_mutex_futex_unlock(struct rt_mutex *lock, - struct wake_q_head *wqh); - -extern void rt_mutex_postunlock(struct wake_q_head *wake_q); + struct wake_q_head *wqh, + struct wake_q_head *wq_sleeper); + +extern void rt_mutex_postunlock(struct wake_q_head *wake_q, + struct wake_q_head *wake_sleeper_q); + +/* RW semaphore special interface */ +struct ww_acquire_ctx; + +extern int __rt_mutex_lock_state(struct rt_mutex *lock, int state); +extern int __rt_mutex_trylock(struct rt_mutex *lock); +extern void __rt_mutex_unlock(struct rt_mutex *lock); +int __sched rt_mutex_slowlock_locked(struct rt_mutex *lock, int state, + struct hrtimer_sleeper *timeout, + enum rtmutex_chainwalk chwalk, + struct ww_acquire_ctx *ww_ctx, + struct rt_mutex_waiter *waiter); +void __sched rt_spin_lock_slowlock_locked(struct rt_mutex *lock, + struct rt_mutex_waiter *waiter, + unsigned long flags); +void __sched rt_spin_lock_slowunlock(struct rt_mutex *lock); #ifdef CONFIG_DEBUG_RT_MUTEXES # include "rtmutex-debug.h" diff --git a/kernel/locking/rwlock-rt.c b/kernel/locking/rwlock-rt.c new file mode 100644 index 000000000..3d2d1f14b --- /dev/null +++ b/kernel/locking/rwlock-rt.c @@ -0,0 +1,334 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/sched/debug.h> +#include <linux/export.h> + +#include "rtmutex_common.h" +#include <linux/rwlock_types_rt.h> + +/* + * RT-specific reader/writer locks + * + * write_lock() + * 1) Lock lock->rtmutex + * 2) Remove the reader BIAS to force readers into the slow path + * 3) Wait until all readers have left the critical region + * 4) Mark it write locked + * + * write_unlock() + * 1) Remove the write locked marker + * 2) Set the reader BIAS so readers can use the fast path again + * 3) Unlock lock->rtmutex to release blocked readers + * + * read_lock() + * 1) Try fast path acquisition (reader BIAS is set) + * 2) Take lock->rtmutex.wait_lock which protects the writelocked flag + * 3) If !writelocked, acquire it for read + * 4) If writelocked, block on lock->rtmutex + * 5) unlock lock->rtmutex, goto 1) + * + * read_unlock() + * 1) Try fast path release (reader count != 1) + * 2) Wake the writer waiting in write_lock()#3 + * + * read_lock()#3 has the consequence, that rw locks on RT are not writer + * fair, but writers, which should be avoided in RT tasks (think tasklist + * lock), are subject to the rtmutex priority/DL inheritance mechanism. + * + * It's possible to make the rw locks writer fair by keeping a list of + * active readers. A blocked writer would force all newly incoming readers + * to block on the rtmutex, but the rtmutex would have to be proxy locked + * for one reader after the other. We can't use multi-reader inheritance + * because there is no way to support that with + * SCHED_DEADLINE. Implementing the one by one reader boosting/handover + * mechanism is a major surgery for a very dubious value. + * + * The risk of writer starvation is there, but the pathological use cases + * which trigger it are not necessarily the typical RT workloads. + */ + +void __rwlock_biased_rt_init(struct rt_rw_lock *lock, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)lock, sizeof(*lock)); + lockdep_init_map(&lock->dep_map, name, key, 0); +#endif + atomic_set(&lock->readers, READER_BIAS); + rt_mutex_init(&lock->rtmutex); + lock->rtmutex.save_state = 1; +} + +static int __read_rt_trylock(struct rt_rw_lock *lock) +{ + int r, old; + + /* + * Increment reader count, if lock->readers < 0, i.e. READER_BIAS is + * set. + */ + for (r = atomic_read(&lock->readers); r < 0;) { + old = atomic_cmpxchg(&lock->readers, r, r + 1); + if (likely(old == r)) + return 1; + r = old; + } + return 0; +} + +static void __read_rt_lock(struct rt_rw_lock *lock) +{ + struct rt_mutex *m = &lock->rtmutex; + struct rt_mutex_waiter waiter; + unsigned long flags; + + if (__read_rt_trylock(lock)) + return; + + raw_spin_lock_irqsave(&m->wait_lock, flags); + /* + * Allow readers as long as the writer has not completely + * acquired the semaphore for write. + */ + if (atomic_read(&lock->readers) != WRITER_BIAS) { + atomic_inc(&lock->readers); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + return; + } + + /* + * Call into the slow lock path with the rtmutex->wait_lock + * held, so this can't result in the following race: + * + * Reader1 Reader2 Writer + * read_lock() + * write_lock() + * rtmutex_lock(m) + * swait() + * read_lock() + * unlock(m->wait_lock) + * read_unlock() + * swake() + * lock(m->wait_lock) + * lock->writelocked=true + * unlock(m->wait_lock) + * + * write_unlock() + * lock->writelocked=false + * rtmutex_unlock(m) + * read_lock() + * write_lock() + * rtmutex_lock(m) + * swait() + * rtmutex_lock(m) + * + * That would put Reader1 behind the writer waiting on + * Reader2 to call read_unlock() which might be unbound. + */ + rt_mutex_init_waiter(&waiter, true); + rt_spin_lock_slowlock_locked(m, &waiter, flags); + /* + * The slowlock() above is guaranteed to return with the rtmutex is + * now held, so there can't be a writer active. Increment the reader + * count and immediately drop the rtmutex again. + */ + atomic_inc(&lock->readers); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + rt_spin_lock_slowunlock(m); + + debug_rt_mutex_free_waiter(&waiter); +} + +static void __read_rt_unlock(struct rt_rw_lock *lock) +{ + struct rt_mutex *m = &lock->rtmutex; + struct task_struct *tsk; + + /* + * sem->readers can only hit 0 when a writer is waiting for the + * active readers to leave the critical region. + */ + if (!atomic_dec_and_test(&lock->readers)) + return; + + raw_spin_lock_irq(&m->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the + * rtmutex concurrently in the fast path, but to clean up the rw + * lock it needs to acquire m->wait_lock. The worst case which can + * happen is a spurious wakeup. + */ + tsk = rt_mutex_owner(m); + if (tsk) + wake_up_process(tsk); + + raw_spin_unlock_irq(&m->wait_lock); +} + +static void __write_unlock_common(struct rt_rw_lock *lock, int bias, + unsigned long flags) +{ + struct rt_mutex *m = &lock->rtmutex; + + atomic_add(READER_BIAS - bias, &lock->readers); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + rt_spin_lock_slowunlock(m); +} + +static void __write_rt_lock(struct rt_rw_lock *lock) +{ + struct rt_mutex *m = &lock->rtmutex; + struct task_struct *self = current; + unsigned long flags; + + /* Take the rtmutex as a first step */ + __rt_spin_lock(m); + + /* Force readers into slow path */ + atomic_sub(READER_BIAS, &lock->readers); + + raw_spin_lock_irqsave(&m->wait_lock, flags); + + raw_spin_lock(&self->pi_lock); + self->saved_state = self->state; + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); + raw_spin_unlock(&self->pi_lock); + + for (;;) { + /* Have all readers left the critical region? */ + if (!atomic_read(&lock->readers)) { + atomic_set(&lock->readers, WRITER_BIAS); + raw_spin_lock(&self->pi_lock); + __set_current_state_no_track(self->saved_state); + self->saved_state = TASK_RUNNING; + raw_spin_unlock(&self->pi_lock); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + return; + } + + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + + if (atomic_read(&lock->readers) != 0) + preempt_schedule_lock(); + + raw_spin_lock_irqsave(&m->wait_lock, flags); + + raw_spin_lock(&self->pi_lock); + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); + raw_spin_unlock(&self->pi_lock); + } +} + +static int __write_rt_trylock(struct rt_rw_lock *lock) +{ + struct rt_mutex *m = &lock->rtmutex; + unsigned long flags; + + if (!__rt_mutex_trylock(m)) + return 0; + + atomic_sub(READER_BIAS, &lock->readers); + + raw_spin_lock_irqsave(&m->wait_lock, flags); + if (!atomic_read(&lock->readers)) { + atomic_set(&lock->readers, WRITER_BIAS); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + return 1; + } + __write_unlock_common(lock, 0, flags); + return 0; +} + +static void __write_rt_unlock(struct rt_rw_lock *lock) +{ + struct rt_mutex *m = &lock->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&m->wait_lock, flags); + __write_unlock_common(lock, WRITER_BIAS, flags); +} + +int __lockfunc rt_read_can_lock(rwlock_t *rwlock) +{ + return atomic_read(&rwlock->readers) < 0; +} + +int __lockfunc rt_write_can_lock(rwlock_t *rwlock) +{ + return atomic_read(&rwlock->readers) == READER_BIAS; +} + +/* + * The common functions which get wrapped into the rwlock API. + */ +int __lockfunc rt_read_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = __read_rt_trylock(rwlock); + if (ret) { + rwlock_acquire_read(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_read_trylock); + +int __lockfunc rt_write_trylock(rwlock_t *rwlock) +{ + int ret; + + ret = __write_rt_trylock(rwlock); + if (ret) { + rwlock_acquire(&rwlock->dep_map, 0, 1, _RET_IP_); + rcu_read_lock(); + migrate_disable(); + } + return ret; +} +EXPORT_SYMBOL(rt_write_trylock); + +void __lockfunc rt_read_lock(rwlock_t *rwlock) +{ + rwlock_acquire_read(&rwlock->dep_map, 0, 0, _RET_IP_); + __read_rt_lock(rwlock); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_read_lock); + +void __lockfunc rt_write_lock(rwlock_t *rwlock) +{ + rwlock_acquire(&rwlock->dep_map, 0, 0, _RET_IP_); + __write_rt_lock(rwlock); + rcu_read_lock(); + migrate_disable(); +} +EXPORT_SYMBOL(rt_write_lock); + +void __lockfunc rt_read_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + __read_rt_unlock(rwlock); +} +EXPORT_SYMBOL(rt_read_unlock); + +void __lockfunc rt_write_unlock(rwlock_t *rwlock) +{ + rwlock_release(&rwlock->dep_map, _RET_IP_); + migrate_enable(); + rcu_read_unlock(); + __write_rt_unlock(rwlock); +} +EXPORT_SYMBOL(rt_write_unlock); + +void __rt_rwlock_init(rwlock_t *rwlock, char *name, struct lock_class_key *key) +{ + __rwlock_biased_rt_init(rwlock, name, key); +} +EXPORT_SYMBOL(__rt_rwlock_init); diff --git a/kernel/locking/rwsem-rt.c b/kernel/locking/rwsem-rt.c new file mode 100644 index 000000000..b61edc4dc --- /dev/null +++ b/kernel/locking/rwsem-rt.c @@ -0,0 +1,317 @@ +// SPDX-License-Identifier: GPL-2.0-only +#include <linux/rwsem.h> +#include <linux/sched/debug.h> +#include <linux/sched/signal.h> +#include <linux/export.h> +#include <linux/blkdev.h> + +#include "rtmutex_common.h" + +/* + * RT-specific reader/writer semaphores + * + * down_write() + * 1) Lock sem->rtmutex + * 2) Remove the reader BIAS to force readers into the slow path + * 3) Wait until all readers have left the critical region + * 4) Mark it write locked + * + * up_write() + * 1) Remove the write locked marker + * 2) Set the reader BIAS so readers can use the fast path again + * 3) Unlock sem->rtmutex to release blocked readers + * + * down_read() + * 1) Try fast path acquisition (reader BIAS is set) + * 2) Take sem->rtmutex.wait_lock which protects the writelocked flag + * 3) If !writelocked, acquire it for read + * 4) If writelocked, block on sem->rtmutex + * 5) unlock sem->rtmutex, goto 1) + * + * up_read() + * 1) Try fast path release (reader count != 1) + * 2) Wake the writer waiting in down_write()#3 + * + * down_read()#3 has the consequence, that rw semaphores on RT are not writer + * fair, but writers, which should be avoided in RT tasks (think mmap_sem), + * are subject to the rtmutex priority/DL inheritance mechanism. + * + * It's possible to make the rw semaphores writer fair by keeping a list of + * active readers. A blocked writer would force all newly incoming readers to + * block on the rtmutex, but the rtmutex would have to be proxy locked for one + * reader after the other. We can't use multi-reader inheritance because there + * is no way to support that with SCHED_DEADLINE. Implementing the one by one + * reader boosting/handover mechanism is a major surgery for a very dubious + * value. + * + * The risk of writer starvation is there, but the pathological use cases + * which trigger it are not necessarily the typical RT workloads. + */ + +void __rwsem_init(struct rw_semaphore *sem, const char *name, + struct lock_class_key *key) +{ +#ifdef CONFIG_DEBUG_LOCK_ALLOC + /* + * Make sure we are not reinitializing a held semaphore: + */ + debug_check_no_locks_freed((void *)sem, sizeof(*sem)); + lockdep_init_map(&sem->dep_map, name, key, 0); +#endif + atomic_set(&sem->readers, READER_BIAS); +} +EXPORT_SYMBOL(__rwsem_init); + +int __down_read_trylock(struct rw_semaphore *sem) +{ + int r, old; + + /* + * Increment reader count, if sem->readers < 0, i.e. READER_BIAS is + * set. + */ + for (r = atomic_read(&sem->readers); r < 0;) { + old = atomic_cmpxchg(&sem->readers, r, r + 1); + if (likely(old == r)) + return 1; + r = old; + } + return 0; +} + +static int __sched __down_read_common(struct rw_semaphore *sem, int state) +{ + struct rt_mutex *m = &sem->rtmutex; + struct rt_mutex_waiter waiter; + int ret; + + if (__down_read_trylock(sem)) + return 0; + + /* + * Flush blk before ->pi_blocked_on is set. At schedule() time it is too + * late if one of the callbacks needs to acquire a sleeping lock. + */ + if (blk_needs_flush_plug(current)) + blk_schedule_flush_plug(current); + + might_sleep(); + raw_spin_lock_irq(&m->wait_lock); + /* + * Allow readers as long as the writer has not completely + * acquired the semaphore for write. + */ + if (atomic_read(&sem->readers) != WRITER_BIAS) { + atomic_inc(&sem->readers); + raw_spin_unlock_irq(&m->wait_lock); + return 0; + } + + /* + * Call into the slow lock path with the rtmutex->wait_lock + * held, so this can't result in the following race: + * + * Reader1 Reader2 Writer + * down_read() + * down_write() + * rtmutex_lock(m) + * swait() + * down_read() + * unlock(m->wait_lock) + * up_read() + * swake() + * lock(m->wait_lock) + * sem->writelocked=true + * unlock(m->wait_lock) + * + * up_write() + * sem->writelocked=false + * rtmutex_unlock(m) + * down_read() + * down_write() + * rtmutex_lock(m) + * swait() + * rtmutex_lock(m) + * + * That would put Reader1 behind the writer waiting on + * Reader2 to call up_read() which might be unbound. + */ + rt_mutex_init_waiter(&waiter, false); + ret = rt_mutex_slowlock_locked(m, state, NULL, RT_MUTEX_MIN_CHAINWALK, + NULL, &waiter); + /* + * The slowlock() above is guaranteed to return with the rtmutex (for + * ret = 0) is now held, so there can't be a writer active. Increment + * the reader count and immediately drop the rtmutex again. + * For ret != 0 we don't hold the rtmutex and need unlock the wait_lock. + * We don't own the lock then. + */ + if (!ret) + atomic_inc(&sem->readers); + raw_spin_unlock_irq(&m->wait_lock); + if (!ret) + __rt_mutex_unlock(m); + + debug_rt_mutex_free_waiter(&waiter); + return ret; +} + +void __down_read(struct rw_semaphore *sem) +{ + int ret; + + ret = __down_read_common(sem, TASK_UNINTERRUPTIBLE); + WARN_ON_ONCE(ret); +} + +int __down_read_interruptible(struct rw_semaphore *sem) +{ + int ret; + + ret = __down_read_common(sem, TASK_INTERRUPTIBLE); + if (likely(!ret)) + return ret; + WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); + return -EINTR; +} + +int __down_read_killable(struct rw_semaphore *sem) +{ + int ret; + + ret = __down_read_common(sem, TASK_KILLABLE); + if (likely(!ret)) + return ret; + WARN_ONCE(ret != -EINTR, "Unexpected state: %d\n", ret); + return -EINTR; +} + +void __up_read(struct rw_semaphore *sem) +{ + struct rt_mutex *m = &sem->rtmutex; + struct task_struct *tsk; + + /* + * sem->readers can only hit 0 when a writer is waiting for the + * active readers to leave the critical region. + */ + if (!atomic_dec_and_test(&sem->readers)) + return; + + raw_spin_lock_irq(&m->wait_lock); + /* + * Wake the writer, i.e. the rtmutex owner. It might release the + * rtmutex concurrently in the fast path (due to a signal), but to + * clean up the rwsem it needs to acquire m->wait_lock. The worst + * case which can happen is a spurious wakeup. + */ + tsk = rt_mutex_owner(m); + if (tsk) + wake_up_process(tsk); + + raw_spin_unlock_irq(&m->wait_lock); +} + +static void __up_write_unlock(struct rw_semaphore *sem, int bias, + unsigned long flags) +{ + struct rt_mutex *m = &sem->rtmutex; + + atomic_add(READER_BIAS - bias, &sem->readers); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + __rt_mutex_unlock(m); +} + +static int __sched __down_write_common(struct rw_semaphore *sem, int state) +{ + struct rt_mutex *m = &sem->rtmutex; + unsigned long flags; + + /* + * Flush blk before ->pi_blocked_on is set. At schedule() time it is too + * late if one of the callbacks needs to acquire a sleeping lock. + */ + if (blk_needs_flush_plug(current)) + blk_schedule_flush_plug(current); + + /* Take the rtmutex as a first step */ + if (__rt_mutex_lock_state(m, state)) + return -EINTR; + + /* Force readers into slow path */ + atomic_sub(READER_BIAS, &sem->readers); + might_sleep(); + + set_current_state(state); + for (;;) { + raw_spin_lock_irqsave(&m->wait_lock, flags); + /* Have all readers left the critical region? */ + if (!atomic_read(&sem->readers)) { + atomic_set(&sem->readers, WRITER_BIAS); + __set_current_state(TASK_RUNNING); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + return 0; + } + + if (signal_pending_state(state, current)) { + __set_current_state(TASK_RUNNING); + __up_write_unlock(sem, 0, flags); + return -EINTR; + } + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + + if (atomic_read(&sem->readers) != 0) { + schedule(); + set_current_state(state); + } + } +} + +void __sched __down_write(struct rw_semaphore *sem) +{ + __down_write_common(sem, TASK_UNINTERRUPTIBLE); +} + +int __sched __down_write_killable(struct rw_semaphore *sem) +{ + return __down_write_common(sem, TASK_KILLABLE); +} + +int __down_write_trylock(struct rw_semaphore *sem) +{ + struct rt_mutex *m = &sem->rtmutex; + unsigned long flags; + + if (!__rt_mutex_trylock(m)) + return 0; + + atomic_sub(READER_BIAS, &sem->readers); + + raw_spin_lock_irqsave(&m->wait_lock, flags); + if (!atomic_read(&sem->readers)) { + atomic_set(&sem->readers, WRITER_BIAS); + raw_spin_unlock_irqrestore(&m->wait_lock, flags); + return 1; + } + __up_write_unlock(sem, 0, flags); + return 0; +} + +void __up_write(struct rw_semaphore *sem) +{ + struct rt_mutex *m = &sem->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&m->wait_lock, flags); + __up_write_unlock(sem, WRITER_BIAS, flags); +} + +void __downgrade_write(struct rw_semaphore *sem) +{ + struct rt_mutex *m = &sem->rtmutex; + unsigned long flags; + + raw_spin_lock_irqsave(&m->wait_lock, flags); + /* Release it and account current as reader */ + __up_write_unlock(sem, WRITER_BIAS - 1, flags); +} diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c index a163542d1..3821c7204 100644 --- a/kernel/locking/rwsem.c +++ b/kernel/locking/rwsem.c @@ -28,6 +28,7 @@ #include <linux/rwsem.h> #include <linux/atomic.h> +#ifndef CONFIG_PREEMPT_RT #include "lock_events.h" /* @@ -1494,6 +1495,7 @@ static inline void __downgrade_write(struct rw_semaphore *sem) if (tmp & RWSEM_FLAG_WAITERS) rwsem_downgrade_wake(sem); } +#endif /* * lock for reading @@ -1657,7 +1659,9 @@ void down_read_non_owner(struct rw_semaphore *sem) { might_sleep(); __down_read(sem); +#ifndef CONFIG_PREEMPT_RT __rwsem_set_reader_owned(sem, NULL); +#endif } EXPORT_SYMBOL(down_read_non_owner); @@ -1686,7 +1690,9 @@ EXPORT_SYMBOL(down_write_killable_nested); void up_read_non_owner(struct rw_semaphore *sem) { +#ifndef CONFIG_PREEMPT_RT DEBUG_RWSEMS_WARN_ON(!is_rwsem_reader_owned(sem), sem); +#endif __up_read(sem); } EXPORT_SYMBOL(up_read_non_owner); diff --git a/kernel/locking/spinlock.c b/kernel/locking/spinlock.c index 0ff08380f..45445a2f1 100644 --- a/kernel/locking/spinlock.c +++ b/kernel/locking/spinlock.c @@ -124,8 +124,11 @@ void __lockfunc __raw_##op##_lock_bh(locktype##_t *lock) \ * __[spin|read|write]_lock_bh() */ BUILD_LOCK_OPS(spin, raw_spinlock); + +#ifndef CONFIG_PREEMPT_RT BUILD_LOCK_OPS(read, rwlock); BUILD_LOCK_OPS(write, rwlock); +#endif #endif @@ -209,6 +212,8 @@ void __lockfunc _raw_spin_unlock_bh(raw_spinlock_t *lock) EXPORT_SYMBOL(_raw_spin_unlock_bh); #endif +#ifndef CONFIG_PREEMPT_RT + #ifndef CONFIG_INLINE_READ_TRYLOCK int __lockfunc _raw_read_trylock(rwlock_t *lock) { @@ -353,6 +358,8 @@ void __lockfunc _raw_write_unlock_bh(rwlock_t *lock) EXPORT_SYMBOL(_raw_write_unlock_bh); #endif +#endif /* !PREEMPT_RT */ + #ifdef CONFIG_DEBUG_LOCK_ALLOC void __lockfunc _raw_spin_lock_nested(raw_spinlock_t *lock, int subclass) diff --git a/kernel/locking/spinlock_debug.c b/kernel/locking/spinlock_debug.c index b9d93087e..72e306e0e 100644 --- a/kernel/locking/spinlock_debug.c +++ b/kernel/locking/spinlock_debug.c @@ -31,6 +31,7 @@ void __raw_spin_lock_init(raw_spinlock_t *lock, const char *name, EXPORT_SYMBOL(__raw_spin_lock_init); +#ifndef CONFIG_PREEMPT_RT void __rwlock_init(rwlock_t *lock, const char *name, struct lock_class_key *key) { @@ -48,6 +49,7 @@ void __rwlock_init(rwlock_t *lock, const char *name, } EXPORT_SYMBOL(__rwlock_init); +#endif static void spin_dump(raw_spinlock_t *lock, const char *msg) { @@ -139,6 +141,7 @@ void do_raw_spin_unlock(raw_spinlock_t *lock) arch_spin_unlock(&lock->raw_lock); } +#ifndef CONFIG_PREEMPT_RT static void rwlock_bug(rwlock_t *lock, const char *msg) { if (!debug_locks_off()) @@ -228,3 +231,5 @@ void do_raw_write_unlock(rwlock_t *lock) debug_write_unlock(lock); arch_write_unlock(&lock->raw_lock); } + +#endif diff --git a/kernel/notifier.c b/kernel/notifier.c index 1b019cbca..c20782f07 100644 --- a/kernel/notifier.c +++ b/kernel/notifier.c @@ -142,9 +142,9 @@ int atomic_notifier_chain_register(struct atomic_notifier_head *nh, unsigned long flags; int ret; - spin_lock_irqsave(&nh->lock, flags); + raw_spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_register(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); + raw_spin_unlock_irqrestore(&nh->lock, flags); return ret; } EXPORT_SYMBOL_GPL(atomic_notifier_chain_register); @@ -164,9 +164,9 @@ int atomic_notifier_chain_unregister(struct atomic_notifier_head *nh, unsigned long flags; int ret; - spin_lock_irqsave(&nh->lock, flags); + raw_spin_lock_irqsave(&nh->lock, flags); ret = notifier_chain_unregister(&nh->head, n); - spin_unlock_irqrestore(&nh->lock, flags); + raw_spin_unlock_irqrestore(&nh->lock, flags); synchronize_rcu(); return ret; } @@ -182,9 +182,9 @@ int atomic_notifier_call_chain_robust(struct atomic_notifier_head *nh, * Musn't use RCU; because then the notifier list can * change between the up and down traversal. */ - spin_lock_irqsave(&nh->lock, flags); + raw_spin_lock_irqsave(&nh->lock, flags); ret = notifier_call_chain_robust(&nh->head, val_up, val_down, v); - spin_unlock_irqrestore(&nh->lock, flags); + raw_spin_unlock_irqrestore(&nh->lock, flags); return ret; } diff --git a/kernel/panic.c b/kernel/panic.c index d991c3b1b..fa3025e0c 100644 --- a/kernel/panic.c +++ b/kernel/panic.c @@ -177,12 +177,28 @@ static void panic_print_sys_info(void) void panic(const char *fmt, ...) { static char buf[1024]; + va_list args2; va_list args; long i, i_next = 0, len; int state = 0; int old_cpu, this_cpu; bool _crash_kexec_post_notifiers = crash_kexec_post_notifiers; + console_verbose(); + pr_emerg("Kernel panic - not syncing:\n"); + va_start(args2, fmt); + va_copy(args, args2); + vprintk(fmt, args2); + va_end(args2); +#ifdef CONFIG_DEBUG_BUGVERBOSE + /* + * Avoid nested stack-dumping if a panic occurs during oops processing + */ + if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) + dump_stack(); +#endif + pr_flush(1000, true); + /* * Disable local interrupts. This will prevent panic_smp_self_stop * from deadlocking the first cpu that invokes the panic, since @@ -213,24 +229,13 @@ void panic(const char *fmt, ...) if (old_cpu != PANIC_CPU_INVALID && old_cpu != this_cpu) panic_smp_self_stop(); - console_verbose(); bust_spinlocks(1); - va_start(args, fmt); len = vscnprintf(buf, sizeof(buf), fmt, args); va_end(args); if (len && buf[len - 1] == '\n') buf[len - 1] = '\0'; - pr_emerg("Kernel panic - not syncing: %s\n", buf); -#ifdef CONFIG_DEBUG_BUGVERBOSE - /* - * Avoid nested stack-dumping if a panic occurs during oops processing - */ - if (!test_taint(TAINT_DIE) && oops_in_progress <= 1) - dump_stack(); -#endif - /* * If kgdb is enabled, give it a chance to run before we stop all * the other CPUs or else we won't be able to debug processes left @@ -247,7 +252,6 @@ void panic(const char *fmt, ...) * Bypass the panic_cpu check and call __crash_kexec directly. */ if (!_crash_kexec_post_notifiers) { - printk_safe_flush_on_panic(); __crash_kexec(NULL); /* @@ -298,8 +302,6 @@ void panic(const char *fmt, ...) */ atomic_notifier_call_chain(&panic_notifier_list, 0, buf); - /* Call flush even twice. It tries harder with a single online CPU */ - printk_safe_flush_on_panic(); kmsg_dump(KMSG_DUMP_PANIC); /* @@ -569,9 +571,11 @@ static u64 oops_id; static int init_oops_id(void) { +#ifndef CONFIG_PREEMPT_RT if (!oops_id) get_random_bytes(&oops_id, sizeof(oops_id)); else +#endif oops_id++; return 0; @@ -582,6 +586,7 @@ static void print_oops_end_marker(void) { init_oops_id(); pr_warn("---[ end trace %016llx ]---\n", (unsigned long long)oops_id); + pr_flush(1000, true); } /* diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index eee3dc9b6..59cb24e25 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,5 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o -obj-$(CONFIG_PRINTK) += printk_safe.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o obj-$(CONFIG_PRINTK) += printk_ringbuffer.o diff --git a/kernel/printk/internal.h b/kernel/printk/internal.h index b1c155328..e69de29bb 100644 --- a/kernel/printk/internal.h +++ b/kernel/printk/internal.h @@ -1,37 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-or-later */ -/* - * internal.h - printk internal definitions - */ -#include <linux/percpu.h> - -#ifdef CONFIG_PRINTK - -#define PRINTK_SAFE_CONTEXT_MASK 0x007ffffff -#define PRINTK_NMI_DIRECT_CONTEXT_MASK 0x008000000 -#define PRINTK_NMI_CONTEXT_MASK 0xff0000000 - -#define PRINTK_NMI_CONTEXT_OFFSET 0x010000000 - -extern raw_spinlock_t logbuf_lock; - -__printf(4, 0) -int vprintk_store(int facility, int level, - const struct dev_printk_info *dev_info, - const char *fmt, va_list args); - -__printf(1, 0) int vprintk_default(const char *fmt, va_list args); -__printf(1, 0) int vprintk_deferred(const char *fmt, va_list args); -__printf(1, 0) int vprintk_func(const char *fmt, va_list args); - -void printk_safe_init(void); -bool printk_percpu_data_ready(void); - -void defer_console_output(void); - -#else - -__printf(1, 0) int vprintk_func(const char *fmt, va_list args) { return 0; } - -static inline void printk_safe_init(void) { } -static inline bool printk_percpu_data_ready(void) { return false; } -#endif /* CONFIG_PRINTK */ diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 43f8f2573..54a4b01a4 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -44,6 +44,9 @@ #include <linux/irq_work.h> #include <linux/ctype.h> #include <linux/uio.h> +#include <linux/kthread.h> +#include <linux/kdb.h> +#include <linux/clocksource.h> #include <linux/sched/clock.h> #include <linux/sched/debug.h> #include <linux/sched/task_stack.h> @@ -58,7 +61,6 @@ #include "printk_ringbuffer.h" #include "console_cmdline.h" #include "braille.h" -#include "internal.h" int console_printk[4] = { CONSOLE_LOGLEVEL_DEFAULT, /* console_loglevel */ @@ -93,12 +95,6 @@ EXPORT_SYMBOL_GPL(console_drivers); */ int __read_mostly suppress_printk; -#ifdef CONFIG_LOCKDEP -static struct lockdep_map console_lock_dep_map = { - .name = "console_lock" -}; -#endif - enum devkmsg_log_bits { __DEVKMSG_LOG_BIT_ON = 0, __DEVKMSG_LOG_BIT_OFF, @@ -225,19 +221,7 @@ static int nr_ext_console_drivers; static int __down_trylock_console_sem(unsigned long ip) { - int lock_failed; - unsigned long flags; - - /* - * Here and in __up_console_sem() we need to be in safe mode, - * because spindump/WARN/etc from under console ->lock will - * deadlock in printk()->down_trylock_console_sem() otherwise. - */ - printk_safe_enter_irqsave(flags); - lock_failed = down_trylock(&console_sem); - printk_safe_exit_irqrestore(flags); - - if (lock_failed) + if (down_trylock(&console_sem)) return 1; mutex_acquire(&console_lock_dep_map, 0, 1, ip); return 0; @@ -246,13 +230,9 @@ static int __down_trylock_console_sem(unsigned long ip) static void __up_console_sem(unsigned long ip) { - unsigned long flags; - mutex_release(&console_lock_dep_map, ip); - printk_safe_enter_irqsave(flags); up(&console_sem); - printk_safe_exit_irqrestore(flags); } #define up_console_sem() __up_console_sem(_RET_IP_) @@ -266,11 +246,6 @@ static void __up_console_sem(unsigned long ip) */ static int console_locked, console_suspended; -/* - * If exclusive_console is non-NULL then only this console is to be printed to. - */ -static struct console *exclusive_console; - /* * Array of consoles built from command line options (console=) */ @@ -355,61 +330,43 @@ enum log_flags { LOG_CONT = 8, /* text is a fragment of a continuation line */ }; -/* - * The logbuf_lock protects kmsg buffer, indices, counters. This can be taken - * within the scheduler's rq lock. It must be released before calling - * console_unlock() or anything else that might wake up a process. - */ -DEFINE_RAW_SPINLOCK(logbuf_lock); +#ifdef CONFIG_PRINTK +/* syslog_lock protects syslog_* variables and write access to clear_seq. */ +static DEFINE_SPINLOCK(syslog_lock); -/* - * Helper macros to lock/unlock logbuf_lock and switch between - * printk-safe/unsafe modes. - */ -#define logbuf_lock_irq() \ - do { \ - printk_safe_enter_irq(); \ - raw_spin_lock(&logbuf_lock); \ - } while (0) - -#define logbuf_unlock_irq() \ - do { \ - raw_spin_unlock(&logbuf_lock); \ - printk_safe_exit_irq(); \ - } while (0) - -#define logbuf_lock_irqsave(flags) \ - do { \ - printk_safe_enter_irqsave(flags); \ - raw_spin_lock(&logbuf_lock); \ - } while (0) - -#define logbuf_unlock_irqrestore(flags) \ - do { \ - raw_spin_unlock(&logbuf_lock); \ - printk_safe_exit_irqrestore(flags); \ - } while (0) +/* Set to enable sync mode. Once set, it is never cleared. */ +static bool sync_mode; -#ifdef CONFIG_PRINTK DECLARE_WAIT_QUEUE_HEAD(log_wait); +/* All 3 protected by @syslog_lock. */ /* the next printk record to read by syslog(READ) or /proc/kmsg */ static u64 syslog_seq; static size_t syslog_partial; static bool syslog_time; -/* the next printk record to write to the console */ -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; +struct latched_seq { + seqcount_latch_t latch; + u64 val[2]; +}; -/* the next printk record to read after the last 'clear' command */ -static u64 clear_seq; +/* + * The next printk record to read after the last 'clear' command. There are + * two copies (updated with seqcount_latch) so that reads can locklessly + * access a valid value. Writers are synchronized by @syslog_lock. + */ +static struct latched_seq clear_seq = { + .latch = SEQCNT_LATCH_ZERO(clear_seq.latch), + .val[0] = 0, + .val[1] = 0, +}; #ifdef CONFIG_PRINTK_CALLER #define PREFIX_MAX 48 #else #define PREFIX_MAX 32 #endif + +/* the maximum size allowed to be reserved for a record */ #define LOG_LINE_MAX (1024 - PREFIX_MAX) #define LOG_LEVEL(v) ((v) & 0x07) @@ -447,11 +404,36 @@ static struct printk_ringbuffer *prb = &printk_rb_static; */ static bool __printk_percpu_data_ready __read_mostly; -bool printk_percpu_data_ready(void) +static bool printk_percpu_data_ready(void) { return __printk_percpu_data_ready; } +/* Must be called under syslog_lock. */ +static void latched_seq_write(struct latched_seq *ls, u64 val) +{ + raw_write_seqcount_latch(&ls->latch); + ls->val[0] = val; + raw_write_seqcount_latch(&ls->latch); + ls->val[1] = val; +} + +/* Can be called from any context. */ +static u64 latched_seq_read_nolock(struct latched_seq *ls) +{ + unsigned int seq; + unsigned int idx; + u64 val; + + do { + seq = raw_read_seqcount_latch(&ls->latch); + idx = seq & 0x1; + val = ls->val[idx]; + } while (read_seqcount_latch_retry(&ls->latch, seq)); + + return val; +} + /* Return log buffer address */ char *log_buf_addr_get(void) { @@ -493,52 +475,6 @@ static void truncate_msg(u16 *text_len, u16 *trunc_msg_len) *trunc_msg_len = 0; } -/* insert record into the buffer, discard old ones, update heads */ -static int log_store(u32 caller_id, int facility, int level, - enum log_flags flags, u64 ts_nsec, - const struct dev_printk_info *dev_info, - const char *text, u16 text_len) -{ - struct prb_reserved_entry e; - struct printk_record r; - u16 trunc_msg_len = 0; - - prb_rec_init_wr(&r, text_len); - - if (!prb_reserve(&e, prb, &r)) { - /* truncate the message if it is too long for empty buffer */ - truncate_msg(&text_len, &trunc_msg_len); - prb_rec_init_wr(&r, text_len + trunc_msg_len); - /* survive when the log buffer is too small for trunc_msg */ - if (!prb_reserve(&e, prb, &r)) - return 0; - } - - /* fill message */ - memcpy(&r.text_buf[0], text, text_len); - if (trunc_msg_len) - memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); - r.info->text_len = text_len + trunc_msg_len; - r.info->facility = facility; - r.info->level = level & 7; - r.info->flags = flags & 0x1f; - if (ts_nsec > 0) - r.info->ts_nsec = ts_nsec; - else - r.info->ts_nsec = local_clock(); - r.info->caller_id = caller_id; - if (dev_info) - memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); - - /* A message without a trailing newline can be continued. */ - if (!(flags & LOG_NEWLINE)) - prb_commit(&e); - else - prb_final_commit(&e); - - return (text_len + trunc_msg_len); -} - int dmesg_restrict = IS_ENABLED(CONFIG_SECURITY_DMESG_RESTRICT); static int syslog_action_restricted(int type) @@ -667,7 +603,7 @@ static ssize_t msg_print_ext_body(char *buf, size_t size, /* /dev/kmsg - userspace message inject/listen interface */ struct devkmsg_user { - u64 seq; + atomic64_t seq; struct ratelimit_state rs; struct mutex lock; char buf[CONSOLE_EXT_LOG_MAX]; @@ -768,27 +704,22 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, if (ret) return ret; - logbuf_lock_irq(); - if (!prb_read_valid(prb, user->seq, r)) { + if (!prb_read_valid(prb, atomic64_read(&user->seq), r)) { if (file->f_flags & O_NONBLOCK) { ret = -EAGAIN; - logbuf_unlock_irq(); goto out; } - logbuf_unlock_irq(); ret = wait_event_interruptible(log_wait, - prb_read_valid(prb, user->seq, r)); + prb_read_valid(prb, atomic64_read(&user->seq), r)); if (ret) goto out; - logbuf_lock_irq(); } - if (r->info->seq != user->seq) { + if (r->info->seq != atomic64_read(&user->seq)) { /* our last seen message is gone, return error and reset */ - user->seq = r->info->seq; + atomic64_set(&user->seq, r->info->seq); ret = -EPIPE; - logbuf_unlock_irq(); goto out; } @@ -797,8 +728,7 @@ static ssize_t devkmsg_read(struct file *file, char __user *buf, &r->text_buf[0], r->info->text_len, &r->info->dev_info); - user->seq = r->info->seq + 1; - logbuf_unlock_irq(); + atomic64_set(&user->seq, r->info->seq + 1); if (len > count) { ret = -EINVAL; @@ -833,11 +763,10 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) if (offset) return -ESPIPE; - logbuf_lock_irq(); switch (whence) { case SEEK_SET: /* the first record */ - user->seq = prb_first_valid_seq(prb); + atomic64_set(&user->seq, prb_first_valid_seq(prb)); break; case SEEK_DATA: /* @@ -845,16 +774,15 @@ static loff_t devkmsg_llseek(struct file *file, loff_t offset, int whence) * like issued by 'dmesg -c'. Reading /dev/kmsg itself * changes no global state, and does not clear anything. */ - user->seq = clear_seq; + atomic64_set(&user->seq, latched_seq_read_nolock(&clear_seq)); break; case SEEK_END: /* after the last record */ - user->seq = prb_next_seq(prb); + atomic64_set(&user->seq, prb_next_seq(prb)); break; default: ret = -EINVAL; } - logbuf_unlock_irq(); return ret; } @@ -869,15 +797,13 @@ static __poll_t devkmsg_poll(struct file *file, poll_table *wait) poll_wait(file, &log_wait, wait); - logbuf_lock_irq(); - if (prb_read_valid_info(prb, user->seq, &info, NULL)) { + if (prb_read_valid_info(prb, atomic64_read(&user->seq), &info, NULL)) { /* return error when data has vanished underneath us */ - if (info.seq != user->seq) + if (info.seq != atomic64_read(&user->seq)) ret = EPOLLIN|EPOLLRDNORM|EPOLLERR|EPOLLPRI; else ret = EPOLLIN|EPOLLRDNORM; } - logbuf_unlock_irq(); return ret; } @@ -910,9 +836,7 @@ static int devkmsg_open(struct inode *inode, struct file *file) prb_rec_init_rd(&user->record, &user->info, &user->text_buf[0], sizeof(user->text_buf)); - logbuf_lock_irq(); - user->seq = prb_first_valid_seq(prb); - logbuf_unlock_irq(); + atomic64_set(&user->seq, prb_first_valid_seq(prb)); file->private_data = user; return 0; @@ -1004,6 +928,9 @@ void log_buf_vmcoreinfo_setup(void) VMCOREINFO_SIZE(atomic_long_t); VMCOREINFO_TYPE_OFFSET(atomic_long_t, counter); + + VMCOREINFO_STRUCT_SIZE(latched_seq); + VMCOREINFO_OFFSET(latched_seq, val); } #endif @@ -1075,9 +1002,6 @@ static inline void log_buf_add_cpu(void) {} static void __init set_percpu_data_ready(void) { - printk_safe_init(); - /* Make sure we set this flag only after printk_safe() init is done */ - barrier(); __printk_percpu_data_ready = true; } @@ -1117,7 +1041,6 @@ void __init setup_log_buf(int early) struct printk_record r; size_t new_descs_size; size_t new_infos_size; - unsigned long flags; char *new_log_buf; unsigned int free; u64 seq; @@ -1175,8 +1098,6 @@ void __init setup_log_buf(int early) new_descs, ilog2(new_descs_count), new_infos); - logbuf_lock_irqsave(flags); - log_buf_len = new_log_buf_len; log_buf = new_log_buf; new_log_buf_len = 0; @@ -1192,8 +1113,6 @@ void __init setup_log_buf(int early) */ prb = &printk_rb_dynamic; - logbuf_unlock_irqrestore(flags); - if (seq != prb_next_seq(&printk_rb_static)) { pr_err("dropped %llu messages\n", prb_next_seq(&printk_rb_static) - seq); @@ -1470,6 +1389,50 @@ static size_t get_record_print_text_size(struct printk_info *info, return ((prefix_len * line_count) + info->text_len + 1); } +/* + * Beginning with @start_seq, find the first record where it and all following + * records up to (but not including) @max_seq fit into @size. + * + * @max_seq is simply an upper bound and does not need to exist. If the caller + * does not require an upper bound, -1 can be used for @max_seq. + */ +static u64 find_first_fitting_seq(u64 start_seq, u64 max_seq, size_t size, + bool syslog, bool time) +{ + struct printk_info info; + unsigned int line_count; + size_t len = 0; + u64 seq; + + /* Determine the size of the records up to @max_seq. */ + prb_for_each_info(start_seq, prb, seq, &info, &line_count) { + if (info.seq >= max_seq) + break; + len += get_record_print_text_size(&info, line_count, syslog, time); + } + + /* + * Adjust the upper bound for the next loop to avoid subtracting + * lengths that were never added. + */ + if (seq < max_seq) + max_seq = seq; + + /* + * Move first record forward until length fits into the buffer. Ignore + * newest messages that were not counted in the above cycle. Messages + * might appear and get lost in the meantime. This is a best effort + * that prevents an infinite loop that could occur with a retry. + */ + prb_for_each_info(start_seq, prb, seq, &info, &line_count) { + if (len <= size || info.seq >= max_seq) + break; + len -= get_record_print_text_size(&info, line_count, syslog, time); + } + + return seq; +} + static int syslog_print(char __user *buf, int size) { struct printk_info info; @@ -1477,19 +1440,19 @@ static int syslog_print(char __user *buf, int size) char *text; int len = 0; - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); while (size > 0) { size_t n; size_t skip; - logbuf_lock_irq(); + spin_lock_irq(&syslog_lock); if (!prb_read_valid(prb, syslog_seq, &r)) { - logbuf_unlock_irq(); + spin_unlock_irq(&syslog_lock); break; } if (r.info->seq != syslog_seq) { @@ -1518,7 +1481,7 @@ static int syslog_print(char __user *buf, int size) syslog_partial += n; } else n = 0; - logbuf_unlock_irq(); + spin_unlock_irq(&syslog_lock); if (!n) break; @@ -1541,34 +1504,25 @@ static int syslog_print(char __user *buf, int size) static int syslog_print_all(char __user *buf, int size, bool clear) { struct printk_info info; - unsigned int line_count; struct printk_record r; char *text; int len = 0; u64 seq; bool time; - text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + text = kmalloc(CONSOLE_LOG_MAX, GFP_KERNEL); if (!text) return -ENOMEM; time = printk_time; - logbuf_lock_irq(); /* * Find first record that fits, including all following records, * into the user-provided buffer for this dump. */ - prb_for_each_info(clear_seq, prb, seq, &info, &line_count) - len += get_record_print_text_size(&info, line_count, true, time); - - /* move first record forward until length fits into the buffer */ - prb_for_each_info(clear_seq, prb, seq, &info, &line_count) { - if (len <= size) - break; - len -= get_record_print_text_size(&info, line_count, true, time); - } + seq = find_first_fitting_seq(latched_seq_read_nolock(&clear_seq), -1, + size, true, time); - prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + prb_rec_init_rd(&r, &info, text, CONSOLE_LOG_MAX); len = 0; prb_for_each_record(seq, prb, seq, &r) { @@ -1581,20 +1535,20 @@ static int syslog_print_all(char __user *buf, int size, bool clear) break; } - logbuf_unlock_irq(); if (copy_to_user(buf + len, text, textlen)) len = -EFAULT; else len += textlen; - logbuf_lock_irq(); if (len < 0) break; } - if (clear) - clear_seq = seq; - logbuf_unlock_irq(); + if (clear) { + spin_lock_irq(&syslog_lock); + latched_seq_write(&clear_seq, seq); + spin_unlock_irq(&syslog_lock); + } kfree(text); return len; @@ -1602,9 +1556,21 @@ static int syslog_print_all(char __user *buf, int size, bool clear) static void syslog_clear(void) { - logbuf_lock_irq(); - clear_seq = prb_next_seq(prb); - logbuf_unlock_irq(); + spin_lock_irq(&syslog_lock); + latched_seq_write(&clear_seq, prb_next_seq(prb)); + spin_unlock_irq(&syslog_lock); +} + +/* Return a consistent copy of @syslog_seq. */ +static u64 read_syslog_seq_irq(void) +{ + u64 seq; + + spin_lock_irq(&syslog_lock); + seq = syslog_seq; + spin_unlock_irq(&syslog_lock); + + return seq; } int do_syslog(int type, char __user *buf, int len, int source) @@ -1630,8 +1596,9 @@ int do_syslog(int type, char __user *buf, int len, int source) return 0; if (!access_ok(buf, len)) return -EFAULT; + error = wait_event_interruptible(log_wait, - prb_read_valid(prb, syslog_seq, NULL)); + prb_read_valid(prb, read_syslog_seq_irq(), NULL)); if (error) return error; error = syslog_print(buf, len); @@ -1679,10 +1646,10 @@ int do_syslog(int type, char __user *buf, int len, int source) break; /* Number of chars in the log buffer */ case SYSLOG_ACTION_SIZE_UNREAD: - logbuf_lock_irq(); + spin_lock_irq(&syslog_lock); if (!prb_read_valid_info(prb, syslog_seq, &info, NULL)) { /* No unread messages. */ - logbuf_unlock_irq(); + spin_unlock_irq(&syslog_lock); return 0; } if (info.seq != syslog_seq) { @@ -1710,7 +1677,7 @@ int do_syslog(int type, char __user *buf, int len, int source) } error -= syslog_partial; } - logbuf_unlock_irq(); + spin_unlock_irq(&syslog_lock); break; /* Size of the log buffer */ case SYSLOG_ACTION_SIZE_BUFFER: @@ -1740,9 +1707,7 @@ static struct lockdep_map console_owner_dep_map = { }; #endif -static DEFINE_RAW_SPINLOCK(console_owner_lock); -static struct task_struct *console_owner; -static bool console_waiter; +int printk_delay_msec __read_mostly; #if defined(CONFIG_X86) || defined(CONFIG_ARM64_PSEUDO_NMI) void zap_locks(void) @@ -1763,187 +1728,171 @@ void zap_locks(void) } #endif -/** - * console_lock_spinning_enable - mark beginning of code where another - * thread might safely busy wait - * - * This basically converts console_lock into a spinlock. This marks - * the section where the console_lock owner can not sleep, because - * there may be a waiter spinning (like a spinlock). Also it must be - * ready to hand over the lock at the end of the section. - */ -static void console_lock_spinning_enable(void) +static inline void printk_delay(int level) { - raw_spin_lock(&console_owner_lock); - console_owner = current; - raw_spin_unlock(&console_owner_lock); + boot_delay_msec(level); + + if (unlikely(printk_delay_msec)) { + int m = printk_delay_msec; - /* The waiter may spin on us after setting console_owner */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); + while (m--) { + mdelay(1); + touch_nmi_watchdog(); + } + } } -/** - * console_lock_spinning_disable_and_check - mark end of code where another - * thread was able to busy wait and check if there is a waiter - * - * This is called at the end of the section where spinning is allowed. - * It has two functions. First, it is a signal that it is no longer - * safe to start busy waiting for the lock. Second, it checks if - * there is a busy waiter and passes the lock rights to her. - * - * Important: Callers lose the lock if there was a busy waiter. - * They must not touch items synchronized by console_lock - * in this case. - * - * Return: 1 if the lock rights were passed, 0 otherwise. - */ -static int console_lock_spinning_disable_and_check(void) +static bool kernel_sync_mode(void) { - int waiter; + return (oops_in_progress || sync_mode); +} - raw_spin_lock(&console_owner_lock); - waiter = READ_ONCE(console_waiter); - console_owner = NULL; - raw_spin_unlock(&console_owner_lock); +static bool console_can_sync(struct console *con) +{ + if (!(con->flags & CON_ENABLED)) + return false; + if (con->write_atomic && kernel_sync_mode()) + return true; + if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) + return true; + if (con->write && (con->flags & CON_BOOT) && !con->thread) + return true; + return false; +} - if (!waiter) { - spin_release(&console_owner_dep_map, _THIS_IP_); - return 0; - } +static bool call_sync_console_driver(struct console *con, const char *text, size_t text_len) +{ + if (!(con->flags & CON_ENABLED)) + return false; + if (con->write_atomic && kernel_sync_mode()) + con->write_atomic(con, text, text_len); + else if (con->write_atomic && (con->flags & CON_HANDOVER) && !con->thread) + con->write_atomic(con, text, text_len); + else if (con->write && (con->flags & CON_BOOT) && !con->thread) + con->write(con, text, text_len); + else + return false; - /* The waiter is now free to continue */ - WRITE_ONCE(console_waiter, false); + return true; +} - spin_release(&console_owner_dep_map, _THIS_IP_); +static bool have_atomic_console(void) +{ + struct console *con; - /* - * Hand off console_lock to waiter. The waiter will perform - * the up(). After this, the waiter is the console_lock owner. - */ - mutex_release(&console_lock_dep_map, _THIS_IP_); - return 1; + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; + if (con->write_atomic) + return true; + } + return false; } -/** - * console_trylock_spinning - try to get console_lock by busy waiting - * - * This allows to busy wait for the console_lock when the current - * owner is running in specially marked sections. It means that - * the current owner is running and cannot reschedule until it - * is ready to lose the lock. - * - * Return: 1 if we got the lock, 0 othrewise - */ -static int console_trylock_spinning(void) +static bool print_sync(struct console *con, u64 *seq) { - struct task_struct *owner = NULL; - bool waiter; - bool spin = false; - unsigned long flags; + struct printk_info info; + struct printk_record r; + size_t text_len; - if (console_trylock()) - return 1; + prb_rec_init_rd(&r, &info, &con->sync_buf[0], sizeof(con->sync_buf)); - printk_safe_enter_irqsave(flags); + if (!prb_read_valid(prb, *seq, &r)) + return false; - raw_spin_lock(&console_owner_lock); - owner = READ_ONCE(console_owner); - waiter = READ_ONCE(console_waiter); - if (!waiter && owner && owner != current) { - WRITE_ONCE(console_waiter, true); - spin = true; - } - raw_spin_unlock(&console_owner_lock); + text_len = record_print_text(&r, console_msg_format & MSG_FORMAT_SYSLOG, printk_time); - /* - * If there is an active printk() writing to the - * consoles, instead of having it write our data too, - * see if we can offload that load from the active - * printer, and do some printing ourselves. - * Go into a spin only if there isn't already a waiter - * spinning, and there is an active printer, and - * that active printer isn't us (recursive printk?). - */ - if (!spin) { - printk_safe_exit_irqrestore(flags); - return 0; - } + if (!call_sync_console_driver(con, &con->sync_buf[0], text_len)) + return false; - /* We spin waiting for the owner to release us */ - spin_acquire(&console_owner_dep_map, 0, 0, _THIS_IP_); - /* Owner will clear console_waiter on hand off */ - while (READ_ONCE(console_waiter)) - cpu_relax(); - spin_release(&console_owner_dep_map, _THIS_IP_); + *seq = r.info->seq; - printk_safe_exit_irqrestore(flags); - /* - * The owner passed the console lock to us. - * Since we did not spin on console lock, annotate - * this as a trylock. Otherwise lockdep will - * complain. - */ - mutex_acquire(&console_lock_dep_map, 0, 1, _THIS_IP_); + touch_softlockup_watchdog_sync(); + clocksource_touch_watchdog(); + rcu_cpu_stall_reset(); + touch_nmi_watchdog(); - return 1; + if (text_len) + printk_delay(r.info->level); + + return true; } -/* - * Call the console drivers, asking them to write out - * log_buf[start] to log_buf[end - 1]. - * The console_lock must be held. - */ -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) +static void print_sync_until(struct console *con, u64 seq) { - static char dropped_text[64]; - size_t dropped_len = 0; - struct console *con; + unsigned int flags; + u64 printk_seq; + + console_atomic_lock(&flags); + for (;;) { + printk_seq = atomic64_read(&con->printk_seq); + if (printk_seq >= seq) + break; + if (!print_sync(con, &printk_seq)) + break; + atomic64_set(&con->printk_seq, printk_seq + 1); + } + console_atomic_unlock(flags); +} - trace_console_rcuidle(text, len); +#ifdef CONFIG_PRINTK_NMI +#define NUM_RECURSION_CTX 2 +#else +#define NUM_RECURSION_CTX 1 +#endif - if (!console_drivers) - return; +struct printk_recursion { + char count[NUM_RECURSION_CTX]; +}; - if (console_dropped) { - dropped_len = snprintf(dropped_text, sizeof(dropped_text), - "** %lu printk messages dropped **\n", - console_dropped); - console_dropped = 0; - } +static DEFINE_PER_CPU(struct printk_recursion, percpu_printk_recursion); +static char printk_recursion_count[NUM_RECURSION_CTX]; - for_each_console(con) { - if (exclusive_console && con != exclusive_console) - continue; - if (!(con->flags & CON_ENABLED)) - continue; - if (!con->write) - continue; - if (!cpu_online(smp_processor_id()) && - !(con->flags & CON_ANYTIME)) - continue; - if (con->flags & CON_EXTENDED) - con->write(con, ext_text, ext_len); - else { - if (dropped_len) - con->write(con, dropped_text, dropped_len); - con->write(con, text, len); - } +static char *printk_recursion_counter(void) +{ + struct printk_recursion *rec; + char *count; + + if (!printk_percpu_data_ready()) { + count = &printk_recursion_count[0]; + } else { + rec = this_cpu_ptr(&percpu_printk_recursion); + + count = &rec->count[0]; } + +#ifdef CONFIG_PRINTK_NMI + if (in_nmi()) + count++; +#endif + + return count; } -int printk_delay_msec __read_mostly; -static inline void printk_delay(void) +static bool printk_enter_irqsave(unsigned long *flags) { - if (unlikely(printk_delay_msec)) { - int m = printk_delay_msec; + char *count; - while (m--) { - mdelay(1); - touch_nmi_watchdog(); - } + local_irq_save(*flags); + count = printk_recursion_counter(); + /* Only 1 level of recursion allowed. */ + if (*count > 1) { + local_irq_restore(*flags); + return false; } + (*count)++; + + return true; +} + +static void printk_exit_irqrestore(unsigned long flags) +{ + char *count; + + count = printk_recursion_counter(); + (*count)--; + local_irq_restore(flags); } static inline u32 printk_caller_id(void) @@ -1952,144 +1901,248 @@ static inline u32 printk_caller_id(void) 0x80000000 + raw_smp_processor_id(); } -static size_t log_output(int facility, int level, enum log_flags lflags, - const struct dev_printk_info *dev_info, - char *text, size_t text_len) +/** + * parse_prefix - Parse level and control flags. + * + * @text: The terminated text message. + * @level: A pointer to the current level value, will be updated. + * @lflags: A pointer to the current log flags, will be updated. + * + * @level may be NULL if the caller is not interested in the parsed value. + * Otherwise the variable pointed to by @level must be set to + * LOGLEVEL_DEFAULT in order to be updated with the parsed value. + * + * @lflags may be NULL if the caller is not interested in the parsed value. + * Otherwise the variable pointed to by @lflags will be OR'd with the parsed + * value. + * + * Return: The length of the parsed level and control flags. + */ +static u16 parse_prefix(char *text, int *level, enum log_flags *lflags) { - const u32 caller_id = printk_caller_id(); + u16 prefix_len = 0; + int kern_level; - if (lflags & LOG_CONT) { - struct prb_reserved_entry e; - struct printk_record r; + while (*text) { + kern_level = printk_get_level(text); + if (!kern_level) + break; - prb_rec_init_wr(&r, text_len); - if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { - memcpy(&r.text_buf[r.info->text_len], text, text_len); - r.info->text_len += text_len; - if (lflags & LOG_NEWLINE) { - r.info->flags |= LOG_NEWLINE; - prb_final_commit(&e); - } else { - prb_commit(&e); - } - return text_len; + switch (kern_level) { + case '0' ... '7': + if (level && *level == LOGLEVEL_DEFAULT) + *level = kern_level - '0'; + break; + case 'c': /* KERN_CONT */ + if (lflags) + *lflags |= LOG_CONT; } + + prefix_len += 2; + text += 2; } - /* Store it in the record log */ - return log_store(caller_id, facility, level, lflags, 0, - dev_info, text, text_len); + return prefix_len; } -/* Must be called under logbuf_lock. */ -int vprintk_store(int facility, int level, - const struct dev_printk_info *dev_info, - const char *fmt, va_list args) +static u16 printk_sprint(char *text, u16 size, int facility, enum log_flags *lflags, + const char *fmt, va_list args) { - static char textbuf[LOG_LINE_MAX]; - char *text = textbuf; - size_t text_len; - enum log_flags lflags = 0; + u16 text_len; - /* - * The printf needs to come first; we need the syslog - * prefix which might be passed-in as a parameter. - */ - text_len = vscnprintf(text, sizeof(textbuf), fmt, args); + text_len = vscnprintf(text, size, fmt, args); - /* mark and strip a trailing newline */ - if (text_len && text[text_len-1] == '\n') { + /* Mark and strip a trailing newline. */ + if (text_len && text[text_len - 1] == '\n') { text_len--; - lflags |= LOG_NEWLINE; + *lflags |= LOG_NEWLINE; } - /* strip kernel syslog prefix and extract log level or control flags */ + /* Strip log level and control flags. */ if (facility == 0) { - int kern_level; - - while ((kern_level = printk_get_level(text)) != 0) { - switch (kern_level) { - case '0' ... '7': - if (level == LOGLEVEL_DEFAULT) - level = kern_level - '0'; - break; - case 'c': /* KERN_CONT */ - lflags |= LOG_CONT; - } + u16 prefix_len; - text_len -= 2; - text += 2; + prefix_len = parse_prefix(text, NULL, NULL); + if (prefix_len) { + text_len -= prefix_len; + memmove(text, text + prefix_len, text_len); } } - if (level == LOGLEVEL_DEFAULT) - level = default_message_loglevel; - - if (dev_info) - lflags |= LOG_NEWLINE; - - return log_output(facility, level, lflags, dev_info, text, text_len); + return text_len; } -asmlinkage int vprintk_emit(int facility, int level, - const struct dev_printk_info *dev_info, - const char *fmt, va_list args) +__printf(4, 0) +static int vprintk_store(int facility, int level, + const struct dev_printk_info *dev_info, + const char *fmt, va_list args) { - int printed_len; - bool in_sched = false; - unsigned long flags; + const u32 caller_id = printk_caller_id(); + struct prb_reserved_entry e; + enum log_flags lflags = 0; + bool final_commit = false; + struct printk_record r; + unsigned long irqflags; + u16 trunc_msg_len = 0; + char prefix_buf[8]; + u16 reserve_size; + va_list args2; + u16 text_len; + int ret = 0; + u64 ts_nsec; + u64 seq; - /* Suppress unimportant messages after panic happens */ - if (unlikely(suppress_printk)) + /* + * Since the duration of printk() can vary depending on the message + * and state of the ringbuffer, grab the timestamp now so that it is + * close to the call of printk(). This provides a more deterministic + * timestamp with respect to the caller. + */ + ts_nsec = local_clock(); + + if (!printk_enter_irqsave(&irqflags)) return 0; - if (level == LOGLEVEL_SCHED) { - level = LOGLEVEL_DEFAULT; - in_sched = true; + /* + * The sprintf needs to come first since the syslog prefix might be + * passed in as a parameter. An extra byte must be reserved so that + * later the vscnprintf() into the reserved buffer has room for the + * terminating '\0', which is not counted by vsnprintf(). + */ + va_copy(args2, args); + reserve_size = vsnprintf(&prefix_buf[0], sizeof(prefix_buf), fmt, args2) + 1; + va_end(args2); + + if (reserve_size > LOG_LINE_MAX) + reserve_size = LOG_LINE_MAX; + + /* Extract log level or control flags. */ + if (facility == 0) + parse_prefix(&prefix_buf[0], &level, &lflags); + + if (level == LOGLEVEL_DEFAULT) + level = default_message_loglevel; + + if (dev_info) + lflags |= LOG_NEWLINE; + + if (lflags & LOG_CONT) { + prb_rec_init_wr(&r, reserve_size); + if (prb_reserve_in_last(&e, prb, &r, caller_id, LOG_LINE_MAX)) { + seq = r.info->seq; + text_len = printk_sprint(&r.text_buf[r.info->text_len], reserve_size, + facility, &lflags, fmt, args); + r.info->text_len += text_len; + + if (lflags & LOG_NEWLINE) { + r.info->flags |= LOG_NEWLINE; + prb_final_commit(&e); + final_commit = true; + } else { + prb_commit(&e); + } + + ret = text_len; + goto out; + } } - boot_delay_msec(level); - printk_delay(); + /* + * Explicitly initialize the record before every prb_reserve() call. + * prb_reserve_in_last() and prb_reserve() purposely invalidate the + * structure when they fail. + */ + prb_rec_init_wr(&r, reserve_size); + if (!prb_reserve(&e, prb, &r)) { + /* truncate the message if it is too long for empty buffer */ + truncate_msg(&reserve_size, &trunc_msg_len); - /* This stops the holder of console_sem just where we want him */ - logbuf_lock_irqsave(flags); - printed_len = vprintk_store(facility, level, dev_info, fmt, args); - logbuf_unlock_irqrestore(flags); + prb_rec_init_wr(&r, reserve_size + trunc_msg_len); + if (!prb_reserve(&e, prb, &r)) + goto out; + } - /* If called from the scheduler, we can not call up(). */ - if (!in_sched) { - /* - * Disable preemption to avoid being preempted while holding - * console_sem which would prevent anyone from printing to - * console - */ - preempt_disable(); - /* - * Try to acquire and then immediately release the console - * semaphore. The release will print out buffers and wake up - * /dev/kmsg and syslog() users. - */ - if (console_trylock_spinning()) - console_unlock(); - preempt_enable(); + seq = r.info->seq; + + /* fill message */ + text_len = printk_sprint(&r.text_buf[0], reserve_size, facility, &lflags, fmt, args); + if (trunc_msg_len) + memcpy(&r.text_buf[text_len], trunc_msg, trunc_msg_len); + r.info->text_len = text_len + trunc_msg_len; + r.info->facility = facility; + r.info->level = level & 7; + r.info->flags = lflags & 0x1f; + r.info->ts_nsec = ts_nsec; + r.info->caller_id = caller_id; + if (dev_info) + memcpy(&r.info->dev_info, dev_info, sizeof(r.info->dev_info)); + + /* A message without a trailing newline can be continued. */ + if (!(lflags & LOG_NEWLINE)) { + prb_commit(&e); + } else { + prb_final_commit(&e); + final_commit = true; } + ret = text_len + trunc_msg_len; +out: + /* only the kernel may perform synchronous printing */ + if (facility == 0 && final_commit) { + struct console *con; + + for_each_console(con) { + if (console_can_sync(con)) + print_sync_until(con, seq + 1); + } + } + + printk_exit_irqrestore(irqflags); + return ret; +} + +asmlinkage int vprintk_emit(int facility, int level, + const struct dev_printk_info *dev_info, + const char *fmt, va_list args) +{ + int printed_len; + + /* Suppress unimportant messages after panic happens */ + if (unlikely(suppress_printk)) + return 0; + + if (level == LOGLEVEL_SCHED) + level = LOGLEVEL_DEFAULT; + + printed_len = vprintk_store(facility, level, dev_info, fmt, args); + wake_up_klogd(); return printed_len; } EXPORT_SYMBOL(vprintk_emit); -asmlinkage int vprintk(const char *fmt, va_list args) +__printf(1, 0) +static int vprintk_default(const char *fmt, va_list args) { - return vprintk_func(fmt, args); + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } -EXPORT_SYMBOL(vprintk); -int vprintk_default(const char *fmt, va_list args) +__printf(1, 0) +static int vprintk_func(const char *fmt, va_list args) { - return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); +#ifdef CONFIG_KGDB_KDB + /* Allow to pass printk() to kdb but avoid a recursion. */ + if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) + return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); +#endif + return vprintk_default(fmt, args); +} + +asmlinkage int vprintk(const char *fmt, va_list args) +{ + return vprintk_func(fmt, args); } -EXPORT_SYMBOL_GPL(vprintk_default); +EXPORT_SYMBOL(vprintk); /** * printk - print a kernel message @@ -2125,38 +2178,158 @@ asmlinkage __visible int printk(const char *fmt, ...) } EXPORT_SYMBOL(printk); -#else /* CONFIG_PRINTK */ +static int printk_kthread_func(void *data) +{ + struct console *con = data; + unsigned long dropped = 0; + char *dropped_text = NULL; + struct printk_info info; + struct printk_record r; + char *ext_text = NULL; + size_t dropped_len; + int ret = -ENOMEM; + char *text = NULL; + char *write_text; + u64 printk_seq; + size_t len; + int error; + u64 seq; + + if (con->flags & CON_EXTENDED) { + ext_text = kmalloc(CONSOLE_EXT_LOG_MAX, GFP_KERNEL); + if (!ext_text) + goto out; + } + text = kmalloc(LOG_LINE_MAX + PREFIX_MAX, GFP_KERNEL); + dropped_text = kmalloc(64, GFP_KERNEL); + if (!text || !dropped_text) + goto out; -#define LOG_LINE_MAX 0 -#define PREFIX_MAX 0 -#define printk_time false + if (con->flags & CON_EXTENDED) + write_text = ext_text; + else + write_text = text; -#define prb_read_valid(rb, seq, r) false -#define prb_first_valid_seq(rb) 0 + seq = atomic64_read(&con->printk_seq); -static u64 syslog_seq; -static u64 console_seq; -static u64 exclusive_console_stop_seq; -static unsigned long console_dropped; + prb_rec_init_rd(&r, &info, text, LOG_LINE_MAX + PREFIX_MAX); + + for (;;) { + error = wait_event_interruptible(log_wait, + prb_read_valid(prb, seq, &r) || kthread_should_stop()); + + if (kthread_should_stop()) + break; + + if (error) + continue; + + if (seq != r.info->seq) { + dropped += r.info->seq - seq; + seq = r.info->seq; + } + + seq++; + + if (!(con->flags & CON_ENABLED)) + continue; + + if (suppress_message_printing(r.info->level)) + continue; + + if (con->flags & CON_EXTENDED) { + len = info_print_ext_header(ext_text, + CONSOLE_EXT_LOG_MAX, + r.info); + len += msg_print_ext_body(ext_text + len, + CONSOLE_EXT_LOG_MAX - len, + &r.text_buf[0], r.info->text_len, + &r.info->dev_info); + } else { + len = record_print_text(&r, + console_msg_format & MSG_FORMAT_SYSLOG, + printk_time); + } + + printk_seq = atomic64_read(&con->printk_seq); -static size_t record_print_text(const struct printk_record *r, - bool syslog, bool time) + console_lock(); + console_may_schedule = 0; + + if (kernel_sync_mode() && con->write_atomic) { + console_unlock(); + break; + } + + if (!(con->flags & CON_EXTENDED) && dropped) { + dropped_len = snprintf(dropped_text, 64, + "** %lu printk messages dropped **\n", + dropped); + dropped = 0; + + con->write(con, dropped_text, dropped_len); + printk_delay(r.info->level); + } + + con->write(con, write_text, len); + if (len) + printk_delay(r.info->level); + + atomic64_cmpxchg_relaxed(&con->printk_seq, printk_seq, seq); + + console_unlock(); + } +out: + kfree(dropped_text); + kfree(text); + kfree(ext_text); + pr_info("%sconsole [%s%d]: printing thread stopped\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return ret; +} + +/* Must be called within console_lock(). */ +static void start_printk_kthread(struct console *con) { - return 0; + con->thread = kthread_run(printk_kthread_func, con, + "pr/%s%d", con->name, con->index); + if (IS_ERR(con->thread)) { + pr_err("%sconsole [%s%d]: unable to start printing thread\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); + return; + } + pr_info("%sconsole [%s%d]: printing thread started\n", + (con->flags & CON_BOOT) ? "boot" : "", + con->name, con->index); } -static ssize_t info_print_ext_header(char *buf, size_t size, - struct printk_info *info) + +/* protected by console_lock */ +static bool kthreads_started; + +/* Must be called within console_lock(). */ +static void console_try_thread(struct console *con) { - return 0; + if (kthreads_started) { + start_printk_kthread(con); + return; + } + + /* + * The printing threads have not been started yet. If this console + * can print synchronously, print all unprinted messages. + */ + if (console_can_sync(con)) + print_sync_until(con, prb_next_seq(prb)); } -static ssize_t msg_print_ext_body(char *buf, size_t size, - char *text, size_t text_len, - struct dev_printk_info *dev_info) { return 0; } -static void console_lock_spinning_enable(void) { } -static int console_lock_spinning_disable_and_check(void) { return 0; } -static void call_console_drivers(const char *ext_text, size_t ext_len, - const char *text, size_t len) {} -static bool suppress_message_printing(int level) { return false; } + +#else /* CONFIG_PRINTK */ + +#define prb_first_valid_seq(rb) 0 +#define prb_next_seq(rb) 0 + +#define console_try_thread(con) #endif /* CONFIG_PRINTK */ @@ -2401,34 +2574,6 @@ int is_console_locked(void) } EXPORT_SYMBOL(is_console_locked); -/* - * Check if we have any console that is capable of printing while cpu is - * booting or shutting down. Requires console_sem. - */ -static int have_callable_console(void) -{ - struct console *con; - - for_each_console(con) - if ((con->flags & CON_ENABLED) && - (con->flags & CON_ANYTIME)) - return 1; - - return 0; -} - -/* - * Can we actually use the console at this time on this cpu? - * - * Console drivers may assume that per-cpu resources have been allocated. So - * unless they're explicitly marked as being able to cope (CON_ANYTIME) don't - * call them until this CPU is officially up. - */ -static inline int can_use_console(void) -{ - return cpu_online(raw_smp_processor_id()) || have_callable_console(); -} - /** * console_unlock - unlock the console system * @@ -2445,142 +2590,14 @@ static inline int can_use_console(void) */ void console_unlock(void) { - static char ext_text[CONSOLE_EXT_LOG_MAX]; - static char text[LOG_LINE_MAX + PREFIX_MAX]; - unsigned long flags; - bool do_cond_resched, retry; - struct printk_info info; - struct printk_record r; - if (console_suspended) { up_console_sem(); return; } - prb_rec_init_rd(&r, &info, text, sizeof(text)); - - /* - * Console drivers are called with interrupts disabled, so - * @console_may_schedule should be cleared before; however, we may - * end up dumping a lot of lines, for example, if called from - * console registration path, and should invoke cond_resched() - * between lines if allowable. Not doing so can cause a very long - * scheduling stall on a slow console leading to RCU stall and - * softlockup warnings which exacerbate the issue with more - * messages practically incapacitating the system. - * - * console_trylock() is not able to detect the preemptive - * context reliably. Therefore the value must be stored before - * and cleared after the "again" goto label. - */ - do_cond_resched = console_may_schedule; -again: - console_may_schedule = 0; - - /* - * We released the console_sem lock, so we need to recheck if - * cpu is online and (if not) is there at least one CON_ANYTIME - * console. - */ - if (!can_use_console()) { - console_locked = 0; - up_console_sem(); - return; - } - - for (;;) { - size_t ext_len = 0; - size_t len; - - printk_safe_enter_irqsave(flags); - raw_spin_lock(&logbuf_lock); -skip: - if (!prb_read_valid(prb, console_seq, &r)) - break; - - if (console_seq != r.info->seq) { - console_dropped += r.info->seq - console_seq; - console_seq = r.info->seq; - } - - if (suppress_message_printing(r.info->level)) { - /* - * Skip record we have buffered and already printed - * directly to the console when we received it, and - * record that has level above the console loglevel. - */ - console_seq++; - goto skip; - } - - /* Output to all consoles once old messages replayed. */ - if (unlikely(exclusive_console && - console_seq >= exclusive_console_stop_seq)) { - exclusive_console = NULL; - } - - /* - * Handle extended console text first because later - * record_print_text() will modify the record buffer in-place. - */ - if (nr_ext_console_drivers) { - ext_len = info_print_ext_header(ext_text, - sizeof(ext_text), - r.info); - ext_len += msg_print_ext_body(ext_text + ext_len, - sizeof(ext_text) - ext_len, - &r.text_buf[0], - r.info->text_len, - &r.info->dev_info); - } - len = record_print_text(&r, - console_msg_format & MSG_FORMAT_SYSLOG, - printk_time); - console_seq++; - raw_spin_unlock(&logbuf_lock); - - /* - * While actively printing out messages, if another printk() - * were to occur on another CPU, it may wait for this one to - * finish. This task can not be preempted if there is a - * waiter waiting to take over. - */ - console_lock_spinning_enable(); - - stop_critical_timings(); /* don't trace print latency */ - call_console_drivers(ext_text, ext_len, text, len); - start_critical_timings(); - - if (console_lock_spinning_disable_and_check()) { - printk_safe_exit_irqrestore(flags); - return; - } - - printk_safe_exit_irqrestore(flags); - - if (do_cond_resched) - cond_resched(); - } - console_locked = 0; - raw_spin_unlock(&logbuf_lock); - up_console_sem(); - - /* - * Someone could have filled up the buffer again, so re-check if there's - * something to flush. In case we cannot trylock the console_sem again, - * there's a new owner and the console_unlock() from them will do the - * flush, no worries. - */ - raw_spin_lock(&logbuf_lock); - retry = prb_read_valid(prb, console_seq, NULL); - raw_spin_unlock(&logbuf_lock); - printk_safe_exit_irqrestore(flags); - - if (retry && console_trylock()) - goto again; } EXPORT_SYMBOL(console_unlock); @@ -2630,23 +2647,20 @@ void console_unblank(void) */ void console_flush_on_panic(enum con_flush_mode mode) { - /* - * If someone else is holding the console lock, trylock will fail - * and may_schedule may be set. Ignore and proceed to unlock so - * that messages are flushed out. As this can be called from any - * context and we don't want to get preempted while flushing, - * ensure may_schedule is cleared. - */ - console_trylock(); + struct console *c; + u64 seq; + + if (!console_trylock()) + return; + console_may_schedule = 0; if (mode == CONSOLE_REPLAY_ALL) { - unsigned long flags; - - logbuf_lock_irqsave(flags); - console_seq = prb_first_valid_seq(prb); - logbuf_unlock_irqrestore(flags); + seq = prb_first_valid_seq(prb); + for_each_console(c) + atomic64_set(&c->printk_seq, seq); } + console_unlock(); } EXPORT_SYMBOL(console_flush_on_panic); @@ -2782,7 +2796,6 @@ static int try_enable_new_console(struct console *newcon, bool user_specified) */ void register_console(struct console *newcon) { - unsigned long flags; struct console *bcon = NULL; int err; @@ -2806,6 +2819,8 @@ void register_console(struct console *newcon) } } + newcon->thread = NULL; + if (console_drivers && console_drivers->flags & CON_BOOT) bcon = console_drivers; @@ -2847,8 +2862,10 @@ void register_console(struct console *newcon) * the real console are the same physical device, it's annoying to * see the beginning boot messages twice */ - if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) + if (bcon && ((newcon->flags & (CON_CONSDEV | CON_BOOT)) == CON_CONSDEV)) { newcon->flags &= ~CON_PRINTBUFFER; + newcon->flags |= CON_HANDOVER; + } /* * Put this console in the list - keep the @@ -2870,26 +2887,12 @@ void register_console(struct console *newcon) if (newcon->flags & CON_EXTENDED) nr_ext_console_drivers++; - if (newcon->flags & CON_PRINTBUFFER) { - /* - * console_unlock(); will print out the buffered messages - * for us. - */ - logbuf_lock_irqsave(flags); - /* - * We're about to replay the log buffer. Only do this to the - * just-registered console to avoid excessive message spam to - * the already-registered consoles. - * - * Set exclusive_console with disabled interrupts to reduce - * race window with eventual console_flush_on_panic() that - * ignores console_lock. - */ - exclusive_console = newcon; - exclusive_console_stop_seq = console_seq; - console_seq = syslog_seq; - logbuf_unlock_irqrestore(flags); - } + if (newcon->flags & CON_PRINTBUFFER) + atomic64_set(&newcon->printk_seq, 0); + else + atomic64_set(&newcon->printk_seq, prb_next_seq(prb)); + + console_try_thread(newcon); console_unlock(); console_sysfs_notify(); @@ -2963,6 +2966,9 @@ int unregister_console(struct console *console) console_unlock(); console_sysfs_notify(); + if (console->thread && !IS_ERR(console->thread)) + kthread_stop(console->thread); + if (console->exit) res = console->exit(console); @@ -3045,6 +3051,15 @@ static int __init printk_late_init(void) unregister_console(con); } } + +#ifdef CONFIG_PRINTK + console_lock(); + for_each_console(con) + start_printk_kthread(con); + kthreads_started = true; + console_unlock(); +#endif + ret = cpuhp_setup_state_nocalls(CPUHP_PRINTK_DEAD, "printk:dead", NULL, console_cpu_notify); WARN_ON(ret < 0); @@ -3060,7 +3075,6 @@ late_initcall(printk_late_init); * Delayed printk version, for scheduler-internal messages: */ #define PRINTK_PENDING_WAKEUP 0x01 -#define PRINTK_PENDING_OUTPUT 0x02 static DEFINE_PER_CPU(int, printk_pending); @@ -3068,14 +3082,8 @@ static void wake_up_klogd_work_func(struct irq_work *irq_work) { int pending = __this_cpu_xchg(printk_pending, 0); - if (pending & PRINTK_PENDING_OUTPUT) { - /* If trylock fails, someone else is doing the printing */ - if (console_trylock()) - console_unlock(); - } - if (pending & PRINTK_PENDING_WAKEUP) - wake_up_interruptible(&log_wait); + wake_up_interruptible_all(&log_wait); } static DEFINE_PER_CPU(struct irq_work, wake_up_klogd_work) = @@ -3094,25 +3102,10 @@ void wake_up_klogd(void) preempt_enable(); } -void defer_console_output(void) +__printf(1, 0) +static int vprintk_deferred(const char *fmt, va_list args) { - if (!printk_percpu_data_ready()) - return; - - preempt_disable(); - __this_cpu_or(printk_pending, PRINTK_PENDING_OUTPUT); - irq_work_queue(this_cpu_ptr(&wake_up_klogd_work)); - preempt_enable(); -} - -int vprintk_deferred(const char *fmt, va_list args) -{ - int r; - - r = vprintk_emit(0, LOGLEVEL_SCHED, NULL, fmt, args); - defer_console_output(); - - return r; + return vprintk_emit(0, LOGLEVEL_DEFAULT, NULL, fmt, args); } int printk_deferred(const char *fmt, ...) @@ -3251,8 +3244,26 @@ EXPORT_SYMBOL_GPL(kmsg_dump_reason_str); */ void kmsg_dump(enum kmsg_dump_reason reason) { + struct kmsg_dumper_iter iter; struct kmsg_dumper *dumper; - unsigned long flags; + + if (!oops_in_progress) { + /* + * If atomic consoles are available, activate kernel sync mode + * to make sure any final messages are visible. The trailing + * printk message is important to flush any pending messages. + */ + if (have_atomic_console()) { + sync_mode = true; + pr_info("enabled sync mode\n"); + } + + /* + * Give the printing threads time to flush, allowing up to + * 1s of no printing forward progress before giving up. + */ + pr_flush(1000, true); + } rcu_read_lock(); list_for_each_entry_rcu(dumper, &dump_list, list) { @@ -3270,25 +3281,18 @@ void kmsg_dump(enum kmsg_dump_reason reason) continue; /* initialize iterator with data about the stored records */ - dumper->active = true; - - logbuf_lock_irqsave(flags); - dumper->cur_seq = clear_seq; - dumper->next_seq = prb_next_seq(prb); - logbuf_unlock_irqrestore(flags); + iter.active = true; + kmsg_dump_rewind(&iter); /* invoke dumper which will iterate over records */ - dumper->dump(dumper, reason); - - /* reset iterator */ - dumper->active = false; + dumper->dump(dumper, reason, &iter); } rcu_read_unlock(); } /** - * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) - * @dumper: registered kmsg dumper + * kmsg_dump_get_line - retrieve one kmsg log line + * @iter: kmsg dumper iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to * @size: maximum size of the buffer @@ -3302,11 +3306,9 @@ void kmsg_dump(enum kmsg_dump_reason reason) * * A return value of FALSE indicates that there are no more records to * read. - * - * The function is similar to kmsg_dump_get_line(), but grabs no locks. */ -bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) +bool kmsg_dump_get_line(struct kmsg_dumper_iter *iter, bool syslog, + char *line, size_t size, size_t *len) { struct printk_info info; unsigned int line_count; @@ -3316,16 +3318,16 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, prb_rec_init_rd(&r, &info, line, size); - if (!dumper->active) + if (!iter->active) goto out; /* Read text or count text lines? */ if (line) { - if (!prb_read_valid(prb, dumper->cur_seq, &r)) + if (!prb_read_valid(prb, iter->cur_seq, &r)) goto out; l = record_print_text(&r, syslog, printk_time); } else { - if (!prb_read_valid_info(prb, dumper->cur_seq, + if (!prb_read_valid_info(prb, iter->cur_seq, &info, &line_count)) { goto out; } @@ -3334,48 +3336,18 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, } - dumper->cur_seq = r.info->seq + 1; + iter->cur_seq = r.info->seq + 1; ret = true; out: if (len) *len = l; return ret; } - -/** - * kmsg_dump_get_line - retrieve one kmsg log line - * @dumper: registered kmsg dumper - * @syslog: include the "<4>" prefixes - * @line: buffer to copy the line to - * @size: maximum size of the buffer - * @len: length of line placed into buffer - * - * Start at the beginning of the kmsg buffer, with the oldest kmsg - * record, and copy one record into the provided buffer. - * - * Consecutive calls will return the next available record moving - * towards the end of the buffer with the youngest messages. - * - * A return value of FALSE indicates that there are no more records to - * read. - */ -bool kmsg_dump_get_line(struct kmsg_dumper *dumper, bool syslog, - char *line, size_t size, size_t *len) -{ - unsigned long flags; - bool ret; - - logbuf_lock_irqsave(flags); - ret = kmsg_dump_get_line_nolock(dumper, syslog, line, size, len); - logbuf_unlock_irqrestore(flags); - - return ret; -} EXPORT_SYMBOL_GPL(kmsg_dump_get_line); /** * kmsg_dump_get_buffer - copy kmsg log lines - * @dumper: registered kmsg dumper + * @iter: kmsg dumper iterator * @syslog: include the "<4>" prefixes * @buf: buffer to copy the line to * @size: maximum size of the buffer @@ -3392,116 +3364,256 @@ EXPORT_SYMBOL_GPL(kmsg_dump_get_line); * A return value of FALSE indicates that there are no more records to * read. */ -bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, - char *buf, size_t size, size_t *len) +bool kmsg_dump_get_buffer(struct kmsg_dumper_iter *iter, bool syslog, + char *buf, size_t size, size_t *len_out) { struct printk_info info; - unsigned int line_count; struct printk_record r; - unsigned long flags; u64 seq; u64 next_seq; - size_t l = 0; + size_t len = 0; bool ret = false; bool time = printk_time; - prb_rec_init_rd(&r, &info, buf, size); - - if (!dumper->active || !buf || !size) + if (!iter->active || !buf || !size) goto out; - logbuf_lock_irqsave(flags); - if (prb_read_valid_info(prb, dumper->cur_seq, &info, NULL)) { - if (info.seq != dumper->cur_seq) { + if (prb_read_valid_info(prb, iter->cur_seq, &info, NULL)) { + if (info.seq != iter->cur_seq) { /* messages are gone, move to first available one */ - dumper->cur_seq = info.seq; + iter->cur_seq = info.seq; } } /* last entry */ - if (dumper->cur_seq >= dumper->next_seq) { - logbuf_unlock_irqrestore(flags); + if (iter->cur_seq >= iter->next_seq) goto out; - } - /* calculate length of entire buffer */ - seq = dumper->cur_seq; - while (prb_read_valid_info(prb, seq, &info, &line_count)) { - if (r.info->seq >= dumper->next_seq) - break; - l += get_record_print_text_size(&info, line_count, syslog, time); - seq = r.info->seq + 1; - } - - /* move first record forward until length fits into the buffer */ - seq = dumper->cur_seq; - while (l >= size && prb_read_valid_info(prb, seq, - &info, &line_count)) { - if (r.info->seq >= dumper->next_seq) - break; - l -= get_record_print_text_size(&info, line_count, syslog, time); - seq = r.info->seq + 1; - } + /* + * Find first record that fits, including all following records, + * into the user-provided buffer for this dump. Pass in size-1 + * because this function (by way of record_print_text()) will + * not write more than size-1 bytes of text into @buf. + */ + seq = find_first_fitting_seq(iter->cur_seq, iter->next_seq, + size - 1, syslog, time); - /* last message in next interation */ + /* + * Next kmsg_dump_get_buffer() invocation will dump block of + * older records stored right before this one. + */ next_seq = seq; - /* actually read text into the buffer now */ - l = 0; - while (prb_read_valid(prb, seq, &r)) { - if (r.info->seq >= dumper->next_seq) - break; + prb_rec_init_rd(&r, &info, buf, size); - l += record_print_text(&r, syslog, time); + len = 0; + prb_for_each_record(seq, prb, seq, &r) { + if (r.info->seq >= iter->next_seq) + break; - /* adjust record to store to remaining buffer space */ - prb_rec_init_rd(&r, &info, buf + l, size - l); + len += record_print_text(&r, syslog, time); - seq = r.info->seq + 1; + /* Adjust record to store to remaining buffer space. */ + prb_rec_init_rd(&r, &info, buf + len, size - len); } - dumper->next_seq = next_seq; + iter->next_seq = next_seq; ret = true; - logbuf_unlock_irqrestore(flags); out: - if (len) - *len = l; + if (len_out) + *len_out = len; return ret; } EXPORT_SYMBOL_GPL(kmsg_dump_get_buffer); /** - * kmsg_dump_rewind_nolock - reset the iterator (unlocked version) - * @dumper: registered kmsg dumper + * kmsg_dump_rewind - reset the iterator + * @iter: kmsg dumper iterator * * Reset the dumper's iterator so that kmsg_dump_get_line() and * kmsg_dump_get_buffer() can be called again and used multiple * times within the same dumper.dump() callback. + */ +void kmsg_dump_rewind(struct kmsg_dumper_iter *iter) +{ + iter->cur_seq = latched_seq_read_nolock(&clear_seq); + iter->next_seq = prb_next_seq(prb); +} +EXPORT_SYMBOL_GPL(kmsg_dump_rewind); + +#endif + +struct prb_cpulock { + atomic_t owner; + unsigned long __percpu *irqflags; +}; + +#define DECLARE_STATIC_PRINTKRB_CPULOCK(name) \ +static DEFINE_PER_CPU(unsigned long, _##name##_percpu_irqflags); \ +static struct prb_cpulock name = { \ + .owner = ATOMIC_INIT(-1), \ + .irqflags = &_##name##_percpu_irqflags, \ +} + +static bool __prb_trylock(struct prb_cpulock *cpu_lock, + unsigned int *cpu_store) +{ + unsigned long *flags; + unsigned int cpu; + + cpu = get_cpu(); + + *cpu_store = atomic_read(&cpu_lock->owner); + /* memory barrier to ensure the current lock owner is visible */ + smp_rmb(); + if (*cpu_store == -1) { + flags = per_cpu_ptr(cpu_lock->irqflags, cpu); + local_irq_save(*flags); + if (atomic_try_cmpxchg_acquire(&cpu_lock->owner, + cpu_store, cpu)) { + return true; + } + local_irq_restore(*flags); + } else if (*cpu_store == cpu) { + return true; + } + + put_cpu(); + return false; +} + +/* + * prb_lock: Perform a processor-reentrant spin lock. + * @cpu_lock: A pointer to the lock object. + * @cpu_store: A "flags" pointer to store lock status information. + * + * If no processor has the lock, the calling processor takes the lock and + * becomes the owner. If the calling processor is already the owner of the + * lock, this function succeeds immediately. If lock is locked by another + * processor, this function spins until the calling processor becomes the + * owner. * - * The function is similar to kmsg_dump_rewind(), but grabs no locks. + * It is safe to call this function from any context and state. */ -void kmsg_dump_rewind_nolock(struct kmsg_dumper *dumper) +static void prb_lock(struct prb_cpulock *cpu_lock, unsigned int *cpu_store) { - dumper->cur_seq = clear_seq; - dumper->next_seq = prb_next_seq(prb); + for (;;) { + if (__prb_trylock(cpu_lock, cpu_store)) + break; + cpu_relax(); + } } -/** - * kmsg_dump_rewind - reset the iterator - * @dumper: registered kmsg dumper +/* + * prb_unlock: Perform a processor-reentrant spin unlock. + * @cpu_lock: A pointer to the lock object. + * @cpu_store: A "flags" object storing lock status information. * - * Reset the dumper's iterator so that kmsg_dump_get_line() and - * kmsg_dump_get_buffer() can be called again and used multiple - * times within the same dumper.dump() callback. + * Release the lock. The calling processor must be the owner of the lock. + * + * It is safe to call this function from any context and state. */ -void kmsg_dump_rewind(struct kmsg_dumper *dumper) +static void prb_unlock(struct prb_cpulock *cpu_lock, unsigned int cpu_store) { - unsigned long flags; + unsigned long *flags; + unsigned int cpu; + + cpu = atomic_read(&cpu_lock->owner); + atomic_set_release(&cpu_lock->owner, cpu_store); + + if (cpu_store == -1) { + flags = per_cpu_ptr(cpu_lock->irqflags, cpu); + local_irq_restore(*flags); + } - logbuf_lock_irqsave(flags); - kmsg_dump_rewind_nolock(dumper); - logbuf_unlock_irqrestore(flags); + put_cpu(); } -EXPORT_SYMBOL_GPL(kmsg_dump_rewind); -#endif +DECLARE_STATIC_PRINTKRB_CPULOCK(printk_cpulock); + +void console_atomic_lock(unsigned int *flags) +{ + prb_lock(&printk_cpulock, flags); +} +EXPORT_SYMBOL(console_atomic_lock); + +void console_atomic_unlock(unsigned int flags) +{ + prb_unlock(&printk_cpulock, flags); +} +EXPORT_SYMBOL(console_atomic_unlock); + +static void pr_msleep(bool may_sleep, int ms) +{ + if (may_sleep) { + msleep(ms); + } else { + while (ms--) + udelay(1000); + } +} + +/** + * pr_flush() - Wait for printing threads to catch up. + * + * @timeout_ms: The maximum time (in ms) to wait. + * @reset_on_progress: Reset the timeout if forward progress is seen. + * + * A value of 0 for @timeout_ms means no waiting will occur. A value of -1 + * represents infinite waiting. + * + * If @reset_on_progress is true, the timeout will be reset whenever any + * printer has been seen to make some forward progress. + * + * Context: Any context. + * Return: true if all enabled printers are caught up. + */ +bool pr_flush(int timeout_ms, bool reset_on_progress) +{ + int remaining = timeout_ms; + struct console *con; + u64 last_diff = 0; + bool may_sleep; + u64 printk_seq; + u64 diff; + u64 seq; + + may_sleep = (preemptible() && + !in_softirq() && + system_state >= SYSTEM_RUNNING); + + seq = prb_next_seq(prb); + + for (;;) { + diff = 0; + + for_each_console(con) { + if (!(con->flags & CON_ENABLED)) + continue; + printk_seq = atomic64_read(&con->printk_seq); + if (printk_seq < seq) + diff += seq - printk_seq; + } + + if (diff != last_diff && reset_on_progress) + remaining = timeout_ms; + + if (!diff || remaining == 0) + break; + + if (remaining < 0) { + pr_msleep(may_sleep, 100); + } else if (remaining < 100) { + pr_msleep(may_sleep, remaining); + remaining = 0; + } else { + pr_msleep(may_sleep, 100); + remaining -= 100; + } + + last_diff = diff; + } + + return (diff == 0); +} +EXPORT_SYMBOL(pr_flush); diff --git a/kernel/printk/printk_safe.c b/kernel/printk/printk_safe.c index b774685cc..e69de29bb 100644 --- a/kernel/printk/printk_safe.c +++ b/kernel/printk/printk_safe.c @@ -1,425 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-or-later -/* - * printk_safe.c - Safe printk for printk-deadlock-prone contexts - */ - -#include <linux/preempt.h> -#include <linux/spinlock.h> -#include <linux/debug_locks.h> -#include <linux/kdb.h> -#include <linux/smp.h> -#include <linux/cpumask.h> -#include <linux/irq_work.h> -#include <linux/printk.h> -#include <linux/kprobes.h> - -#include "internal.h" - -/* - * printk() could not take logbuf_lock in NMI context. Instead, - * it uses an alternative implementation that temporary stores - * the strings into a per-CPU buffer. The content of the buffer - * is later flushed into the main ring buffer via IRQ work. - * - * The alternative implementation is chosen transparently - * by examining current printk() context mask stored in @printk_context - * per-CPU variable. - * - * The implementation allows to flush the strings also from another CPU. - * There are situations when we want to make sure that all buffers - * were handled or when IRQs are blocked. - */ - -#define SAFE_LOG_BUF_LEN ((1 << CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT) - \ - sizeof(atomic_t) - \ - sizeof(atomic_t) - \ - sizeof(struct irq_work)) - -struct printk_safe_seq_buf { - atomic_t len; /* length of written data */ - atomic_t message_lost; - struct irq_work work; /* IRQ work that flushes the buffer */ - unsigned char buffer[SAFE_LOG_BUF_LEN]; -}; - -static DEFINE_PER_CPU(struct printk_safe_seq_buf, safe_print_seq); -static DEFINE_PER_CPU(int, printk_context); - -static DEFINE_RAW_SPINLOCK(safe_read_lock); - -#ifdef CONFIG_PRINTK_NMI -static DEFINE_PER_CPU(struct printk_safe_seq_buf, nmi_print_seq); -#endif - -/* Get flushed in a more safe context. */ -static void queue_flush_work(struct printk_safe_seq_buf *s) -{ - if (printk_percpu_data_ready()) - irq_work_queue(&s->work); -} - -/* - * Add a message to per-CPU context-dependent buffer. NMI and printk-safe - * have dedicated buffers, because otherwise printk-safe preempted by - * NMI-printk would have overwritten the NMI messages. - * - * The messages are flushed from irq work (or from panic()), possibly, - * from other CPU, concurrently with printk_safe_log_store(). Should this - * happen, printk_safe_log_store() will notice the buffer->len mismatch - * and repeat the write. - */ -static __printf(2, 0) int printk_safe_log_store(struct printk_safe_seq_buf *s, - const char *fmt, va_list args) -{ - int add; - size_t len; - va_list ap; - -again: - len = atomic_read(&s->len); - - /* The trailing '\0' is not counted into len. */ - if (len >= sizeof(s->buffer) - 1) { - atomic_inc(&s->message_lost); - queue_flush_work(s); - return 0; - } - - /* - * Make sure that all old data have been read before the buffer - * was reset. This is not needed when we just append data. - */ - if (!len) - smp_rmb(); - - va_copy(ap, args); - add = vscnprintf(s->buffer + len, sizeof(s->buffer) - len, fmt, ap); - va_end(ap); - if (!add) - return 0; - - /* - * Do it once again if the buffer has been flushed in the meantime. - * Note that atomic_cmpxchg() is an implicit memory barrier that - * makes sure that the data were written before updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, len + add) != len) - goto again; - - queue_flush_work(s); - return add; -} - -static inline void printk_safe_flush_line(const char *text, int len) -{ - /* - * Avoid any console drivers calls from here, because we may be - * in NMI or printk_safe context (when in panic). The messages - * must go only into the ring buffer at this stage. Consoles will - * get explicitly called later when a crashdump is not generated. - */ - printk_deferred("%.*s", len, text); -} - -/* printk part of the temporary buffer line by line */ -static int printk_safe_flush_buffer(const char *start, size_t len) -{ - const char *c, *end; - bool header; - - c = start; - end = start + len; - header = true; - - /* Print line by line. */ - while (c < end) { - if (*c == '\n') { - printk_safe_flush_line(start, c - start + 1); - start = ++c; - header = true; - continue; - } - - /* Handle continuous lines or missing new line. */ - if ((c + 1 < end) && printk_get_level(c)) { - if (header) { - c = printk_skip_level(c); - continue; - } - - printk_safe_flush_line(start, c - start); - start = c++; - header = true; - continue; - } - - header = false; - c++; - } - - /* Check if there was a partial line. Ignore pure header. */ - if (start < end && !header) { - static const char newline[] = KERN_CONT "\n"; - - printk_safe_flush_line(start, end - start); - printk_safe_flush_line(newline, strlen(newline)); - } - - return len; -} - -static void report_message_lost(struct printk_safe_seq_buf *s) -{ - int lost = atomic_xchg(&s->message_lost, 0); - - if (lost) - printk_deferred("Lost %d message(s)!\n", lost); -} - -/* - * Flush data from the associated per-CPU buffer. The function - * can be called either via IRQ work or independently. - */ -static void __printk_safe_flush(struct irq_work *work) -{ - struct printk_safe_seq_buf *s = - container_of(work, struct printk_safe_seq_buf, work); - unsigned long flags; - size_t len; - int i; - - /* - * The lock has two functions. First, one reader has to flush all - * available message to make the lockless synchronization with - * writers easier. Second, we do not want to mix messages from - * different CPUs. This is especially important when printing - * a backtrace. - */ - raw_spin_lock_irqsave(&safe_read_lock, flags); - - i = 0; -more: - len = atomic_read(&s->len); - - /* - * This is just a paranoid check that nobody has manipulated - * the buffer an unexpected way. If we printed something then - * @len must only increase. Also it should never overflow the - * buffer size. - */ - if ((i && i >= len) || len > sizeof(s->buffer)) { - const char *msg = "printk_safe_flush: internal error\n"; - - printk_safe_flush_line(msg, strlen(msg)); - len = 0; - } - - if (!len) - goto out; /* Someone else has already flushed the buffer. */ - - /* Make sure that data has been written up to the @len */ - smp_rmb(); - i += printk_safe_flush_buffer(s->buffer + i, len - i); - - /* - * Check that nothing has got added in the meantime and truncate - * the buffer. Note that atomic_cmpxchg() is an implicit memory - * barrier that makes sure that the data were copied before - * updating s->len. - */ - if (atomic_cmpxchg(&s->len, len, 0) != len) - goto more; - -out: - report_message_lost(s); - raw_spin_unlock_irqrestore(&safe_read_lock, flags); -} - -/** - * printk_safe_flush - flush all per-cpu nmi buffers. - * - * The buffers are flushed automatically via IRQ work. This function - * is useful only when someone wants to be sure that all buffers have - * been flushed at some point. - */ -void printk_safe_flush(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { -#ifdef CONFIG_PRINTK_NMI - __printk_safe_flush(&per_cpu(nmi_print_seq, cpu).work); -#endif - __printk_safe_flush(&per_cpu(safe_print_seq, cpu).work); - } -} - -/** - * printk_safe_flush_on_panic - flush all per-cpu nmi buffers when the system - * goes down. - * - * Similar to printk_safe_flush() but it can be called even in NMI context when - * the system goes down. It does the best effort to get NMI messages into - * the main ring buffer. - * - * Note that it could try harder when there is only one CPU online. - */ -void printk_safe_flush_on_panic(void) -{ - /* - * Make sure that we could access the main ring buffer. - * Do not risk a double release when more CPUs are up. - */ - if (raw_spin_is_locked(&logbuf_lock)) { - if (num_online_cpus() > 1) - return; - - debug_locks_off(); - raw_spin_lock_init(&logbuf_lock); - } - - if (raw_spin_is_locked(&safe_read_lock)) { - if (num_online_cpus() > 1) - return; - - debug_locks_off(); - raw_spin_lock_init(&safe_read_lock); - } - - printk_safe_flush(); -} -EXPORT_SYMBOL_GPL(printk_safe_flush_on_panic); - -#ifdef CONFIG_PRINTK_NMI -/* - * Safe printk() for NMI context. It uses a per-CPU buffer to - * store the message. NMIs are not nested, so there is always only - * one writer running. But the buffer might get flushed from another - * CPU, so we need to be careful. - */ -static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) -{ - struct printk_safe_seq_buf *s = this_cpu_ptr(&nmi_print_seq); - - return printk_safe_log_store(s, fmt, args); -} - -void noinstr printk_nmi_enter(void) -{ - this_cpu_add(printk_context, PRINTK_NMI_CONTEXT_OFFSET); -} - -void noinstr printk_nmi_exit(void) -{ - this_cpu_sub(printk_context, PRINTK_NMI_CONTEXT_OFFSET); -} - -/* - * Marks a code that might produce many messages in NMI context - * and the risk of losing them is more critical than eventual - * reordering. - * - * It has effect only when called in NMI context. Then printk() - * will try to store the messages into the main logbuf directly - * and use the per-CPU buffers only as a fallback when the lock - * is not available. - */ -void printk_nmi_direct_enter(void) -{ - if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) - this_cpu_or(printk_context, PRINTK_NMI_DIRECT_CONTEXT_MASK); -} - -void printk_nmi_direct_exit(void) -{ - this_cpu_and(printk_context, ~PRINTK_NMI_DIRECT_CONTEXT_MASK); -} - -#else - -static __printf(1, 0) int vprintk_nmi(const char *fmt, va_list args) -{ - return 0; -} - -#endif /* CONFIG_PRINTK_NMI */ - -/* - * Lock-less printk(), to avoid deadlocks should the printk() recurse - * into itself. It uses a per-CPU buffer to store the message, just like - * NMI. - */ -static __printf(1, 0) int vprintk_safe(const char *fmt, va_list args) -{ - struct printk_safe_seq_buf *s = this_cpu_ptr(&safe_print_seq); - - return printk_safe_log_store(s, fmt, args); -} - -/* Can be preempted by NMI. */ -void printk_safe_enter(void) -{ - this_cpu_inc(printk_context); -} -EXPORT_SYMBOL_GPL(printk_safe_enter); - -/* Can be preempted by NMI. */ -void printk_safe_exit(void) -{ - this_cpu_dec(printk_context); -} -EXPORT_SYMBOL_GPL(printk_safe_exit); - -__printf(1, 0) int vprintk_func(const char *fmt, va_list args) -{ -#ifdef CONFIG_KGDB_KDB - /* Allow to pass printk() to kdb but avoid a recursion. */ - if (unlikely(kdb_trap_printk && kdb_printf_cpu < 0)) - return vkdb_printf(KDB_MSGSRC_PRINTK, fmt, args); -#endif - - /* - * Try to use the main logbuf even in NMI. But avoid calling console - * drivers that might have their own locks. - */ - if ((this_cpu_read(printk_context) & PRINTK_NMI_DIRECT_CONTEXT_MASK) && - raw_spin_trylock(&logbuf_lock)) { - int len; - - len = vprintk_store(0, LOGLEVEL_DEFAULT, NULL, fmt, args); - raw_spin_unlock(&logbuf_lock); - defer_console_output(); - return len; - } - - /* Use extra buffer in NMI when logbuf_lock is taken or in safe mode. */ - if (this_cpu_read(printk_context) & PRINTK_NMI_CONTEXT_MASK) - return vprintk_nmi(fmt, args); - - /* Use extra buffer to prevent a recursion deadlock in safe mode. */ - if (this_cpu_read(printk_context) & PRINTK_SAFE_CONTEXT_MASK) - return vprintk_safe(fmt, args); - - /* No obstacles. */ - return vprintk_default(fmt, args); -} - -void __init printk_safe_init(void) -{ - int cpu; - - for_each_possible_cpu(cpu) { - struct printk_safe_seq_buf *s; - - s = &per_cpu(safe_print_seq, cpu); - init_irq_work(&s->work, __printk_safe_flush); - -#ifdef CONFIG_PRINTK_NMI - s = &per_cpu(nmi_print_seq, cpu); - init_irq_work(&s->work, __printk_safe_flush); -#endif - } - - /* Flush pending messages that did not have scheduled IRQ works. */ - printk_safe_flush(); -} diff --git a/kernel/ptrace.c b/kernel/ptrace.c index 0087ce50d..1458950fc 100644 --- a/kernel/ptrace.c +++ b/kernel/ptrace.c @@ -196,7 +196,14 @@ static bool ptrace_freeze_traced(struct task_struct *task) spin_lock_irq(&task->sighand->siglock); if (task_is_traced(task) && !looks_like_a_spurious_pid(task) && !__fatal_signal_pending(task)) { - task->state = __TASK_TRACED; + unsigned long flags; + + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (task->state & __TASK_TRACED) + task->state = __TASK_TRACED; + else + task->saved_state = __TASK_TRACED; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); ret = true; } spin_unlock_irq(&task->sighand->siglock); @@ -206,8 +213,8 @@ static bool ptrace_freeze_traced(struct task_struct *task) static void ptrace_unfreeze_traced(struct task_struct *task) { - if (task->state != __TASK_TRACED) - return; + unsigned long flags; + bool frozen = true; WARN_ON(!task->ptrace || task->parent != current); @@ -216,12 +223,19 @@ static void ptrace_unfreeze_traced(struct task_struct *task) * Recheck state under the lock to close this race. */ spin_lock_irq(&task->sighand->siglock); - if (task->state == __TASK_TRACED) { - if (__fatal_signal_pending(task)) - wake_up_state(task, __TASK_TRACED); - else - task->state = TASK_TRACED; - } + + raw_spin_lock_irqsave(&task->pi_lock, flags); + if (task->state == __TASK_TRACED) + task->state = TASK_TRACED; + else if (task->saved_state == __TASK_TRACED) + task->saved_state = TASK_TRACED; + else + frozen = false; + raw_spin_unlock_irqrestore(&task->pi_lock, flags); + + if (frozen && __fatal_signal_pending(task)) + wake_up_state(task, __TASK_TRACED); + spin_unlock_irq(&task->sighand->siglock); } diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig index 84dfa8dae..e222aa0a5 100644 --- a/kernel/rcu/Kconfig +++ b/kernel/rcu/Kconfig @@ -189,8 +189,8 @@ config RCU_FAST_NO_HZ config RCU_BOOST bool "Enable RCU priority boosting" - depends on RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT - default n + depends on (RT_MUTEXES && PREEMPT_RCU && RCU_EXPERT) || PREEMPT_RT + default y if PREEMPT_RT help This option boosts the priority of preempted RCU readers that block the current preemptible RCU grace period for too long. diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c index b9c45b2d7..b0e409c65 100644 --- a/kernel/rcu/tree.c +++ b/kernel/rcu/tree.c @@ -100,8 +100,10 @@ static struct rcu_state rcu_state = { static bool dump_tree; module_param(dump_tree, bool, 0444); /* By default, use RCU_SOFTIRQ instead of rcuc kthreads. */ -static bool use_softirq = true; +static bool use_softirq = !IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(use_softirq, bool, 0444); +#endif /* Control rcu_node-tree auto-balancing at boot time. */ static bool rcu_fanout_exact; module_param(rcu_fanout_exact, bool, 0444); diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c index 849f0aa99..dd94a602a 100644 --- a/kernel/rcu/update.c +++ b/kernel/rcu/update.c @@ -56,8 +56,10 @@ #ifndef CONFIG_TINY_RCU module_param(rcu_expedited, int, 0); module_param(rcu_normal, int, 0); -static int rcu_normal_after_boot; +static int rcu_normal_after_boot = IS_ENABLED(CONFIG_PREEMPT_RT); +#ifndef CONFIG_PREEMPT_RT module_param(rcu_normal_after_boot, int, 0); +#endif #endif /* #ifndef CONFIG_TINY_RCU */ #ifdef CONFIG_DEBUG_LOCK_ALLOC diff --git a/kernel/sched/core.c b/kernel/sched/core.c index a8358ddba..391a8085a 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -64,7 +64,11 @@ const_debug unsigned int sysctl_sched_features = * Number of tasks to iterate in a single balance run. * Limited because this is done with IRQs disabled. */ +#ifdef CONFIG_PREEMPT_RT +const_debug unsigned int sysctl_sched_nr_migrate = 8; +#else const_debug unsigned int sysctl_sched_nr_migrate = 32; +#endif /* * period over which we measure -rt task CPU usage in us. @@ -502,9 +506,15 @@ static bool set_nr_if_polling(struct task_struct *p) #endif #endif -static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) +static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task, + bool sleeper) { - struct wake_q_node *node = &task->wake_q; + struct wake_q_node *node; + + if (sleeper) + node = &task->wake_q_sleeper; + else + node = &task->wake_q; /* * Atomically grab the task, if ->wake_q is !nil already it means @@ -540,7 +550,13 @@ static bool __wake_q_add(struct wake_q_head *head, struct task_struct *task) */ void wake_q_add(struct wake_q_head *head, struct task_struct *task) { - if (__wake_q_add(head, task)) + if (__wake_q_add(head, task, false)) + get_task_struct(task); +} + +void wake_q_add_sleeper(struct wake_q_head *head, struct task_struct *task) +{ + if (__wake_q_add(head, task, true)) get_task_struct(task); } @@ -563,28 +579,39 @@ void wake_q_add(struct wake_q_head *head, struct task_struct *task) */ void wake_q_add_safe(struct wake_q_head *head, struct task_struct *task) { - if (!__wake_q_add(head, task)) + if (!__wake_q_add(head, task, false)) put_task_struct(task); } -void wake_up_q(struct wake_q_head *head) +void __wake_up_q(struct wake_q_head *head, bool sleeper) { struct wake_q_node *node = head->first; while (node != WAKE_Q_TAIL) { struct task_struct *task; - task = container_of(node, struct task_struct, wake_q); + if (sleeper) + task = container_of(node, struct task_struct, wake_q_sleeper); + else + task = container_of(node, struct task_struct, wake_q); + BUG_ON(!task); /* Task can safely be re-inserted now: */ node = node->next; - task->wake_q.next = NULL; + if (sleeper) + task->wake_q_sleeper.next = NULL; + else + task->wake_q.next = NULL; /* * wake_up_process() executes a full barrier, which pairs with * the queueing in wake_q_add() so as not to miss wakeups. */ - wake_up_process(task); + if (sleeper) + wake_up_lock_sleeper(task); + else + wake_up_process(task); + put_task_struct(task); } } @@ -620,6 +647,48 @@ void resched_curr(struct rq *rq) trace_sched_wake_idle_without_ipi(cpu); } +#ifdef CONFIG_PREEMPT_LAZY + +static int tsk_is_polling(struct task_struct *p) +{ +#ifdef TIF_POLLING_NRFLAG + return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG); +#else + return 0; +#endif +} + +void resched_curr_lazy(struct rq *rq) +{ + struct task_struct *curr = rq->curr; + int cpu; + + if (!sched_feat(PREEMPT_LAZY)) { + resched_curr(rq); + return; + } + + lockdep_assert_held(&rq->lock); + + if (test_tsk_need_resched(curr)) + return; + + if (test_tsk_need_resched_lazy(curr)) + return; + + set_tsk_need_resched_lazy(curr); + + cpu = cpu_of(rq); + if (cpu == smp_processor_id()) + return; + + /* NEED_RESCHED_LAZY must be visible before we test polling */ + smp_mb(); + if (!tsk_is_polling(curr)) + smp_send_reschedule(cpu); +} +#endif + void resched_cpu(int cpu) { struct rq *rq = cpu_rq(cpu); @@ -1702,6 +1771,82 @@ void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags) #ifdef CONFIG_SMP +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static int __set_cpus_allowed_ptr(struct task_struct *p, + const struct cpumask *new_mask, + u32 flags); + +static void migrate_disable_switch(struct rq *rq, struct task_struct *p) +{ + if (likely(!p->migration_disabled)) + return; + + if (p->cpus_ptr != &p->cpus_mask) + return; + + /* + * Violates locking rules! see comment in __do_set_cpus_allowed(). + */ + __do_set_cpus_allowed(p, cpumask_of(rq->cpu), SCA_MIGRATE_DISABLE); +} + +void migrate_disable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled) { + p->migration_disabled++; + return; + } + + trace_sched_migrate_disable_tp(p); + + preempt_disable(); + this_rq()->nr_pinned++; + p->migration_disabled = 1; + preempt_lazy_disable(); + preempt_enable(); +} +EXPORT_SYMBOL_GPL(migrate_disable); + +void migrate_enable(void) +{ + struct task_struct *p = current; + + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). + */ + preempt_disable(); + if (p->cpus_ptr != &p->cpus_mask) + __set_cpus_allowed_ptr(p, &p->cpus_mask, SCA_MIGRATE_ENABLE); + /* + * Mustn't clear migration_disabled() until cpus_ptr points back at the + * regular cpus_mask, otherwise things that race (eg. + * select_fallback_rq) get confused. + */ + barrier(); + p->migration_disabled = 0; + this_rq()->nr_pinned--; + preempt_lazy_enable(); + preempt_enable(); + + trace_sched_migrate_enable_tp(p); +} +EXPORT_SYMBOL_GPL(migrate_enable); + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return rq->nr_pinned; +} + /* * Per-CPU kthreads are allowed to run on !active && online CPUs, see * __set_cpus_allowed_ptr() and select_fallback_rq(). @@ -1711,7 +1856,7 @@ static inline bool is_cpu_allowed(struct task_struct *p, int cpu) if (!cpumask_test_cpu(cpu, p->cpus_ptr)) return false; - if (is_per_cpu_kthread(p)) + if (is_per_cpu_kthread(p) || is_migration_disabled(p)) return cpu_online(cpu); return cpu_active(cpu); @@ -1756,8 +1901,21 @@ static struct rq *move_queued_task(struct rq *rq, struct rq_flags *rf, } struct migration_arg { - struct task_struct *task; - int dest_cpu; + struct task_struct *task; + int dest_cpu; + struct set_affinity_pending *pending; +}; + +/* + * @refs: number of wait_for_completion() + * @stop_pending: is @stop_work in use + */ +struct set_affinity_pending { + refcount_t refs; + unsigned int stop_pending; + struct completion done; + struct cpu_stop_work stop_work; + struct migration_arg arg; }; /* @@ -1790,15 +1948,17 @@ static struct rq *__migrate_task(struct rq *rq, struct rq_flags *rf, static int migration_cpu_stop(void *data) { struct migration_arg *arg = data; + struct set_affinity_pending *pending = arg->pending; struct task_struct *p = arg->task; struct rq *rq = this_rq(); + bool complete = false; struct rq_flags rf; /* * The original target CPU might have gone down and we might * be on another CPU but it doesn't matter. */ - local_irq_disable(); + local_irq_save(rf.flags); /* * We need to explicitly wake pending tasks before running * __migrate_task() such that we will not miss enforcing cpus_ptr @@ -1808,21 +1968,121 @@ static int migration_cpu_stop(void *data) raw_spin_lock(&p->pi_lock); rq_lock(rq, &rf); + /* * If task_rq(p) != rq, it cannot be migrated here, because we're * holding rq->lock, if p->on_rq == 0 it cannot get enqueued because * we're holding p->pi_lock. */ if (task_rq(p) == rq) { + if (is_migration_disabled(p)) + goto out; + + if (pending) { + if (p->migration_pending == pending) + p->migration_pending = NULL; + complete = true; + + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) + goto out; + } + if (task_on_rq_queued(p)) rq = __migrate_task(rq, &rf, p, arg->dest_cpu); else p->wake_cpu = arg->dest_cpu; + + /* + * XXX __migrate_task() can fail, at which point we might end + * up running on a dodgy CPU, AFAICT this can only happen + * during CPU hotplug, at which point we'll get pushed out + * anyway, so it's probably not a big deal. + */ + + } else if (pending) { + /* + * This happens when we get migrated between migrate_enable()'s + * preempt_enable() and scheduling the stopper task. At that + * point we're a regular task again and not current anymore. + * + * A !PREEMPT kernel has a giant hole here, which makes it far + * more likely. + */ + + /* + * The task moved before the stopper got to run. We're holding + * ->pi_lock, so the allowed mask is stable - if it got + * somewhere allowed, we're done. + */ + if (cpumask_test_cpu(task_cpu(p), p->cpus_ptr)) { + if (p->migration_pending == pending) + p->migration_pending = NULL; + complete = true; + goto out; + } + + /* + * When migrate_enable() hits a rq mis-match we can't reliably + * determine is_migration_disabled() and so have to chase after + * it. + */ + WARN_ON_ONCE(!pending->stop_pending); + task_rq_unlock(rq, p, &rf); + stop_one_cpu_nowait(task_cpu(p), migration_cpu_stop, + &pending->arg, &pending->stop_work); + return 0; } - rq_unlock(rq, &rf); - raw_spin_unlock(&p->pi_lock); +out: + if (pending) + pending->stop_pending = false; + task_rq_unlock(rq, p, &rf); + + if (complete) + complete_all(&pending->done); + + return 0; +} + +int push_cpu_stop(void *arg) +{ + struct rq *lowest_rq = NULL, *rq = this_rq(); + struct task_struct *p = arg; + + raw_spin_lock_irq(&p->pi_lock); + raw_spin_lock(&rq->lock); + + if (task_rq(p) != rq) + goto out_unlock; + + if (is_migration_disabled(p)) { + p->migration_flags |= MDF_PUSH; + goto out_unlock; + } + + p->migration_flags &= ~MDF_PUSH; + + if (p->sched_class->find_lock_rq) + lowest_rq = p->sched_class->find_lock_rq(p, rq); + + if (!lowest_rq) + goto out_unlock; + + // XXX validate p is still the highest prio task + if (task_rq(p) == rq) { + deactivate_task(rq, p, 0); + set_task_cpu(p, lowest_rq->cpu); + activate_task(lowest_rq, p, 0); + resched_curr(lowest_rq); + } + + double_unlock_balance(rq, lowest_rq); + +out_unlock: + rq->push_busy = false; + raw_spin_unlock(&rq->lock); + raw_spin_unlock_irq(&p->pi_lock); - local_irq_enable(); + put_task_struct(p); return 0; } @@ -1830,18 +2090,39 @@ static int migration_cpu_stop(void *data) * sched_class::set_cpus_allowed must do the below, but is not required to * actually call this function. */ -void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) +void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { + if (flags & (SCA_MIGRATE_ENABLE | SCA_MIGRATE_DISABLE)) { + p->cpus_ptr = new_mask; + return; + } + cpumask_copy(&p->cpus_mask, new_mask); p->nr_cpus_allowed = cpumask_weight(new_mask); } -void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +static void +__do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask, u32 flags) { struct rq *rq = task_rq(p); bool queued, running; - lockdep_assert_held(&p->pi_lock); + /* + * This here violates the locking rules for affinity, since we're only + * supposed to change these variables while holding both rq->lock and + * p->pi_lock. + * + * HOWEVER, it magically works, because ttwu() is the only code that + * accesses these variables under p->pi_lock and only does so after + * smp_cond_load_acquire(&p->on_cpu, !VAL), and we're in __schedule() + * before finish_task(). + * + * XXX do further audits, this smells like something putrid. + */ + if (flags & SCA_MIGRATE_DISABLE) + SCHED_WARN_ON(!p->on_cpu); + else + lockdep_assert_held(&p->pi_lock); queued = task_on_rq_queued(p); running = task_current(rq, p); @@ -1857,7 +2138,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (running) put_prev_task(rq, p); - p->sched_class->set_cpus_allowed(p, new_mask); + p->sched_class->set_cpus_allowed(p, new_mask, flags); if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); @@ -1865,6 +2146,222 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) set_next_task(rq, p); } +void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) +{ + __do_set_cpus_allowed(p, new_mask, 0); +} + +/* + * This function is wildly self concurrent; here be dragons. + * + * + * When given a valid mask, __set_cpus_allowed_ptr() must block until the + * designated task is enqueued on an allowed CPU. If that task is currently + * running, we have to kick it out using the CPU stopper. + * + * Migrate-Disable comes along and tramples all over our nice sandcastle. + * Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * + * P1 *cannot* return from this set_cpus_allowed_ptr() call until P0 executes + * its outermost migrate_enable() (i.e. it exits its Migrate-Disable region). + * This means we need the following scheme: + * + * P0@CPU0 P1 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * <resumes> + * migrate_enable(); + * __set_cpus_allowed_ptr(); + * <wakes local stopper> + * `--> <woken on migration completion> + * + * Now the fun stuff: there may be several P1-like tasks, i.e. multiple + * concurrent set_cpus_allowed_ptr(P0, [*]) calls. CPU affinity changes of any + * task p are serialized by p->pi_lock, which we can leverage: the one that + * should come into effect at the end of the Migrate-Disable region is the last + * one. This means we only need to track a single cpumask (i.e. p->cpus_mask), + * but we still need to properly signal those waiting tasks at the appropriate + * moment. + * + * This is implemented using struct set_affinity_pending. The first + * __set_cpus_allowed_ptr() caller within a given Migrate-Disable region will + * setup an instance of that struct and install it on the targeted task_struct. + * Any and all further callers will reuse that instance. Those then wait for + * a completion signaled at the tail of the CPU stopper callback (1), triggered + * on the end of the Migrate-Disable region (i.e. outermost migrate_enable()). + * + * + * (1) In the cases covered above. There is one more where the completion is + * signaled within affine_move_task() itself: when a subsequent affinity request + * cancels the need for an active migration. Consider: + * + * Initial conditions: P0->cpus_mask = [0, 1] + * + * P0@CPU0 P1 P2 + * + * migrate_disable(); + * <preempted> + * set_cpus_allowed_ptr(P0, [1]); + * <blocks> + * set_cpus_allowed_ptr(P0, [0, 1]); + * <signal completion> + * <awakes> + * + * Note that the above is safe vs a concurrent migrate_enable(), as any + * pending affinity completion is preceded an uninstallion of + * p->migration_pending done with p->pi_lock held. + */ +static int affine_move_task(struct rq *rq, struct task_struct *p, struct rq_flags *rf, + int dest_cpu, unsigned int flags) +{ + struct set_affinity_pending my_pending = { }, *pending = NULL; + bool stop_pending, complete = false; + + /* Can the task run on the task's current CPU? If so, we're done */ + if (cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { + struct task_struct *push_task = NULL; + + if ((flags & SCA_MIGRATE_ENABLE) && + (p->migration_flags & MDF_PUSH) && !rq->push_busy) { + rq->push_busy = true; + push_task = get_task_struct(p); + } + + /* + * If there are pending waiters, but no pending stop_work, + * then complete now. + */ + pending = p->migration_pending; + if (pending && !pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } + + task_rq_unlock(rq, p, rf); + + if (push_task) { + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + p, &rq->push_work); + } + + if (complete) + complete_all(&pending->done); + + return 0; + } + + if (!(flags & SCA_MIGRATE_ENABLE)) { + /* serialized by p->pi_lock */ + if (!p->migration_pending) { + /* Install the request */ + refcount_set(&my_pending.refs, 1); + init_completion(&my_pending.done); + my_pending.arg = (struct migration_arg) { + .task = p, + .dest_cpu = dest_cpu, + .pending = &my_pending, + }; + + p->migration_pending = &my_pending; + } else { + pending = p->migration_pending; + refcount_inc(&pending->refs); + /* + * Affinity has changed, but we've already installed a + * pending. migration_cpu_stop() *must* see this, else + * we risk a completion of the pending despite having a + * task on a disallowed CPU. + * + * Serialized by p->pi_lock, so this is safe. + */ + pending->arg.dest_cpu = dest_cpu; + } + } + pending = p->migration_pending; + /* + * - !MIGRATE_ENABLE: + * we'll have installed a pending if there wasn't one already. + * + * - MIGRATE_ENABLE: + * we're here because the current CPU isn't matching anymore, + * the only way that can happen is because of a concurrent + * set_cpus_allowed_ptr() call, which should then still be + * pending completion. + * + * Either way, we really should have a @pending here. + */ + if (WARN_ON_ONCE(!pending)) { + task_rq_unlock(rq, p, rf); + return -EINVAL; + } + + if (task_running(rq, p) || p->state == TASK_WAKING) { + /* + * MIGRATE_ENABLE gets here because 'p == current', but for + * anything else we cannot do is_migration_disabled(), punt + * and have the stopper function handle it all race-free. + */ + stop_pending = pending->stop_pending; + if (!stop_pending) + pending->stop_pending = true; + + if (flags & SCA_MIGRATE_ENABLE) + p->migration_flags &= ~MDF_PUSH; + + task_rq_unlock(rq, p, rf); + + if (!stop_pending) { + stop_one_cpu_nowait(cpu_of(rq), migration_cpu_stop, + &pending->arg, &pending->stop_work); + } + + if (flags & SCA_MIGRATE_ENABLE) + return 0; + } else { + + if (!is_migration_disabled(p)) { + if (task_on_rq_queued(p)) + rq = move_queued_task(rq, rf, p, dest_cpu); + + if (!pending->stop_pending) { + p->migration_pending = NULL; + complete = true; + } + } + task_rq_unlock(rq, p, rf); + + if (complete) + complete_all(&pending->done); + } + + wait_for_completion(&pending->done); + + if (refcount_dec_and_test(&pending->refs)) + wake_up_var(&pending->refs); /* No UaF, just an address */ + + /* + * Block the original owner of &pending until all subsequent callers + * have seen the completion and decremented the refcount + */ + wait_var_event(&my_pending.refs, !refcount_read(&my_pending.refs)); + + /* ARGH */ + WARN_ON_ONCE(my_pending.stop_pending); + + return 0; +} + /* * Change a given task's CPU affinity. Migrate the thread to a * proper CPU and schedule it away if the CPU it's executing on @@ -1875,7 +2372,8 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) * call is not atomic; no spinlocks may be held. */ static int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { const struct cpumask *cpu_valid_mask = cpu_active_mask; unsigned int dest_cpu; @@ -1886,9 +2384,14 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, rq = task_rq_lock(p, &rf); update_rq_clock(rq); - if (p->flags & PF_KTHREAD) { + if (p->flags & PF_KTHREAD || is_migration_disabled(p)) { /* - * Kernel threads are allowed on online && !active CPUs + * Kernel threads are allowed on online && !active CPUs. + * + * Specifically, migration_disabled() tasks must not fail the + * cpumask_any_and_distribute() pick below, esp. so on + * SCA_MIGRATE_ENABLE, otherwise we'll not call + * set_cpus_allowed_common() and actually reset p->cpus_ptr. */ cpu_valid_mask = cpu_online_mask; } @@ -1897,13 +2400,22 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, * Must re-check here, to close a race against __kthread_bind(), * sched_setaffinity() is not guaranteed to observe the flag. */ - if (check && (p->flags & PF_NO_SETAFFINITY)) { + if ((flags & SCA_CHECK) && (p->flags & PF_NO_SETAFFINITY)) { ret = -EINVAL; goto out; } - if (cpumask_equal(&p->cpus_mask, new_mask)) - goto out; + if (!(flags & SCA_MIGRATE_ENABLE)) { + if (cpumask_equal(&p->cpus_mask, new_mask)) + goto out; + + if (WARN_ON_ONCE(p == current && + is_migration_disabled(p) && + !cpumask_test_cpu(task_cpu(p), new_mask))) { + ret = -EBUSY; + goto out; + } + } /* * Picking a ~random cpu helps in cases where we are changing affinity @@ -1916,7 +2428,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - do_set_cpus_allowed(p, new_mask); + __do_set_cpus_allowed(p, new_mask, flags); if (p->flags & PF_KTHREAD) { /* @@ -1928,23 +2440,8 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, p->nr_cpus_allowed != 1); } - /* Can the task run on the task's current CPU? If so, we're done */ - if (cpumask_test_cpu(task_cpu(p), new_mask)) - goto out; + return affine_move_task(rq, p, &rf, dest_cpu, flags); - if (task_running(rq, p) || p->state == TASK_WAKING) { - struct migration_arg arg = { p, dest_cpu }; - /* Need help from migration thread: drop lock and wait. */ - task_rq_unlock(rq, p, &rf); - stop_one_cpu(cpu_of(rq), migration_cpu_stop, &arg); - return 0; - } else if (task_on_rq_queued(p)) { - /* - * OK, since we're going to drop the lock immediately - * afterwards anyway. - */ - rq = move_queued_task(rq, &rf, p, dest_cpu); - } out: task_rq_unlock(rq, p, &rf); @@ -1953,7 +2450,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) { - return __set_cpus_allowed_ptr(p, new_mask, false); + return __set_cpus_allowed_ptr(p, new_mask, 0); } EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); @@ -1994,6 +2491,8 @@ void set_task_cpu(struct task_struct *p, unsigned int new_cpu) * Clearly, migrating tasks to offline CPUs is a fairly daft thing. */ WARN_ON_ONCE(!cpu_online(new_cpu)); + + WARN_ON_ONCE(is_migration_disabled(p)); #endif trace_sched_migrate_task(p, new_cpu); @@ -2126,6 +2625,18 @@ int migrate_swap(struct task_struct *cur, struct task_struct *p, } #endif /* CONFIG_NUMA_BALANCING */ +static bool check_task_state(struct task_struct *p, long match_state) +{ + bool match = false; + + raw_spin_lock_irq(&p->pi_lock); + if (p->state == match_state || p->saved_state == match_state) + match = true; + raw_spin_unlock_irq(&p->pi_lock); + + return match; +} + /* * wait_task_inactive - wait for a thread to unschedule. * @@ -2170,7 +2681,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) * is actually now running somewhere else! */ while (task_running(rq, p)) { - if (match_state && unlikely(p->state != match_state)) + if (match_state && !check_task_state(p, match_state)) return 0; cpu_relax(); } @@ -2185,7 +2696,8 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) running = task_running(rq, p); queued = task_on_rq_queued(p); ncsw = 0; - if (!match_state || p->state == match_state) + if (!match_state || p->state == match_state || + p->saved_state == match_state) ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ task_rq_unlock(rq, p, &rf); @@ -2219,7 +2731,7 @@ unsigned long wait_task_inactive(struct task_struct *p, long match_state) ktime_t to = NSEC_PER_SEC / HZ; set_current_state(TASK_UNINTERRUPTIBLE); - schedule_hrtimeout(&to, HRTIMER_MODE_REL); + schedule_hrtimeout(&to, HRTIMER_MODE_REL_HARD); continue; } @@ -2324,6 +2836,12 @@ static int select_fallback_rq(int cpu, struct task_struct *p) } fallthrough; case possible: + /* + * XXX When called from select_task_rq() we only + * hold p->pi_lock and again violate locking order. + * + * More yuck to audit. + */ do_set_cpus_allowed(p, cpu_possible_mask); state = fail; break; @@ -2358,7 +2876,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) { lockdep_assert_held(&p->pi_lock); - if (p->nr_cpus_allowed > 1) + if (p->nr_cpus_allowed > 1 && !is_migration_disabled(p)) cpu = p->sched_class->select_task_rq(p, cpu, sd_flags, wake_flags); else cpu = cpumask_any(p->cpus_ptr); @@ -2381,6 +2899,7 @@ int select_task_rq(struct task_struct *p, int cpu, int sd_flags, int wake_flags) void sched_set_stop_task(int cpu, struct task_struct *stop) { + static struct lock_class_key stop_pi_lock; struct sched_param param = { .sched_priority = MAX_RT_PRIO - 1 }; struct task_struct *old_stop = cpu_rq(cpu)->stop; @@ -2396,6 +2915,20 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) sched_setscheduler_nocheck(stop, SCHED_FIFO, ¶m); stop->sched_class = &stop_sched_class; + + /* + * The PI code calls rt_mutex_setprio() with ->pi_lock held to + * adjust the effective priority of a task. As a result, + * rt_mutex_setprio() can trigger (RT) balancing operations, + * which can then trigger wakeups of the stop thread to push + * around the current task. + * + * The stop task itself will never be part of the PI-chain, it + * never blocks, therefore that ->pi_lock recursion is safe. + * Tell lockdep about this by placing the stop->pi_lock in its + * own class. + */ + lockdep_set_class(&stop->pi_lock, &stop_pi_lock); } cpu_rq(cpu)->stop = stop; @@ -2409,15 +2942,23 @@ void sched_set_stop_task(int cpu, struct task_struct *stop) } } -#else +#else /* CONFIG_SMP */ static inline int __set_cpus_allowed_ptr(struct task_struct *p, - const struct cpumask *new_mask, bool check) + const struct cpumask *new_mask, + u32 flags) { return set_cpus_allowed_ptr(p, new_mask); } -#endif /* CONFIG_SMP */ +static inline void migrate_disable_switch(struct rq *rq, struct task_struct *p) { } + +static inline bool rq_has_pinned_tasks(struct rq *rq) +{ + return false; +} + +#endif /* !CONFIG_SMP */ static void ttwu_stat(struct task_struct *p, int cpu, int wake_flags) @@ -2838,7 +3379,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) int cpu, success = 0; preempt_disable(); - if (p == current) { + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) * == smp_processor_id()'. Together this means we can special @@ -2868,8 +3409,26 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) */ raw_spin_lock_irqsave(&p->pi_lock, flags); smp_mb__after_spinlock(); - if (!(p->state & state)) + if (!(p->state & state)) { + /* + * The task might be running due to a spinlock sleeper + * wakeup. Check the saved state and set it to running + * if the wakeup condition is true. + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) { + if (p->saved_state & state) { + p->saved_state = TASK_RUNNING; + success = 1; + } + } goto unlock; + } + /* + * If this is a regular wakeup, then we can unconditionally + * clear the saved state of a "lock sleeper". + */ + if (!(wake_flags & WF_LOCK_SLEEPER)) + p->saved_state = TASK_RUNNING; trace_sched_waking(p); @@ -3058,6 +3617,18 @@ int wake_up_process(struct task_struct *p) } EXPORT_SYMBOL(wake_up_process); +/** + * wake_up_lock_sleeper - Wake up a specific process blocked on a "sleeping lock" + * @p: The process to be woken up. + * + * Same as wake_up_process() above, but wake_flags=WF_LOCK_SLEEPER to indicate + * the nature of the wakeup. + */ +int wake_up_lock_sleeper(struct task_struct *p) +{ + return try_to_wake_up(p, TASK_UNINTERRUPTIBLE, WF_LOCK_SLEEPER); +} + int wake_up_state(struct task_struct *p, unsigned int state) { return try_to_wake_up(p, state, 0); @@ -3111,6 +3682,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) init_numa_balancing(clone_flags, p); #ifdef CONFIG_SMP p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; #endif } @@ -3315,6 +3887,9 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) p->on_cpu = 0; #endif init_task_preempt_count(p); +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(p)->preempt_lazy_count = 0; +#endif #ifdef CONFIG_SMP plist_node_init(&p->pushable_tasks, MAX_PRIO); RB_CLEAR_NODE(&p->pushable_dl_tasks); @@ -3485,51 +4060,135 @@ fire_sched_out_preempt_notifiers(struct task_struct *curr, __fire_sched_out_preempt_notifiers(curr, next); } -#else /* !CONFIG_PREEMPT_NOTIFIERS */ +#else /* !CONFIG_PREEMPT_NOTIFIERS */ + +static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +{ +} + +static inline void +fire_sched_out_preempt_notifiers(struct task_struct *curr, + struct task_struct *next) +{ +} + +#endif /* CONFIG_PREEMPT_NOTIFIERS */ + +static inline void prepare_task(struct task_struct *next) +{ +#ifdef CONFIG_SMP + /* + * Claim the task as running, we do this before switching to it + * such that any running task will have this set. + * + * See the ttwu() WF_ON_CPU case and its ordering comment. + */ + WRITE_ONCE(next->on_cpu, 1); +#endif +} + +static inline void finish_task(struct task_struct *prev) +{ +#ifdef CONFIG_SMP + /* + * This must be the very last reference to @prev from this CPU. After + * p->on_cpu is cleared, the task can be moved to a different CPU. We + * must ensure this doesn't happen until the switch is completely + * finished. + * + * In particular, the load of prev->state in finish_task_switch() must + * happen before this. + * + * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). + */ + smp_store_release(&prev->on_cpu, 0); +#endif +} + +#ifdef CONFIG_SMP + +static void do_balance_callbacks(struct rq *rq, struct callback_head *head) +{ + void (*func)(struct rq *rq); + struct callback_head *next; + + lockdep_assert_held(&rq->lock); + + while (head) { + func = (void (*)(struct rq *))head->func; + next = head->next; + head->next = NULL; + head = next; + + func(rq); + } +} + +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) +{ + struct callback_head *head = rq->balance_callback; + + lockdep_assert_held(&rq->lock); + if (head) { + rq->balance_callback = NULL; + rq->balance_flags &= ~BALANCE_WORK; + } + + return head; +} + +static void __balance_callbacks(struct rq *rq) +{ + do_balance_callbacks(rq, splice_balance_callbacks(rq)); +} + +static inline void balance_callbacks(struct rq *rq, struct callback_head *head) +{ + unsigned long flags; + + if (unlikely(head)) { + raw_spin_lock_irqsave(&rq->lock, flags); + do_balance_callbacks(rq, head); + raw_spin_unlock_irqrestore(&rq->lock, flags); + } +} + +static void balance_push(struct rq *rq); + +static inline void balance_switch(struct rq *rq) +{ + if (likely(!rq->balance_flags)) + return; + + if (rq->balance_flags & BALANCE_PUSH) { + balance_push(rq); + return; + } + + __balance_callbacks(rq); +} + +#else -static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) +static inline void __balance_callbacks(struct rq *rq) { } -static inline void -fire_sched_out_preempt_notifiers(struct task_struct *curr, - struct task_struct *next) +static inline struct callback_head *splice_balance_callbacks(struct rq *rq) { + return NULL; } -#endif /* CONFIG_PREEMPT_NOTIFIERS */ - -static inline void prepare_task(struct task_struct *next) +static inline void balance_callbacks(struct rq *rq, struct callback_head *head) { -#ifdef CONFIG_SMP - /* - * Claim the task as running, we do this before switching to it - * such that any running task will have this set. - * - * See the ttwu() WF_ON_CPU case and its ordering comment. - */ - WRITE_ONCE(next->on_cpu, 1); -#endif } -static inline void finish_task(struct task_struct *prev) +static inline void balance_switch(struct rq *rq) { -#ifdef CONFIG_SMP - /* - * This must be the very last reference to @prev from this CPU. After - * p->on_cpu is cleared, the task can be moved to a different CPU. We - * must ensure this doesn't happen until the switch is completely - * finished. - * - * In particular, the load of prev->state in finish_task_switch() must - * happen before this. - * - * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). - */ - smp_store_release(&prev->on_cpu, 0); -#endif } +#endif + static inline void prepare_lock_switch(struct rq *rq, struct task_struct *next, struct rq_flags *rf) { @@ -3555,6 +4214,7 @@ static inline void finish_lock_switch(struct rq *rq) * prev into current: */ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); + balance_switch(rq); raw_spin_unlock_irq(&rq->lock); } @@ -3570,6 +4230,22 @@ static inline void finish_lock_switch(struct rq *rq) # define finish_arch_post_lock_switch() do { } while (0) #endif +static inline void kmap_local_sched_out(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_out(); +#endif +} + +static inline void kmap_local_sched_in(void) +{ +#ifdef CONFIG_KMAP_LOCAL + if (unlikely(current->kmap_ctrl.idx)) + __kmap_local_sched_in(); +#endif +} + /** * prepare_task_switch - prepare to switch tasks * @rq: the runqueue preparing to switch @@ -3592,6 +4268,7 @@ prepare_task_switch(struct rq *rq, struct task_struct *prev, perf_event_task_sched_out(prev, next); rseq_preempt(prev); fire_sched_out_preempt_notifiers(prev, next); + kmap_local_sched_out(); prepare_task(next); prepare_arch_switch(next); } @@ -3659,6 +4336,7 @@ static struct rq *finish_task_switch(struct task_struct *prev) finish_lock_switch(rq); finish_arch_post_lock_switch(); kcov_finish_switch(current); + kmap_local_sched_in(); fire_sched_in_preempt_notifiers(current); /* @@ -3673,66 +4351,24 @@ static struct rq *finish_task_switch(struct task_struct *prev) * provided by mmdrop(), * - a sync_core for SYNC_CORE. */ + /* + * We use mmdrop_delayed() here so we don't have to do the + * full __mmdrop() when we are the last user. + */ if (mm) { membarrier_mm_sync_core_before_usermode(mm); - mmdrop(mm); + mmdrop_delayed(mm); } if (unlikely(prev_state == TASK_DEAD)) { if (prev->sched_class->task_dead) prev->sched_class->task_dead(prev); - /* - * Remove function-return probe instances associated with this - * task and put them back on the free list. - */ - kprobe_flush_task(prev); - - /* Task is done with its stack. */ - put_task_stack(prev); - put_task_struct_rcu_user(prev); } return rq; } -#ifdef CONFIG_SMP - -/* rq->lock is NOT held, but preemption is disabled */ -static void __balance_callback(struct rq *rq) -{ - struct callback_head *head, *next; - void (*func)(struct rq *rq); - unsigned long flags; - - raw_spin_lock_irqsave(&rq->lock, flags); - head = rq->balance_callback; - rq->balance_callback = NULL; - while (head) { - func = (void (*)(struct rq *))head->func; - next = head->next; - head->next = NULL; - head = next; - - func(rq); - } - raw_spin_unlock_irqrestore(&rq->lock, flags); -} - -static inline void balance_callback(struct rq *rq) -{ - if (unlikely(rq->balance_callback)) - __balance_callback(rq); -} - -#else - -static inline void balance_callback(struct rq *rq) -{ -} - -#endif - /** * schedule_tail - first thing a freshly forked thread must call. * @prev: the thread we just switched away from. @@ -3752,7 +4388,6 @@ asmlinkage __visible void schedule_tail(struct task_struct *prev) */ rq = finish_task_switch(prev); - balance_callback(rq); preempt_enable(); if (current->set_child_tid) @@ -4447,7 +5082,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) * * WARNING: must be called with preemption disabled! */ -static void __sched notrace __schedule(bool preempt) +static void __sched notrace __schedule(bool preempt, bool spinning_lock) { struct task_struct *prev, *next; unsigned long *switch_count; @@ -4500,7 +5135,7 @@ static void __sched notrace __schedule(bool preempt) * - ptrace_{,un}freeze_traced() can change ->state underneath us. */ prev_state = prev->state; - if (!preempt && prev_state) { + if ((!preempt || spinning_lock) && prev_state) { if (signal_pending_state(prev_state, prev)) { prev->state = TASK_RUNNING; } else { @@ -4535,6 +5170,7 @@ static void __sched notrace __schedule(bool preempt) next = pick_next_task(rq, prev, &rf); clear_tsk_need_resched(prev); + clear_tsk_need_resched_lazy(prev); clear_preempt_need_resched(); if (likely(prev != next)) { @@ -4560,6 +5196,7 @@ static void __sched notrace __schedule(bool preempt) */ ++*switch_count; + migrate_disable_switch(rq, prev); psi_sched_switch(prev, next, !task_on_rq_queued(prev)); trace_sched_switch(preempt, prev, next); @@ -4568,10 +5205,11 @@ static void __sched notrace __schedule(bool preempt) rq = context_switch(rq, prev, next, &rf); } else { rq->clock_update_flags &= ~(RQCF_ACT_SKIP|RQCF_REQ_SKIP); - rq_unlock_irq(rq, &rf); - } - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_unlock_irq(&rq->lock); + } } void __noreturn do_task_dead(void) @@ -4582,7 +5220,7 @@ void __noreturn do_task_dead(void) /* Tell freezer to ignore us: */ current->flags |= PF_NOFREEZE; - __schedule(false); + __schedule(false, false); BUG(); /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ @@ -4615,9 +5253,6 @@ static inline void sched_submit_work(struct task_struct *tsk) preempt_enable_no_resched(); } - if (tsk_is_pi_blocked(tsk)) - return; - /* * If we are going to sleep and we have plugged IO queued, * make sure to submit it to avoid deadlocks. @@ -4643,7 +5278,7 @@ asmlinkage __visible void __sched schedule(void) sched_submit_work(tsk); do { preempt_disable(); - __schedule(false); + __schedule(false, false); sched_preempt_enable_no_resched(); } while (need_resched()); sched_update_worker(tsk); @@ -4671,7 +5306,7 @@ void __sched schedule_idle(void) */ WARN_ON_ONCE(current->state); do { - __schedule(false); + __schedule(false, false); } while (need_resched()); } @@ -4724,7 +5359,7 @@ static void __sched notrace preempt_schedule_common(void) */ preempt_disable_notrace(); preempt_latency_start(1); - __schedule(true); + __schedule(true, false); preempt_latency_stop(1); preempt_enable_no_resched_notrace(); @@ -4735,6 +5370,30 @@ static void __sched notrace preempt_schedule_common(void) } while (need_resched()); } +#ifdef CONFIG_PREEMPT_LAZY +/* + * If TIF_NEED_RESCHED is then we allow to be scheduled away since this is + * set by a RT task. Oterwise we try to avoid beeing scheduled out as long as + * preempt_lazy_count counter >0. + */ +static __always_inline int preemptible_lazy(void) +{ + if (test_thread_flag(TIF_NEED_RESCHED)) + return 1; + if (current_thread_info()->preempt_lazy_count) + return 0; + return 1; +} + +#else + +static inline int preemptible_lazy(void) +{ + return 1; +} + +#endif + #ifdef CONFIG_PREEMPTION /* * This is the entry point to schedule() from in-kernel preemption @@ -4749,11 +5408,26 @@ asmlinkage __visible void __sched notrace preempt_schedule(void) if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; preempt_schedule_common(); } NOKPROBE_SYMBOL(preempt_schedule); EXPORT_SYMBOL(preempt_schedule); +#ifdef CONFIG_PREEMPT_RT +void __sched notrace preempt_schedule_lock(void) +{ + do { + preempt_disable(); + __schedule(true, true); + sched_preempt_enable_no_resched(); + } while (need_resched()); +} +NOKPROBE_SYMBOL(preempt_schedule_lock); +EXPORT_SYMBOL(preempt_schedule_lock); +#endif + #ifdef CONFIG_PREEMPT_DYNAMIC DEFINE_STATIC_CALL(preempt_schedule, __preempt_schedule_func); EXPORT_STATIC_CALL(preempt_schedule); @@ -4781,6 +5455,9 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) if (likely(!preemptible())) return; + if (!preemptible_lazy()) + return; + do { /* * Because the function tracer can trace preempt_count_sub() @@ -4803,7 +5480,7 @@ asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) * an infinite recursion. */ prev_ctx = exception_enter(); - __schedule(true); + __schedule(true, false); exception_exit(prev_ctx); preempt_latency_stop(1); @@ -5021,7 +5698,7 @@ asmlinkage __visible void __sched preempt_schedule_irq(void) do { preempt_disable(); local_irq_enable(); - __schedule(true); + __schedule(true, false); local_irq_disable(); sched_preempt_enable_no_resched(); } while (need_resched()); @@ -5187,9 +5864,11 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) out_unlock: /* Avoid rq from going away on us: */ preempt_disable(); - __task_rq_unlock(rq, &rf); - balance_callback(rq); + rq_unpin_lock(rq, &rf); + __balance_callbacks(rq); + raw_spin_unlock(&rq->lock); + preempt_enable(); } #else @@ -5432,6 +6111,7 @@ static int __sched_setscheduler(struct task_struct *p, int oldpolicy = -1, policy = attr->sched_policy; int retval, oldprio, newprio, queued, running; const struct sched_class *prev_class; + struct callback_head *head; struct rq_flags rf; int reset_on_fork; int queue_flags = DEQUEUE_SAVE | DEQUEUE_MOVE | DEQUEUE_NOCLOCK; @@ -5686,6 +6366,7 @@ static int __sched_setscheduler(struct task_struct *p, /* Avoid rq from going away on us: */ preempt_disable(); + head = splice_balance_callbacks(rq); task_rq_unlock(rq, p, &rf); if (pi) { @@ -5694,7 +6375,7 @@ static int __sched_setscheduler(struct task_struct *p, } /* Run balance callbacks after we've adjusted the PI chain: */ - balance_callback(rq); + balance_callbacks(rq, head); preempt_enable(); return 0; @@ -6189,7 +6870,7 @@ long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) } #endif again: - retval = __set_cpus_allowed_ptr(p, new_mask, true); + retval = __set_cpus_allowed_ptr(p, new_mask, SCA_CHECK); if (!retval) { cpuset_cpus_allowed(p, cpus_allowed); @@ -6775,7 +7456,7 @@ void __init init_idle(struct task_struct *idle, int cpu) * * And since this is boot we can forgo the serialization. */ - set_cpus_allowed_common(idle, cpumask_of(cpu)); + set_cpus_allowed_common(idle, cpumask_of(cpu), 0); #endif /* * We're having a chicken and egg problem, even though we are @@ -6802,7 +7483,9 @@ void __init init_idle(struct task_struct *idle, int cpu) /* Set the preempt count _outside_ the spinlocks! */ init_idle_preempt_count(idle, cpu); - +#ifdef CONFIG_HAVE_PREEMPT_LAZY + task_thread_info(idle)->preempt_lazy_count = 0; +#endif /* * The idle tasks have their own, simple scheduling class: */ @@ -6907,6 +7590,7 @@ void sched_setnuma(struct task_struct *p, int nid) #endif /* CONFIG_NUMA_BALANCING */ #ifdef CONFIG_HOTPLUG_CPU + /* * Ensure that the idle task is using init_mm right before its CPU goes * offline. @@ -6926,119 +7610,126 @@ void idle_task_exit(void) /* finish_cpu(), as ran on the BP, will clean up the active_mm state */ } -/* - * Since this CPU is going 'away' for a while, fold any nr_active delta - * we might have. Assumes we're called after migrate_tasks() so that the - * nr_active count is stable. We need to take the teardown thread which - * is calling this into account, so we hand in adjust = 1 to the load - * calculation. - * - * Also see the comment "Global load-average calculations". - */ -static void calc_load_migrate(struct rq *rq) +static int __balance_push_cpu_stop(void *arg) { - long delta = calc_load_fold_active(rq, 1); - if (delta) - atomic_long_add(delta, &calc_load_tasks); -} + struct task_struct *p = arg; + struct rq *rq = this_rq(); + struct rq_flags rf; + int cpu; -static struct task_struct *__pick_migrate_task(struct rq *rq) -{ - const struct sched_class *class; - struct task_struct *next; + raw_spin_lock_irq(&p->pi_lock); + rq_lock(rq, &rf); - for_each_class(class) { - next = class->pick_next_task(rq); - if (next) { - next->sched_class->put_prev_task(rq, next); - return next; - } + update_rq_clock(rq); + + if (task_rq(p) == rq && task_on_rq_queued(p)) { + cpu = select_fallback_rq(rq->cpu, p); + rq = __migrate_task(rq, &rf, p, cpu); } - /* The idle class should always have a runnable task */ - BUG(); + rq_unlock(rq, &rf); + raw_spin_unlock_irq(&p->pi_lock); + + put_task_struct(p); + + return 0; } +static DEFINE_PER_CPU(struct cpu_stop_work, push_work); + /* - * Migrate all tasks from the rq, sleeping tasks will be migrated by - * try_to_wake_up()->select_task_rq(). - * - * Called with rq->lock held even though we'er in stop_machine() and - * there's no concurrency possible, we hold the required locks anyway - * because of lock validation efforts. + * Ensure we only run per-cpu kthreads once the CPU goes !active. */ -static void migrate_tasks(struct rq *dead_rq, struct rq_flags *rf) +static void balance_push(struct rq *rq) { - struct rq *rq = dead_rq; - struct task_struct *next, *stop = rq->stop; - struct rq_flags orf = *rf; - int dest_cpu; + struct task_struct *push_task = rq->curr; + + lockdep_assert_held(&rq->lock); + SCHED_WARN_ON(rq->cpu != smp_processor_id()); /* - * Fudge the rq selection such that the below task selection loop - * doesn't get stuck on the currently eligible stop task. - * - * We're currently inside stop_machine() and the rq is either stuck - * in the stop_machine_cpu_stop() loop, or we're executing this code, - * either way we should never end up calling schedule() until we're - * done here. + * Both the cpu-hotplug and stop task are in this case and are + * required to complete the hotplug process. */ - rq->stop = NULL; + if (is_per_cpu_kthread(push_task) || is_migration_disabled(push_task)) { + /* + * If this is the idle task on the outgoing CPU try to wake + * up the hotplug control thread which might wait for the + * last task to vanish. The rcuwait_active() check is + * accurate here because the waiter is pinned on this CPU + * and can't obviously be running in parallel. + * + * On RT kernels this also has to check whether there are + * pinned and scheduled out tasks on the runqueue. They + * need to leave the migrate disabled section first. + */ + if (!rq->nr_running && !rq_has_pinned_tasks(rq) && + rcuwait_active(&rq->hotplug_wait)) { + raw_spin_unlock(&rq->lock); + rcuwait_wake_up(&rq->hotplug_wait); + raw_spin_lock(&rq->lock); + } + return; + } + get_task_struct(push_task); /* - * put_prev_task() and pick_next_task() sched - * class method both need to have an up-to-date - * value of rq->clock[_task] + * Temporarily drop rq->lock such that we can wake-up the stop task. + * Both preemption and IRQs are still disabled. */ - update_rq_clock(rq); + raw_spin_unlock(&rq->lock); + stop_one_cpu_nowait(rq->cpu, __balance_push_cpu_stop, push_task, + this_cpu_ptr(&push_work)); + /* + * At this point need_resched() is true and we'll take the loop in + * schedule(). The next pick is obviously going to be the stop task + * which is_per_cpu_kthread() and will push this task away. + */ + raw_spin_lock(&rq->lock); +} - for (;;) { - /* - * There's this thread running, bail when that's the only - * remaining thread: - */ - if (rq->nr_running == 1) - break; +static void balance_push_set(int cpu, bool on) +{ + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; - next = __pick_migrate_task(rq); + rq_lock_irqsave(rq, &rf); + if (on) + rq->balance_flags |= BALANCE_PUSH; + else + rq->balance_flags &= ~BALANCE_PUSH; + rq_unlock_irqrestore(rq, &rf); +} - /* - * Rules for changing task_struct::cpus_mask are holding - * both pi_lock and rq->lock, such that holding either - * stabilizes the mask. - * - * Drop rq->lock is not quite as disastrous as it usually is - * because !cpu_active at this point, which means load-balance - * will not interfere. Also, stop-machine. - */ - rq_unlock(rq, rf); - raw_spin_lock(&next->pi_lock); - rq_relock(rq, rf); +/* + * Invoked from a CPUs hotplug control thread after the CPU has been marked + * inactive. All tasks which are not per CPU kernel threads are either + * pushed off this CPU now via balance_push() or placed on a different CPU + * during wakeup. Wait until the CPU is quiescent. + */ +static void balance_hotplug_wait(void) +{ + struct rq *rq = this_rq(); - /* - * Since we're inside stop-machine, _nothing_ should have - * changed the task, WARN if weird stuff happened, because in - * that case the above rq->lock drop is a fail too. - */ - if (WARN_ON(task_rq(next) != rq || !task_on_rq_queued(next))) { - raw_spin_unlock(&next->pi_lock); - continue; - } + rcuwait_wait_event(&rq->hotplug_wait, + rq->nr_running == 1 && !rq_has_pinned_tasks(rq), + TASK_UNINTERRUPTIBLE); +} - /* Find suitable destination for @next, with force if needed. */ - dest_cpu = select_fallback_rq(dead_rq->cpu, next); - rq = __migrate_task(rq, rf, next, dest_cpu); - if (rq != dead_rq) { - rq_unlock(rq, rf); - rq = dead_rq; - *rf = orf; - rq_relock(rq, rf); - } - raw_spin_unlock(&next->pi_lock); - } +#else + +static inline void balance_push(struct rq *rq) +{ +} - rq->stop = stop; +static inline void balance_push_set(int cpu, bool on) +{ +} + +static inline void balance_hotplug_wait(void) +{ } + #endif /* CONFIG_HOTPLUG_CPU */ void set_rq_online(struct rq *rq) @@ -7124,6 +7815,8 @@ int sched_cpu_activate(unsigned int cpu) struct rq *rq = cpu_rq(cpu); struct rq_flags rf; + balance_push_set(cpu, false); + #ifdef CONFIG_SCHED_SMT /* * When going up, increment the number of cores with SMT present. @@ -7159,6 +7852,8 @@ int sched_cpu_activate(unsigned int cpu) int sched_cpu_deactivate(unsigned int cpu) { + struct rq *rq = cpu_rq(cpu); + struct rq_flags rf; int ret; set_cpu_active(cpu, false); @@ -7171,6 +7866,16 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu(); + balance_push_set(cpu, true); + + rq_lock_irqsave(rq, &rf); + if (rq->rd) { + update_rq_clock(rq); + BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); + set_rq_offline(rq); + } + rq_unlock_irqrestore(rq, &rf); + #ifdef CONFIG_SCHED_SMT /* * When going down, decrement the number of cores with SMT present. @@ -7184,6 +7889,7 @@ int sched_cpu_deactivate(unsigned int cpu) ret = cpuset_cpu_inactive(cpu); if (ret) { + balance_push_set(cpu, false); set_cpu_active(cpu, true); return ret; } @@ -7207,6 +7913,41 @@ int sched_cpu_starting(unsigned int cpu) } #ifdef CONFIG_HOTPLUG_CPU + +/* + * Invoked immediately before the stopper thread is invoked to bring the + * CPU down completely. At this point all per CPU kthreads except the + * hotplug thread (current) and the stopper thread (inactive) have been + * either parked or have been unbound from the outgoing CPU. Ensure that + * any of those which might be on the way out are gone. + * + * If after this point a bound task is being woken on this CPU then the + * responsible hotplug callback has failed to do it's job. + * sched_cpu_dying() will catch it with the appropriate fireworks. + */ +int sched_cpu_wait_empty(unsigned int cpu) +{ + balance_hotplug_wait(); + return 0; +} + +/* + * Since this CPU is going 'away' for a while, fold any nr_active delta we + * might have. Called from the CPU stopper task after ensuring that the + * stopper is the last running task on the CPU, so nr_active count is + * stable. We need to take the teardown thread which is calling this into + * account, so we hand in adjust = 1 to the load calculation. + * + * Also see the comment "Global load-average calculations". + */ +static void calc_load_migrate(struct rq *rq) +{ + long delta = calc_load_fold_active(rq, 1); + + if (delta) + atomic_long_add(delta, &calc_load_tasks); +} + int sched_cpu_dying(unsigned int cpu) { struct rq *rq = cpu_rq(cpu); @@ -7216,12 +7957,7 @@ int sched_cpu_dying(unsigned int cpu) sched_tick_stop(cpu); rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); - set_rq_offline(rq); - } - migrate_tasks(rq, &rf); - BUG_ON(rq->nr_running != 1); + BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq)); rq_unlock_irqrestore(rq, &rf); calc_load_migrate(rq); @@ -7431,6 +8167,9 @@ void __init sched_init(void) INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); #endif +#ifdef CONFIG_HOTPLUG_CPU + rcuwait_init(&rq->hotplug_wait); +#endif #endif /* CONFIG_SMP */ hrtick_rq_init(rq); atomic_set(&rq->nr_iowait, 0); @@ -7471,7 +8210,7 @@ void __init sched_init(void) #ifdef CONFIG_DEBUG_ATOMIC_SLEEP static inline int preempt_count_equals(int preempt_offset) { - int nested = preempt_count() + rcu_preempt_depth(); + int nested = preempt_count() + sched_rcu_preempt_depth(); return (nested == preempt_offset); } @@ -7568,6 +8307,39 @@ void __cant_sleep(const char *file, int line, int preempt_offset) add_taint(TAINT_WARN, LOCKDEP_STILL_OK); } EXPORT_SYMBOL_GPL(__cant_sleep); + +#ifdef CONFIG_SMP +void __cant_migrate(const char *file, int line) +{ + static unsigned long prev_jiffy; + + if (irqs_disabled()) + return; + + if (is_migration_disabled(current)) + return; + + if (!IS_ENABLED(CONFIG_PREEMPT_COUNT)) + return; + + if (preempt_count() > 0) + return; + + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) + return; + prev_jiffy = jiffies; + + pr_err("BUG: assuming non migratable context at %s:%d\n", file, line); + pr_err("in_atomic(): %d, irqs_disabled(): %d, migration_disabled() %u pid: %d, name: %s\n", + in_atomic(), irqs_disabled(), is_migration_disabled(current), + current->pid, current->comm); + + debug_show_held_locks(current); + dump_stack(); + add_taint(TAINT_WARN, LOCKDEP_STILL_OK); +} +EXPORT_SYMBOL_GPL(__cant_migrate); +#endif #endif #ifdef CONFIG_MAGIC_SYSRQ diff --git a/kernel/sched/cpudeadline.c b/kernel/sched/cpudeadline.c index 8cb06c8c7..ceb03d76c 100644 --- a/kernel/sched/cpudeadline.c +++ b/kernel/sched/cpudeadline.c @@ -120,7 +120,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, const struct sched_dl_entity *dl_se = &p->dl; if (later_mask && - cpumask_and(later_mask, cp->free_cpus, p->cpus_ptr)) { + cpumask_and(later_mask, cp->free_cpus, &p->cpus_mask)) { unsigned long cap, max_cap = 0; int cpu, max_cpu = -1; @@ -151,7 +151,7 @@ int cpudl_find(struct cpudl *cp, struct task_struct *p, WARN_ON(best_cpu != -1 && !cpu_present(best_cpu)); - if (cpumask_test_cpu(best_cpu, p->cpus_ptr) && + if (cpumask_test_cpu(best_cpu, &p->cpus_mask) && dl_time_before(dl_se->deadline, cp->elements[0].dl)) { if (later_mask) cpumask_set_cpu(best_cpu, later_mask); diff --git a/kernel/sched/cpupri.c b/kernel/sched/cpupri.c index 0033731a0..11c4df201 100644 --- a/kernel/sched/cpupri.c +++ b/kernel/sched/cpupri.c @@ -73,11 +73,11 @@ static inline int __cpupri_find(struct cpupri *cp, struct task_struct *p, if (skip) return 0; - if (cpumask_any_and(p->cpus_ptr, vec->mask) >= nr_cpu_ids) + if (cpumask_any_and(&p->cpus_mask, vec->mask) >= nr_cpu_ids) return 0; if (lowest_mask) { - cpumask_and(lowest_mask, p->cpus_ptr, vec->mask); + cpumask_and(lowest_mask, &p->cpus_mask, vec->mask); /* * We have to ensure that we have at least one bit diff --git a/kernel/sched/cputime.c b/kernel/sched/cputime.c index 5a55d2300..2c36a5fad 100644 --- a/kernel/sched/cputime.c +++ b/kernel/sched/cputime.c @@ -44,12 +44,13 @@ static void irqtime_account_delta(struct irqtime *irqtime, u64 delta, } /* - * Called before incrementing preempt_count on {soft,}irq_enter + * Called after incrementing preempt_count on {soft,}irq_enter * and before decrementing preempt_count on {soft,}irq_exit. */ -void irqtime_account_irq(struct task_struct *curr) +void irqtime_account_irq(struct task_struct *curr, unsigned int offset) { struct irqtime *irqtime = this_cpu_ptr(&cpu_irqtime); + unsigned int pc; s64 delta; int cpu; @@ -59,6 +60,7 @@ void irqtime_account_irq(struct task_struct *curr) cpu = smp_processor_id(); delta = sched_clock_cpu(cpu) - irqtime->irq_start_time; irqtime->irq_start_time += delta; + pc = irq_count() - offset; /* * We do not account for softirq time from ksoftirqd here. @@ -66,12 +68,11 @@ void irqtime_account_irq(struct task_struct *curr) * in that case, so as not to confuse scheduler with a special task * that do not consume any time, but still wants to run. */ - if (hardirq_count()) + if (pc & HARDIRQ_MASK) irqtime_account_delta(irqtime, delta, CPUTIME_IRQ); - else if (in_serving_softirq() && curr != this_cpu_ksoftirqd()) + else if ((pc & SOFTIRQ_OFFSET) && curr != this_cpu_ksoftirqd()) irqtime_account_delta(irqtime, delta, CPUTIME_SOFTIRQ); } -EXPORT_SYMBOL_GPL(irqtime_account_irq); static u64 irqtime_tick_accounted(u64 maxtime) { @@ -418,24 +419,21 @@ void vtime_task_switch(struct task_struct *prev) } # endif -/* - * Archs that account the whole time spent in the idle task - * (outside irq) as idle time can rely on this and just implement - * vtime_account_kernel() and vtime_account_idle(). Archs that - * have other meaning of the idle time (s390 only includes the - * time spent by the CPU when it's in low power mode) must override - * vtime_account(). - */ -#ifndef __ARCH_HAS_VTIME_ACCOUNT -void vtime_account_irq_enter(struct task_struct *tsk) +void vtime_account_irq(struct task_struct *tsk, unsigned int offset) { - if (!in_interrupt() && is_idle_task(tsk)) + unsigned int pc = irq_count() - offset; + + if (pc & HARDIRQ_OFFSET) { + vtime_account_hardirq(tsk); + } else if (pc & SOFTIRQ_OFFSET) { + vtime_account_softirq(tsk); + } else if (!IS_ENABLED(CONFIG_HAVE_VIRT_CPU_ACCOUNTING_IDLE) && + is_idle_task(tsk)) { vtime_account_idle(tsk); - else + } else { vtime_account_kernel(tsk); + } } -EXPORT_SYMBOL_GPL(vtime_account_irq_enter); -#endif /* __ARCH_HAS_VTIME_ACCOUNT */ void cputime_adjust(struct task_cputime *curr, struct prev_cputime *prev, u64 *ut, u64 *st) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 8255267ce..5ab09ef74 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -565,7 +565,7 @@ static int push_dl_task(struct rq *rq); static inline bool need_pull_dl_task(struct rq *rq, struct task_struct *prev) { - return dl_task(prev); + return rq->online && dl_task(prev); } static DEFINE_PER_CPU(struct callback_head, dl_push_head); @@ -1919,7 +1919,7 @@ static void task_fork_dl(struct task_struct *p) static int pick_dl_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; } @@ -2009,8 +2009,8 @@ static int find_later_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(later_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(later_mask, + sched_domain_span(sd)); /* * Last chance: if a CPU being in both later_mask * and current sd span is valid, that becomes our @@ -2032,7 +2032,7 @@ static int find_later_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(later_mask); + cpu = cpumask_any_distribute(later_mask); if (cpu < nr_cpu_ids) return cpu; @@ -2097,7 +2097,7 @@ static struct rq *find_lock_later_rq(struct task_struct *task, struct rq *rq) */ next_task = pick_next_pushable_dl_task(rq); if (unlikely(next_task != task || - !cpumask_test_cpu(later_rq->cpu, task->cpus_ptr))) { + !cpumask_test_cpu(later_rq->cpu, &task->cpus_mask))) { double_unlock_balance(rq, later_rq); later_rq = NULL; break; @@ -2141,6 +2141,9 @@ static int push_dl_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) + return 0; + if (WARN_ON(next_task == rq->curr)) return 0; @@ -2218,7 +2221,7 @@ static void push_dl_tasks(struct rq *rq) static void pull_dl_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; - struct task_struct *p; + struct task_struct *p, *push_task; bool resched = false; struct rq *src_rq; u64 dmin = LONG_MAX; @@ -2248,6 +2251,7 @@ static void pull_dl_task(struct rq *this_rq) continue; /* Might drop this_rq->lock */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2279,17 +2283,28 @@ static void pull_dl_task(struct rq *this_rq) src_rq->curr->dl.deadline)) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); - dmin = p->dl.deadline; + if (is_migration_disabled(p)) { + trace_sched_migrate_pull_tp(p); + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + dmin = p->dl.deadline; + resched = true; + } /* Is there any other task even earlier? */ } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2313,7 +2328,8 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) + const struct cpumask *new_mask, + u32 flags) { struct root_domain *src_rd; struct rq *rq; @@ -2342,7 +2358,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, raw_spin_unlock(&src_dl_b->lock); } - set_cpus_allowed_common(p, new_mask); + set_cpus_allowed_common(p, new_mask, flags); } /* Assumes rq->lock is held */ @@ -2537,6 +2553,7 @@ const struct sched_class dl_sched_class .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, + .find_lock_rq = find_lock_later_rq, #endif .task_tick = task_tick_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 583b5dcbf..3a9d38c5d 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4484,7 +4484,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) ideal_runtime = sched_slice(cfs_rq, curr); delta_exec = curr->sum_exec_runtime - curr->prev_sum_exec_runtime; if (delta_exec > ideal_runtime) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); /* * The current task ran long enough, ensure it doesn't get * re-elected due to buddy favours. @@ -4508,7 +4508,7 @@ check_preempt_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr) return; if (delta > ideal_runtime) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static void @@ -4651,7 +4651,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) * validating it and just reschedule. */ if (queued) { - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); return; } /* @@ -4788,7 +4788,7 @@ static void __account_cfs_rq_runtime(struct cfs_rq *cfs_rq, u64 delta_exec) * hierarchy can be throttled */ if (!assign_cfs_rq_runtime(cfs_rq) && likely(cfs_rq->curr)) - resched_curr(rq_of(cfs_rq)); + resched_curr_lazy(rq_of(cfs_rq)); } static __always_inline @@ -5534,7 +5534,7 @@ static void hrtick_start_fair(struct rq *rq, struct task_struct *p) if (delta < 0) { if (rq->curr == p) - resched_curr(rq); + resched_curr_lazy(rq); return; } hrtick_start(rq, delta); @@ -7141,7 +7141,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ return; preempt: - resched_curr(rq); + resched_curr_lazy(rq); /* * Only set the backward buddy when the current task is still * on the rq. This can happen when a wakeup gets interleaved @@ -11331,7 +11331,7 @@ static void task_fork_fair(struct task_struct *p) * 'current' within the tree based on its new key value. */ swap(curr->vruntime, se->vruntime); - resched_curr(rq); + resched_curr_lazy(rq); } se->vruntime -= cfs_rq->min_vruntime; @@ -11358,7 +11358,7 @@ prio_changed_fair(struct rq *rq, struct task_struct *p, int oldprio) */ if (rq->curr == p) { if (p->prio > oldprio) - resched_curr(rq); + resched_curr_lazy(rq); } else check_preempt_curr(rq, p, 0); } diff --git a/kernel/sched/features.h b/kernel/sched/features.h index 97ed11bd2..0dade2e74 100644 --- a/kernel/sched/features.h +++ b/kernel/sched/features.h @@ -45,11 +45,19 @@ SCHED_FEAT(DOUBLE_TICK, false) */ SCHED_FEAT(NONTASK_CAPACITY, true) +#ifdef CONFIG_PREEMPT_RT +SCHED_FEAT(TTWU_QUEUE, false) +# ifdef CONFIG_PREEMPT_LAZY +SCHED_FEAT(PREEMPT_LAZY, true) +# endif +#else + /* * Queue remote wakeups on the target CPU and process them * using the scheduler IPI. Reduces rq->lock contention/bounces. */ SCHED_FEAT(TTWU_QUEUE, true) +#endif /* * When doing wakeups, attempt to limit superfluous scans of the LLC domain. diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index dae1e8eaa..86f97f15a 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -265,7 +265,7 @@ static void pull_rt_task(struct rq *this_rq); static inline bool need_pull_rt_task(struct rq *rq, struct task_struct *prev) { /* Try to pull RT tasks here if we lower this rq's prio */ - return rq->rt.highest_prio.curr > prev->prio; + return rq->online && rq->rt.highest_prio.curr > prev->prio; } static inline int rt_overloaded(struct rq *rq) @@ -1672,7 +1672,7 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) static int pick_rt_task(struct rq *rq, struct task_struct *p, int cpu) { if (!task_running(rq, p) && - cpumask_test_cpu(cpu, p->cpus_ptr)) + cpumask_test_cpu(cpu, &p->cpus_mask)) return 1; return 0; @@ -1766,8 +1766,8 @@ static int find_lowest_rq(struct task_struct *task) return this_cpu; } - best_cpu = cpumask_first_and(lowest_mask, - sched_domain_span(sd)); + best_cpu = cpumask_any_and_distribute(lowest_mask, + sched_domain_span(sd)); if (best_cpu < nr_cpu_ids) { rcu_read_unlock(); return best_cpu; @@ -1784,7 +1784,7 @@ static int find_lowest_rq(struct task_struct *task) if (this_cpu != -1) return this_cpu; - cpu = cpumask_any(lowest_mask); + cpu = cpumask_any_distribute(lowest_mask); if (cpu < nr_cpu_ids) return cpu; @@ -1845,7 +1845,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) */ struct task_struct *next_task = pick_next_pushable_task(rq); if (unlikely(next_task != task || - !cpumask_test_cpu(lowest_rq->cpu, task->cpus_ptr))) { + !cpumask_test_cpu(lowest_rq->cpu, &task->cpus_mask))) { double_unlock_balance(rq, lowest_rq); lowest_rq = NULL; break; @@ -1869,7 +1869,7 @@ static struct rq *find_lock_lowest_rq(struct task_struct *task, struct rq *rq) * running task can migrate over to a CPU that is running a task * of lesser priority. */ -static int push_rt_task(struct rq *rq) +static int push_rt_task(struct rq *rq, bool pull) { struct task_struct *next_task; struct rq *lowest_rq; @@ -1883,6 +1883,39 @@ static int push_rt_task(struct rq *rq) return 0; retry: + if (is_migration_disabled(next_task)) { + struct task_struct *push_task = NULL; + int cpu; + + if (!pull) + return 0; + + trace_sched_migrate_pull_tp(next_task); + + if (rq->push_busy) + return 0; + + cpu = find_lowest_rq(rq->curr); + if (cpu == -1 || cpu == rq->cpu) + return 0; + + /* + * Given we found a CPU with lower priority than @next_task, + * therefore it should be running. However we cannot migrate it + * to this other CPU, instead attempt to push the current + * running task on this CPU away. + */ + push_task = get_push_task(rq); + if (push_task) { + raw_spin_unlock(&rq->lock); + stop_one_cpu_nowait(rq->cpu, push_cpu_stop, + push_task, &rq->push_work); + raw_spin_lock(&rq->lock); + } + + return 0; + } + if (WARN_ON(next_task == rq->curr)) return 0; @@ -1937,12 +1970,10 @@ static int push_rt_task(struct rq *rq) deactivate_task(rq, next_task, 0); set_task_cpu(next_task, lowest_rq->cpu); activate_task(lowest_rq, next_task, 0); - ret = 1; - resched_curr(lowest_rq); + ret = 1; double_unlock_balance(rq, lowest_rq); - out: put_task_struct(next_task); @@ -1952,7 +1983,7 @@ static int push_rt_task(struct rq *rq) static void push_rt_tasks(struct rq *rq) { /* push_rt_task will return true if it moved an RT */ - while (push_rt_task(rq)) + while (push_rt_task(rq, false)) ; } @@ -2105,7 +2136,8 @@ void rto_push_irq_work_func(struct irq_work *work) */ if (has_pushable_tasks(rq)) { raw_spin_lock(&rq->lock); - push_rt_tasks(rq); + while (push_rt_task(rq, true)) + ; raw_spin_unlock(&rq->lock); } @@ -2130,7 +2162,7 @@ static void pull_rt_task(struct rq *this_rq) { int this_cpu = this_rq->cpu, cpu; bool resched = false; - struct task_struct *p; + struct task_struct *p, *push_task; struct rq *src_rq; int rt_overload_count = rt_overloaded(this_rq); @@ -2177,6 +2209,7 @@ static void pull_rt_task(struct rq *this_rq) * double_lock_balance, and another CPU could * alter this_rq */ + push_task = NULL; double_lock_balance(this_rq, src_rq); /* @@ -2204,11 +2237,15 @@ static void pull_rt_task(struct rq *this_rq) if (p->prio < src_rq->curr->prio) goto skip; - resched = true; - - deactivate_task(src_rq, p, 0); - set_task_cpu(p, this_cpu); - activate_task(this_rq, p, 0); + if (is_migration_disabled(p)) { + trace_sched_migrate_pull_tp(p); + push_task = get_push_task(src_rq); + } else { + deactivate_task(src_rq, p, 0); + set_task_cpu(p, this_cpu); + activate_task(this_rq, p, 0); + resched = true; + } /* * We continue with the search, just in * case there's an even higher prio task @@ -2218,6 +2255,13 @@ static void pull_rt_task(struct rq *this_rq) } skip: double_unlock_balance(this_rq, src_rq); + + if (push_task) { + raw_spin_unlock(&this_rq->lock); + stop_one_cpu_nowait(src_rq->cpu, push_cpu_stop, + push_task, &src_rq->push_work); + raw_spin_lock(&this_rq->lock); + } } if (resched) @@ -2466,6 +2510,7 @@ const struct sched_class rt_sched_class .rq_offline = rq_offline_rt, .task_woken = task_woken_rt, .switched_from = switched_from_rt, + .find_lock_rq = find_lock_lowest_rq, #endif .task_tick = task_tick_rt, diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index d122f1b8e..186707fa5 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1008,6 +1008,7 @@ struct rq { unsigned long cpu_capacity_orig; struct callback_head *balance_callback; + unsigned char balance_flags; unsigned char nohz_idle_balance; unsigned char idle_balance; @@ -1038,6 +1039,10 @@ struct rq { /* This is used to determine avg_idle's max value */ u64 max_idle_balance_cost; + +#ifdef CONFIG_HOTPLUG_CPU + struct rcuwait hotplug_wait; +#endif #endif /* CONFIG_SMP */ #ifdef CONFIG_IRQ_TIME_ACCOUNTING @@ -1096,6 +1101,11 @@ struct rq { struct cpuidle_state *idle_state; #endif +#ifdef CONFIG_SMP + unsigned int nr_pinned; +#endif + unsigned int push_busy; + struct cpu_stop_work push_work; KABI_RESERVE(1) KABI_RESERVE(2) KABI_RESERVE(3) @@ -1131,6 +1141,17 @@ static inline int cpu_of(struct rq *rq) #endif } +#define MDF_PUSH 0x01 + +static inline bool is_migration_disabled(struct task_struct *p) +{ +#ifdef CONFIG_SMP + return p->migration_disabled; +#else + return false; +#endif +} + #ifdef CONFIG_QOS_SCHED void init_qos_hrtimer(int cpu); #endif @@ -1260,6 +1281,12 @@ struct rq_flags { */ unsigned int clock_update_flags; #endif + +#ifdef CONFIG_SMP + unsigned int nr_pinned; +#endif + unsigned int push_busy; + struct cpu_stop_work push_work; }; /* @@ -1280,6 +1307,9 @@ static inline void rq_pin_lock(struct rq *rq, struct rq_flags *rf) rq->clock_update_flags &= (RQCF_REQ_SKIP|RQCF_ACT_SKIP); rf->clock_update_flags = 0; #endif +#ifdef CONFIG_SMP + SCHED_WARN_ON(rq->balance_callback); +#endif } static inline void rq_unpin_lock(struct rq *rq, struct rq_flags *rf) @@ -1445,6 +1475,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) #ifdef CONFIG_SMP +#define BALANCE_WORK 0x01 +#define BALANCE_PUSH 0x02 + static inline void queue_balance_callback(struct rq *rq, struct callback_head *head, @@ -1452,12 +1485,12 @@ queue_balance_callback(struct rq *rq, { lockdep_assert_held(&rq->lock); - if (unlikely(head->next)) + if (unlikely(head->next || (rq->balance_flags & BALANCE_PUSH))) return; head->func = (void (*)(struct callback_head *))func; head->next = rq->balance_callback; - rq->balance_callback = head; + rq->balance_flags |= BALANCE_WORK; } #define rcu_dereference_check_sched_domain(p) \ @@ -1782,6 +1815,7 @@ static inline int task_on_rq_migrating(struct task_struct *p) #define WF_FORK 0x02 /* Child wakeup after fork */ #define WF_MIGRATED 0x04 /* Internal use, task got migrated */ #define WF_ON_CPU 0x08 /* Wakee is on_cpu */ +#define WF_LOCK_SLEEPER 0x10 /* Wakeup spinlock "sleeper" */ /* * To aid in avoiding the subversion of "niceness" due to uneven distribution @@ -1863,10 +1897,13 @@ struct sched_class { void (*task_woken)(struct rq *this_rq, struct task_struct *task); void (*set_cpus_allowed)(struct task_struct *p, - const struct cpumask *newmask); + const struct cpumask *newmask, + u32 flags); void (*rq_online)(struct rq *rq); void (*rq_offline)(struct rq *rq); + + struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); #endif void (*task_tick)(struct rq *rq, struct task_struct *p, int queued); @@ -1953,13 +1990,38 @@ static inline bool sched_fair_runnable(struct rq *rq) extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); extern struct task_struct *pick_next_task_idle(struct rq *rq); +#define SCA_CHECK 0x01 +#define SCA_MIGRATE_DISABLE 0x02 +#define SCA_MIGRATE_ENABLE 0x04 + #ifdef CONFIG_SMP extern void update_group_capacity(struct sched_domain *sd, int cpu); extern void trigger_load_balance(struct rq *rq); -extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); +extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask, u32 flags); + +static inline struct task_struct *get_push_task(struct rq *rq) +{ + struct task_struct *p = rq->curr; + + lockdep_assert_held(&rq->lock); + + if (rq->push_busy) + return NULL; + + if (p->nr_cpus_allowed == 1) + return NULL; + + if (p->migration_disabled) + return NULL; + + rq->push_busy = true; + return get_task_struct(p); +} + +extern int push_cpu_stop(void *arg); #endif @@ -2003,6 +2065,15 @@ extern void reweight_task(struct task_struct *p, int prio); extern void resched_curr(struct rq *rq); extern void resched_cpu(int cpu); +#ifdef CONFIG_PREEMPT_LAZY +extern void resched_curr_lazy(struct rq *rq); +#else +static inline void resched_curr_lazy(struct rq *rq) +{ + resched_curr(rq); +} +#endif + extern struct rt_bandwidth def_rt_bandwidth; extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); @@ -2365,7 +2436,6 @@ extern void nohz_balance_exit_idle(struct rq *rq); static inline void nohz_balance_exit_idle(struct rq *rq) { } #endif - #ifdef CONFIG_SMP static inline void __dl_update(struct dl_bw *dl_b, s64 bw) diff --git a/kernel/sched/swait.c b/kernel/sched/swait.c index e1c655f92..f230b1ac7 100644 --- a/kernel/sched/swait.c +++ b/kernel/sched/swait.c @@ -64,6 +64,7 @@ void swake_up_all(struct swait_queue_head *q) struct swait_queue *curr; LIST_HEAD(tmp); + WARN_ON(irqs_disabled()); raw_spin_lock_irq(&q->lock); list_splice_init(&q->task_list, &tmp); while (!list_empty(&tmp)) { diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c index 9b4e3b25d..9a62e1b59 100644 --- a/kernel/sched/topology.c +++ b/kernel/sched/topology.c @@ -529,6 +529,7 @@ static int init_rootdomain(struct root_domain *rd) rd->rto_cpu = -1; raw_spin_lock_init(&rd->rto_lock); init_irq_work(&rd->rto_push_work, rto_push_irq_work_func); + atomic_or(IRQ_WORK_HARD_IRQ, &rd->rto_push_work.node.a_flags); #endif init_dl_bw(&rd->dl_bw); diff --git a/kernel/signal.c b/kernel/signal.c index 54f86e0b9..28d34857e 100644 --- a/kernel/signal.c +++ b/kernel/signal.c @@ -20,6 +20,7 @@ #include <linux/sched/task.h> #include <linux/sched/task_stack.h> #include <linux/sched/cputime.h> +#include <linux/sched/rt.h> #include <linux/file.h> #include <linux/fs.h> #include <linux/proc_fs.h> @@ -406,13 +407,30 @@ void task_join_group_stop(struct task_struct *task) task_set_jobctl_pending(task, mask | JOBCTL_STOP_PENDING); } +static inline struct sigqueue *get_task_cache(struct task_struct *t) +{ + struct sigqueue *q = t->sigqueue_cache; + + if (cmpxchg(&t->sigqueue_cache, q, NULL) != q) + return NULL; + return q; +} + +static inline int put_task_cache(struct task_struct *t, struct sigqueue *q) +{ + if (cmpxchg(&t->sigqueue_cache, NULL, q) == NULL) + return 0; + return 1; +} + /* * allocate a new signal queue record * - this may be called without locks if and only if t == current, otherwise an * appropriate lock must be held to stop the target task from exiting */ static struct sigqueue * -__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimit) +__sigqueue_do_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit, int fromslab) { struct sigqueue *q = NULL; struct user_struct *user; @@ -434,7 +452,10 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi rcu_read_unlock(); if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { - q = kmem_cache_alloc(sigqueue_cachep, flags); + if (!fromslab) + q = get_task_cache(t); + if (!q) + q = kmem_cache_alloc(sigqueue_cachep, flags); } else { print_dropped_signal(sig); } @@ -451,6 +472,13 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, int override_rlimi return q; } +static struct sigqueue * +__sigqueue_alloc(int sig, struct task_struct *t, gfp_t flags, + int override_rlimit) +{ + return __sigqueue_do_alloc(sig, t, flags, override_rlimit, 0); +} + static void __sigqueue_free(struct sigqueue *q) { if (q->flags & SIGQUEUE_PREALLOC) @@ -460,6 +488,21 @@ static void __sigqueue_free(struct sigqueue *q) kmem_cache_free(sigqueue_cachep, q); } +static void sigqueue_free_current(struct sigqueue *q) +{ + struct user_struct *up; + + if (q->flags & SIGQUEUE_PREALLOC) + return; + + up = q->user; + if (rt_prio(current->normal_prio) && !put_task_cache(current, q)) { + if (atomic_dec_and_test(&up->sigpending)) + free_uid(up); + } else + __sigqueue_free(q); +} + void flush_sigqueue(struct sigpending *queue) { struct sigqueue *q; @@ -472,6 +515,21 @@ void flush_sigqueue(struct sigpending *queue) } } +/* + * Called from __exit_signal. Flush tsk->pending and + * tsk->sigqueue_cache + */ +void flush_task_sigqueue(struct task_struct *tsk) +{ + struct sigqueue *q; + + flush_sigqueue(&tsk->pending); + + q = get_task_cache(tsk); + if (q) + kmem_cache_free(sigqueue_cachep, q); +} + /* * Flush all pending signals for this kthread. */ @@ -596,7 +654,7 @@ static void collect_signal(int sig, struct sigpending *list, kernel_siginfo_t *i (info->si_code == SI_TIMER) && (info->si_sys_private); - __sigqueue_free(first); + sigqueue_free_current(first); } else { /* * Ok, it wasn't in the queue. This must be @@ -633,6 +691,8 @@ int dequeue_signal(struct task_struct *tsk, sigset_t *mask, kernel_siginfo_t *in bool resched_timer = false; int signr; + WARN_ON_ONCE(tsk != current); + /* We only dequeue private signals from ourselves, we don't let * signalfd steal them */ @@ -1319,6 +1379,34 @@ force_sig_info_to_task(struct kernel_siginfo *info, struct task_struct *t) struct k_sigaction *action; int sig = info->si_signo; + /* + * On some archs, PREEMPT_RT has to delay sending a signal from a trap + * since it can not enable preemption, and the signal code's spin_locks + * turn into mutexes. Instead, it must set TIF_NOTIFY_RESUME which will + * send the signal on exit of the trap. + */ +#ifdef ARCH_RT_DELAYS_SIGNAL_SEND + if (in_atomic()) { + struct task_struct *t = current; + + if (WARN_ON_ONCE(t->forced_info.si_signo)) + return 0; + + if (is_si_special(info)) { + WARN_ON_ONCE(info != SEND_SIG_PRIV); + t->forced_info.si_signo = info->si_signo; + t->forced_info.si_errno = 0; + t->forced_info.si_code = SI_KERNEL; + t->forced_info.si_pid = 0; + t->forced_info.si_uid = 0; + } else { + t->forced_info = *info; + } + + set_tsk_thread_flag(t, TIF_NOTIFY_RESUME); + return 0; + } +#endif spin_lock_irqsave(&t->sighand->siglock, flags); action = &t->sighand->action[sig-1]; ignored = action->sa.sa_handler == SIG_IGN; @@ -1812,7 +1900,8 @@ EXPORT_SYMBOL(kill_pid); */ struct sigqueue *sigqueue_alloc(void) { - struct sigqueue *q = __sigqueue_alloc(-1, current, GFP_KERNEL, 0); + /* Preallocated sigqueue objects always from the slabcache ! */ + struct sigqueue *q = __sigqueue_do_alloc(-1, current, GFP_KERNEL, 0, 1); if (q) q->flags |= SIGQUEUE_PREALLOC; @@ -2198,16 +2287,8 @@ static void ptrace_stop(int exit_code, int why, int clear_code, kernel_siginfo_t if (gstop_done && ptrace_reparented(current)) do_notify_parent_cldstop(current, false, why); - /* - * Don't want to allow preemption here, because - * sys_ptrace() needs this task to be inactive. - * - * XXX: implement read_unlock_no_resched(). - */ - preempt_disable(); read_unlock(&tasklist_lock); cgroup_enter_frozen(); - preempt_enable_no_resched(); freezable_schedule(); cgroup_leave_frozen(true); } else { diff --git a/kernel/smp.c b/kernel/smp.c index a5a87a51e..f8add8ce2 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -450,8 +450,18 @@ void flush_smp_call_function_from_idle(void) local_irq_save(flags); flush_smp_call_function_queue(true); - if (local_softirq_pending()) - do_softirq(); + + if (local_softirq_pending()) { + + if (!IS_ENABLED(CONFIG_PREEMPT_RT)) { + do_softirq(); + } else { + struct task_struct *ksoftirqd = this_cpu_ksoftirqd(); + + if (ksoftirqd && ksoftirqd->state != TASK_RUNNING) + wake_up_process(ksoftirqd); + } + } local_irq_restore(flags); } diff --git a/kernel/softirq.c b/kernel/softirq.c index 09229ad82..c9adc5c46 100644 --- a/kernel/softirq.c +++ b/kernel/softirq.c @@ -13,6 +13,7 @@ #include <linux/kernel_stat.h> #include <linux/interrupt.h> #include <linux/init.h> +#include <linux/local_lock.h> #include <linux/mm.h> #include <linux/notifier.h> #include <linux/percpu.h> @@ -25,6 +26,7 @@ #include <linux/smpboot.h> #include <linux/tick.h> #include <linux/irq.h> +#include <linux/wait_bit.h> #define CREATE_TRACE_POINTS #include <trace/events/irq.h> @@ -92,27 +94,212 @@ static bool ksoftirqd_running(unsigned long pending) !__kthread_should_park(tsk); } +#ifdef CONFIG_TRACE_IRQFLAGS +DEFINE_PER_CPU(int, hardirqs_enabled); +DEFINE_PER_CPU(int, hardirq_context); +EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); +EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); +#endif + /* - * preempt_count and SOFTIRQ_OFFSET usage: - * - preempt_count is changed by SOFTIRQ_OFFSET on entering or leaving - * softirq processing. - * - preempt_count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) + * SOFTIRQ_OFFSET usage: + * + * On !RT kernels 'count' is the preempt counter, on RT kernels this applies + * to a per CPU counter and to task::softirqs_disabled_cnt. + * + * - count is changed by SOFTIRQ_OFFSET on entering or leaving softirq + * processing. + * + * - count is changed by SOFTIRQ_DISABLE_OFFSET (= 2 * SOFTIRQ_OFFSET) * on local_bh_disable or local_bh_enable. + * * This lets us distinguish between whether we are currently processing * softirq and whether we just have bh disabled. */ +#ifdef CONFIG_PREEMPT_RT /* - * This one is for softirq.c-internal use, - * where hardirqs are disabled legitimately: + * RT accounts for BH disabled sections in task::softirqs_disabled_cnt and + * also in per CPU softirq_ctrl::cnt. This is necessary to allow tasks in a + * softirq disabled section to be preempted. + * + * The per task counter is used for softirq_count(), in_softirq() and + * in_serving_softirqs() because these counts are only valid when the task + * holding softirq_ctrl::lock is running. + * + * The per CPU counter prevents pointless wakeups of ksoftirqd in case that + * the task which is in a softirq disabled section is preempted or blocks. */ -#ifdef CONFIG_TRACE_IRQFLAGS +struct softirq_ctrl { + local_lock_t lock; + int cnt; +}; -DEFINE_PER_CPU(int, hardirqs_enabled); -DEFINE_PER_CPU(int, hardirq_context); -EXPORT_PER_CPU_SYMBOL_GPL(hardirqs_enabled); -EXPORT_PER_CPU_SYMBOL_GPL(hardirq_context); +static DEFINE_PER_CPU(struct softirq_ctrl, softirq_ctrl) = { + .lock = INIT_LOCAL_LOCK(softirq_ctrl.lock), +}; + +/** + * local_bh_blocked() - Check for idle whether BH processing is blocked + * + * Returns false if the per CPU softirq::cnt is 0 otherwise true. + * + * This is invoked from the idle task to guard against false positive + * softirq pending warnings, which would happen when the task which holds + * softirq_ctrl::lock was the only running task on the CPU and blocks on + * some other lock. + */ +bool local_bh_blocked(void) +{ + return __this_cpu_read(softirq_ctrl.cnt) != 0; +} + +void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) +{ + unsigned long flags; + int newcnt; + + WARN_ON_ONCE(in_hardirq()); + + /* First entry of a task into a BH disabled section? */ + if (!current->softirq_disable_cnt) { + if (preemptible()) { + local_lock(&softirq_ctrl.lock); + /* Required to meet the RCU bottomhalf requirements. */ + rcu_read_lock(); + } else { + DEBUG_LOCKS_WARN_ON(this_cpu_read(softirq_ctrl.cnt)); + } + } + + /* + * Track the per CPU softirq disabled state. On RT this is per CPU + * state to allow preemption of bottom half disabled sections. + */ + newcnt = __this_cpu_add_return(softirq_ctrl.cnt, cnt); + /* + * Reflect the result in the task state to prevent recursion on the + * local lock and to make softirq_count() & al work. + */ + current->softirq_disable_cnt = newcnt; + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && newcnt == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_off(ip); + raw_local_irq_restore(flags); + } +} +EXPORT_SYMBOL(__local_bh_disable_ip); + +static void __local_bh_enable(unsigned int cnt, bool unlock) +{ + unsigned long flags; + int newcnt; + + DEBUG_LOCKS_WARN_ON(current->softirq_disable_cnt != + this_cpu_read(softirq_ctrl.cnt)); + + if (IS_ENABLED(CONFIG_TRACE_IRQFLAGS) && softirq_count() == cnt) { + raw_local_irq_save(flags); + lockdep_softirqs_on(_RET_IP_); + raw_local_irq_restore(flags); + } + + newcnt = __this_cpu_sub_return(softirq_ctrl.cnt, cnt); + current->softirq_disable_cnt = newcnt; + + if (!newcnt && unlock) { + rcu_read_unlock(); + local_unlock(&softirq_ctrl.lock); + } +} + +void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) +{ + bool preempt_on = preemptible(); + unsigned long flags; + u32 pending; + int curcnt; + + WARN_ON_ONCE(in_irq()); + lockdep_assert_irqs_enabled(); + + local_irq_save(flags); + curcnt = __this_cpu_read(softirq_ctrl.cnt); + + /* + * If this is not reenabling soft interrupts, no point in trying to + * run pending ones. + */ + if (curcnt != cnt) + goto out; + + pending = local_softirq_pending(); + if (!pending || ksoftirqd_running(pending)) + goto out; + + /* + * If this was called from non preemptible context, wake up the + * softirq daemon. + */ + if (!preempt_on) { + wakeup_softirqd(); + goto out; + } + + /* + * Adjust softirq count to SOFTIRQ_OFFSET which makes + * in_serving_softirq() become true. + */ + cnt = SOFTIRQ_OFFSET; + __local_bh_enable(cnt, false); + __do_softirq(); + +out: + __local_bh_enable(cnt, preempt_on); + local_irq_restore(flags); +} +EXPORT_SYMBOL(__local_bh_enable_ip); + +/* + * Invoked from ksoftirqd_run() outside of the interrupt disabled section + * to acquire the per CPU local lock for reentrancy protection. + */ +static inline void ksoftirqd_run_begin(void) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + local_irq_disable(); +} + +/* Counterpart to ksoftirqd_run_begin() */ +static inline void ksoftirqd_run_end(void) +{ + __local_bh_enable(SOFTIRQ_OFFSET, true); + WARN_ON_ONCE(in_interrupt()); + local_irq_enable(); +} + +static inline void softirq_handle_begin(void) { } +static inline void softirq_handle_end(void) { } +static inline bool should_wake_ksoftirqd(void) +{ + return !this_cpu_read(softirq_ctrl.cnt); +} + +static inline void invoke_softirq(void) +{ + if (should_wake_ksoftirqd()) + wakeup_softirqd(); +} + +#else /* CONFIG_PREEMPT_RT */ + +/* + * This one is for softirq.c-internal use, where hardirqs are disabled + * legitimately: + */ +#ifdef CONFIG_TRACE_IRQFLAGS void __local_bh_disable_ip(unsigned long ip, unsigned int cnt) { unsigned long flags; @@ -203,6 +390,78 @@ void __local_bh_enable_ip(unsigned long ip, unsigned int cnt) } EXPORT_SYMBOL(__local_bh_enable_ip); +static inline void softirq_handle_begin(void) +{ + __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); +} + +static inline void softirq_handle_end(void) +{ + __local_bh_enable(SOFTIRQ_OFFSET); + WARN_ON_ONCE(in_interrupt()); +} + +static inline void ksoftirqd_run_begin(void) +{ + local_irq_disable(); +} + +static inline void ksoftirqd_run_end(void) +{ + local_irq_enable(); +} + +static inline bool should_wake_ksoftirqd(void) +{ + return true; +} + +static inline void invoke_softirq(void) +{ + if (ksoftirqd_running(local_softirq_pending())) + return; + + if (!force_irqthreads) { +#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK + /* + * We can safely execute softirq on the current stack if + * it is the irq stack, because it should be near empty + * at this stage. + */ + __do_softirq(); +#else + /* + * Otherwise, irq_exit() is called on the task stack that can + * be potentially deep already. So call softirq in its own stack + * to prevent from any overrun. + */ + do_softirq_own_stack(); +#endif + } else { + wakeup_softirqd(); + } +} + +asmlinkage __visible void do_softirq(void) +{ + __u32 pending; + unsigned long flags; + + if (in_interrupt()) + return; + + local_irq_save(flags); + + pending = local_softirq_pending(); + + if (pending && !ksoftirqd_running(pending)) + do_softirq_own_stack(); + + local_irq_restore(flags); +} + +#endif /* !CONFIG_PREEMPT_RT */ + /* * We restart softirq processing for at most MAX_SOFTIRQ_RESTART times, * but break the loop if need_resched() is set or after 2 ms. @@ -270,10 +529,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) current->flags &= ~PF_MEMALLOC; pending = local_softirq_pending(); - account_irq_enter_time(current); - __local_bh_disable_ip(_RET_IP_, SOFTIRQ_OFFSET); + softirq_handle_begin(); in_hardirq = lockdep_softirq_start(); + account_softirq_enter(current); restart: /* Reset the pending bitmask before enabling irqs */ @@ -307,8 +566,10 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) pending >>= softirq_bit; } - if (__this_cpu_read(ksoftirqd) == current) + if (!IS_ENABLED(CONFIG_PREEMPT_RT) && + __this_cpu_read(ksoftirqd) == current) rcu_softirq_qs(); + local_irq_disable(); pending = local_softirq_pending(); @@ -320,46 +581,23 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) wakeup_softirqd(); } + account_softirq_exit(current); lockdep_softirq_end(in_hardirq); - account_irq_exit_time(current); - __local_bh_enable(SOFTIRQ_OFFSET); - WARN_ON_ONCE(in_interrupt()); + softirq_handle_end(); current_restore_flags(old_flags, PF_MEMALLOC); } -asmlinkage __visible void do_softirq(void) -{ - __u32 pending; - unsigned long flags; - - if (in_interrupt()) - return; - - local_irq_save(flags); - - pending = local_softirq_pending(); - - if (pending && !ksoftirqd_running(pending)) - do_softirq_own_stack(); - - local_irq_restore(flags); -} - /** * irq_enter_rcu - Enter an interrupt context with RCU watching */ void irq_enter_rcu(void) { - if (is_idle_task(current) && !in_interrupt()) { - /* - * Prevent raise_softirq from needlessly waking up ksoftirqd - * here, as softirq will be serviced on return from interrupt. - */ - local_bh_disable(); + __irq_enter_raw(); + + if (is_idle_task(current) && (irq_count() == HARDIRQ_OFFSET)) tick_irq_enter(); - _local_bh_enable(); - } - __irq_enter(); + + account_hardirq_enter(current); } /** @@ -371,32 +609,6 @@ void irq_enter(void) irq_enter_rcu(); } -static inline void invoke_softirq(void) -{ - if (ksoftirqd_running(local_softirq_pending())) - return; - - if (!force_irqthreads) { -#ifdef CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK - /* - * We can safely execute softirq on the current stack if - * it is the irq stack, because it should be near empty - * at this stage. - */ - __do_softirq(); -#else - /* - * Otherwise, irq_exit() is called on the task stack that can - * be potentially deep already. So call softirq in its own stack - * to prevent from any overrun. - */ - do_softirq_own_stack(); -#endif - } else { - wakeup_softirqd(); - } -} - static inline void tick_irq_exit(void) { #ifdef CONFIG_NO_HZ_COMMON @@ -417,7 +629,7 @@ static inline void __irq_exit_rcu(void) #else lockdep_assert_irqs_disabled(); #endif - account_irq_exit_time(current); + account_hardirq_exit(current); preempt_count_sub(HARDIRQ_OFFSET); if (!in_interrupt() && local_softirq_pending()) invoke_softirq(); @@ -466,7 +678,7 @@ inline void raise_softirq_irqoff(unsigned int nr) * Otherwise we wake up ksoftirqd to make sure we * schedule the softirq soon. */ - if (!in_interrupt()) + if (!in_interrupt() && should_wake_ksoftirqd()) wakeup_softirqd(); } @@ -532,6 +744,16 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) } EXPORT_SYMBOL(__tasklet_hi_schedule); +static inline bool tasklet_clear_sched(struct tasklet_struct *t) +{ + if (test_and_clear_bit(TASKLET_STATE_SCHED, &t->state)) { + wake_up_var(&t->state); + return true; + } + + return false; +} + static void tasklet_action_common(struct softirq_action *a, struct tasklet_head *tl_head, unsigned int softirq_nr) @@ -551,8 +773,7 @@ static void tasklet_action_common(struct softirq_action *a, if (tasklet_trylock(t)) { if (!atomic_read(&t->count)) { - if (!test_and_clear_bit(TASKLET_STATE_SCHED, - &t->state)) + if (!tasklet_clear_sched(t)) BUG(); if (t->use_callback) t->callback(t); @@ -607,21 +828,62 @@ void tasklet_init(struct tasklet_struct *t, } EXPORT_SYMBOL(tasklet_init); +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +/* + * Do not use in new code. Waiting for tasklets from atomic contexts is + * error prone and should be avoided. + */ +void tasklet_unlock_spin_wait(struct tasklet_struct *t) +{ + while (test_bit(TASKLET_STATE_RUN, &(t)->state)) { + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { + /* + * Prevent a live lock when current preempted soft + * interrupt processing or prevents ksoftirqd from + * running. If the tasklet runs on a different CPU + * then this has no effect other than doing the BH + * disable/enable dance for nothing. + */ + local_bh_disable(); + local_bh_enable(); + } else { + cpu_relax(); + } + } +} +EXPORT_SYMBOL(tasklet_unlock_spin_wait); +#endif + void tasklet_kill(struct tasklet_struct *t) { if (in_interrupt()) pr_notice("Attempt to kill tasklet from interrupt\n"); - while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) { - do { - yield(); - } while (test_bit(TASKLET_STATE_SCHED, &t->state)); - } + while (test_and_set_bit(TASKLET_STATE_SCHED, &t->state)) + wait_var_event(&t->state, !test_bit(TASKLET_STATE_SCHED, &t->state)); + tasklet_unlock_wait(t); - clear_bit(TASKLET_STATE_SCHED, &t->state); + tasklet_clear_sched(t); } EXPORT_SYMBOL(tasklet_kill); +#if defined(CONFIG_SMP) || defined(CONFIG_PREEMPT_RT) +void tasklet_unlock(struct tasklet_struct *t) +{ + smp_mb__before_atomic(); + clear_bit(TASKLET_STATE_RUN, &t->state); + smp_mb__after_atomic(); + wake_up_var(&t->state); +} +EXPORT_SYMBOL_GPL(tasklet_unlock); + +void tasklet_unlock_wait(struct tasklet_struct *t) +{ + wait_var_event(&t->state, !test_bit(TASKLET_STATE_RUN, &t->state)); +} +EXPORT_SYMBOL_GPL(tasklet_unlock_wait); +#endif + void __init softirq_init(void) { int cpu; @@ -644,18 +906,18 @@ static int ksoftirqd_should_run(unsigned int cpu) static void run_ksoftirqd(unsigned int cpu) { - local_irq_disable(); + ksoftirqd_run_begin(); if (local_softirq_pending()) { /* * We can safely run softirq on inline stack, as we are not deep * in the task stack here. */ __do_softirq(); - local_irq_enable(); + ksoftirqd_run_end(); cond_resched(); return; } - local_irq_enable(); + ksoftirqd_run_end(); } #ifdef CONFIG_HOTPLUG_CPU diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index dd5aeddbe..8bf1fb832 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -47,11 +47,27 @@ struct cpu_stopper { struct list_head works; /* list of pending works */ struct cpu_stop_work stop_work; /* for stop_cpus */ + unsigned long caller; + cpu_stop_fn_t fn; }; static DEFINE_PER_CPU(struct cpu_stopper, cpu_stopper); static bool stop_machine_initialized = false; +void print_stop_info(const char *log_lvl, struct task_struct *task) +{ + /* + * If @task is a stopper task, it cannot migrate and task_cpu() is + * stable. + */ + struct cpu_stopper *stopper = per_cpu_ptr(&cpu_stopper, task_cpu(task)); + + if (task != stopper->thread) + return; + + printk("%sStopper: %pS <- %pS\n", log_lvl, stopper->fn, (void *)stopper->caller); +} + /* static data for stop_cpus */ static DEFINE_MUTEX(stop_cpus_mutex); static bool stop_cpus_in_progress; @@ -128,7 +144,7 @@ static bool cpu_stop_queue_work(unsigned int cpu, struct cpu_stop_work *work) int stop_one_cpu(unsigned int cpu, cpu_stop_fn_t fn, void *arg) { struct cpu_stop_done done; - struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done }; + struct cpu_stop_work work = { .fn = fn, .arg = arg, .done = &done, .caller = _RET_IP_ }; cpu_stop_init_done(&done, 1); if (!cpu_stop_queue_work(cpu, &work)) @@ -344,7 +360,8 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * work1 = work2 = (struct cpu_stop_work){ .fn = multi_cpu_stop, .arg = &msdata, - .done = &done + .done = &done, + .caller = _RET_IP_, }; cpu_stop_init_done(&done, 2); @@ -380,7 +397,7 @@ int stop_two_cpus(unsigned int cpu1, unsigned int cpu2, cpu_stop_fn_t fn, void * bool stop_one_cpu_nowait(unsigned int cpu, cpu_stop_fn_t fn, void *arg, struct cpu_stop_work *work_buf) { - *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, }; + *work_buf = (struct cpu_stop_work){ .fn = fn, .arg = arg, .caller = _RET_IP_, }; return cpu_stop_queue_work(cpu, work_buf); } @@ -500,6 +517,8 @@ static void cpu_stopper_thread(unsigned int cpu) int ret; /* cpu stop callbacks must not sleep, make in_atomic() == T */ + stopper->caller = work->caller; + stopper->fn = fn; preempt_count_inc(); ret = fn(arg); if (done) { @@ -508,6 +527,8 @@ static void cpu_stopper_thread(unsigned int cpu) cpu_stop_signal_done(done); } preempt_count_dec(); + stopper->fn = NULL; + stopper->caller = 0; WARN_ONCE(preempt_count(), "cpu_stop: %ps(%p) leaked preempt count\n", fn, arg); goto repeat; diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c index 4ef90718c..6eb443234 100644 --- a/kernel/time/hrtimer.c +++ b/kernel/time/hrtimer.c @@ -2052,6 +2052,36 @@ SYSCALL_DEFINE2(nanosleep_time32, struct old_timespec32 __user *, rqtp, } #endif +#ifdef CONFIG_PREEMPT_RT +/* + * Sleep for 1 ms in hope whoever holds what we want will let it go. + */ +void cpu_chill(void) +{ + unsigned int freeze_flag = current->flags & PF_NOFREEZE; + struct task_struct *self = current; + ktime_t chill_time; + + raw_spin_lock_irq(&self->pi_lock); + self->saved_state = self->state; + __set_current_state_no_track(TASK_UNINTERRUPTIBLE); + raw_spin_unlock_irq(&self->pi_lock); + + chill_time = ktime_set(0, NSEC_PER_MSEC); + + current->flags |= PF_NOFREEZE; + schedule_hrtimeout(&chill_time, HRTIMER_MODE_REL_HARD); + if (!freeze_flag) + current->flags &= ~PF_NOFREEZE; + + raw_spin_lock_irq(&self->pi_lock); + __set_current_state_no_track(self->saved_state); + self->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(&self->pi_lock); +} +EXPORT_SYMBOL(cpu_chill); +#endif + /* * Functions related to boot-time initialization: */ diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index 2d7899700..e4e09ad9e 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -990,7 +990,7 @@ static bool can_stop_idle_tick(int cpu, struct tick_sched *ts) if (unlikely(local_softirq_pending())) { static int ratelimit; - if (ratelimit < 10 && + if (ratelimit < 10 && !local_bh_blocked() && (local_softirq_pending() & SOFTIRQ_STOP_IDLE_MASK)) { pr_warn("NOHZ tick-stop error: Non-RCU local softirq work is pending, handler #%02x!!!\n", (unsigned int) local_softirq_pending()); diff --git a/kernel/time/timer.c b/kernel/time/timer.c index 351420c23..2a9e0b89d 100644 --- a/kernel/time/timer.c +++ b/kernel/time/timer.c @@ -1287,7 +1287,7 @@ static void del_timer_wait_running(struct timer_list *timer) u32 tf; tf = READ_ONCE(timer->flags); - if (!(tf & TIMER_MIGRATING)) { + if (!(tf & (TIMER_MIGRATING | TIMER_IRQSAFE))) { struct timer_base *base = get_timer_base(tf); /* @@ -1371,6 +1371,13 @@ int del_timer_sync(struct timer_list *timer) */ WARN_ON(in_irq() && !(timer->flags & TIMER_IRQSAFE)); + /* + * Must be able to sleep on PREEMPT_RT because of the slowpath in + * del_timer_wait_running(). + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT) && !(timer->flags & TIMER_IRQSAFE)) + lockdep_assert_preemption_enabled(); + do { ret = try_to_del_timer_sync(timer); diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c index aced2292d..5bcf53436 100644 --- a/kernel/trace/trace.c +++ b/kernel/trace/trace.c @@ -2586,60 +2586,43 @@ enum print_line_t trace_handle_return(struct trace_seq *s) } EXPORT_SYMBOL_GPL(trace_handle_return); -unsigned int tracing_gen_ctx_flags(unsigned long irqflags) +static unsigned short migration_disable_value(void) { - unsigned int trace_flags = 0; +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT) + return current->migration_disabled; +#else + return 0; +#endif +} + +unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) +{ + unsigned int trace_flags = irqs_status; unsigned int pc; pc = preempt_count(); -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - if (irqs_disabled_flags(irqflags)) - trace_flags |= TRACE_FLAG_IRQS_OFF; -#else - trace_flags |= TRACE_FLAG_IRQS_NOSUPPORT; -#endif - if (pc & NMI_MASK) trace_flags |= TRACE_FLAG_NMI; if (pc & HARDIRQ_MASK) trace_flags |= TRACE_FLAG_HARDIRQ; - - if (pc & SOFTIRQ_OFFSET) + if (in_serving_softirq()) trace_flags |= TRACE_FLAG_SOFTIRQ; if (tif_need_resched()) trace_flags |= TRACE_FLAG_NEED_RESCHED; if (test_preempt_need_resched()) trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; - return (trace_flags << 16) | (pc & 0xff); -} -unsigned int tracing_gen_ctx(void) -{ - unsigned long irqflags; - -#ifdef CONFIG_TRACE_IRQFLAGS_SUPPORT - local_save_flags(irqflags); -#else - irqflags = 0; +#ifdef CONFIG_PREEMPT_LAZY + if (need_resched_lazy()) + trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; #endif - return tracing_gen_ctx_flags(irqflags); -} -unsigned int tracing_gen_ctx_dec(void) -{ - unsigned int trace_ctx; - - trace_ctx = tracing_gen_ctx(); - - /* - * Subtract one from the preeption counter if preemption is enabled, - * see trace_event_buffer_reserve()for details. - */ - if (IS_ENABLED(CONFIG_PREEMPTION)) - trace_ctx--; - return trace_ctx; + return (pc & 0xff) | + (migration_disable_value() & 0xff) << 8 | + (preempt_lazy_count() & 0xff) << 16 | + (trace_flags << 24); } struct ring_buffer_event * @@ -3835,14 +3818,17 @@ unsigned long trace_total_entries(struct trace_array *tr) static void print_lat_help_header(struct seq_file *m) { - seq_puts(m, "# _------=> CPU# \n" - "# / _-----=> irqs-off \n" - "# | / _----=> need-resched \n" - "# || / _---=> hardirq/softirq \n" - "# ||| / _--=> preempt-depth \n" - "# |||| / delay \n" - "# cmd pid ||||| time | caller \n" - "# \\ / ||||| \\ | / \n"); + seq_puts(m, "# _--------=> CPU# \n" + "# / _-------=> irqs-off \n" + "# | / _------=> need-resched \n" + "# || / _-----=> need-resched-lazy\n" + "# ||| / _----=> hardirq/softirq \n" + "# |||| / _---=> preempt-depth \n" + "# ||||| / _--=> preempt-lazy-depth\n" + "# |||||| / _-=> migrate-disable \n" + "# ||||||| / delay \n" + "# cmd pid |||||||| time | caller \n" + "# \\ / |||||||| \\ | / \n"); } static void print_event_info(struct array_buffer *buf, struct seq_file *m) @@ -3876,13 +3862,16 @@ static void print_func_help_header_irq(struct array_buffer *buf, struct seq_file print_event_info(buf, m); - seq_printf(m, "# %.*s _-----=> irqs-off\n", prec, space); - seq_printf(m, "# %.*s / _----=> need-resched\n", prec, space); - seq_printf(m, "# %.*s| / _---=> hardirq/softirq\n", prec, space); - seq_printf(m, "# %.*s|| / _--=> preempt-depth\n", prec, space); - seq_printf(m, "# %.*s||| / delay\n", prec, space); - seq_printf(m, "# TASK-PID %.*s CPU# |||| TIMESTAMP FUNCTION\n", prec, " TGID "); - seq_printf(m, "# | | %.*s | |||| | |\n", prec, " | "); + seq_printf(m, "# %.*s _-------=> irqs-off\n", prec, space); + seq_printf(m, "# %.*s / _------=> need-resched\n", prec, space); + seq_printf(m, "# %.*s| / _-----=> need-resched-lazy\n", prec, space); + seq_printf(m, "# %.*s|| / _----=> hardirq/softirq\n", prec, space); + seq_printf(m, "# %.*s||| / _---=> preempt-depth\n", prec, space); + seq_printf(m, "# %.*s|||| / _--=> preempt-lazy-depth\n", prec, space); + seq_printf(m, "# %.*s||||| / _-=> migrate-disable\n", prec, space); + seq_printf(m, "# %.*s|||||| / delay\n", prec, space); + seq_printf(m, "# TASK-PID %.*s CPU# ||||||| TIMESTAMP FUNCTION\n", prec, " TGID "); + seq_printf(m, "# | | %.*s | ||||||| | |\n", prec, " | "); } void @@ -9417,7 +9406,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) tracing_off(); local_irq_save(flags); - printk_nmi_direct_enter(); /* Simulate the iterator */ trace_init_global_iter(&iter); @@ -9497,7 +9485,6 @@ void ftrace_dump(enum ftrace_dump_mode oops_dump_mode) atomic_dec(&per_cpu_ptr(iter.array_buffer->data, cpu)->disabled); } atomic_dec(&dump_running); - printk_nmi_direct_exit(); local_irq_restore(flags); } EXPORT_SYMBOL_GPL(ftrace_dump); diff --git a/kernel/trace/trace.h b/kernel/trace/trace.h index c0596e250..44943a9be 100644 --- a/kernel/trace/trace.h +++ b/kernel/trace/trace.h @@ -141,25 +141,6 @@ struct kretprobe_trace_entry_head { unsigned long ret_ip; }; -/* - * trace_flag_type is an enumeration that holds different - * states when a trace occurs. These are: - * IRQS_OFF - interrupts were disabled - * IRQS_NOSUPPORT - arch does not support irqs_disabled_flags - * NEED_RESCHED - reschedule is requested - * HARDIRQ - inside an interrupt handler - * SOFTIRQ - inside a softirq handler - */ -enum trace_flag_type { - TRACE_FLAG_IRQS_OFF = 0x01, - TRACE_FLAG_IRQS_NOSUPPORT = 0x02, - TRACE_FLAG_NEED_RESCHED = 0x04, - TRACE_FLAG_HARDIRQ = 0x08, - TRACE_FLAG_SOFTIRQ = 0x10, - TRACE_FLAG_PREEMPT_RESCHED = 0x20, - TRACE_FLAG_NMI = 0x40, -}; - #define TRACE_BUF_SIZE 1024 struct trace_array; diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c index f4b11f609..a2abc0b40 100644 --- a/kernel/trace/trace_events.c +++ b/kernel/trace/trace_events.c @@ -183,6 +183,8 @@ static int trace_define_common_fields(void) __common_field(unsigned char, flags); __common_field(unsigned char, preempt_count); __common_field(int, pid); + __common_field(unsigned char, migrate_disable); + __common_field(unsigned char, preempt_lazy_count); return ret; } diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c index 7042544c5..c711eb334 100644 --- a/kernel/trace/trace_output.c +++ b/kernel/trace/trace_output.c @@ -441,6 +441,7 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) { char hardsoft_irq; char need_resched; + char need_resched_lazy; char irqs_off; int hardirq; int softirq; @@ -471,6 +472,9 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) break; } + need_resched_lazy = + (entry->flags & TRACE_FLAG_NEED_RESCHED_LAZY) ? 'L' : '.'; + hardsoft_irq = (nmi && hardirq) ? 'Z' : nmi ? 'z' : @@ -479,14 +483,25 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) softirq ? 's' : '.' ; - trace_seq_printf(s, "%c%c%c", - irqs_off, need_resched, hardsoft_irq); + trace_seq_printf(s, "%c%c%c%c", + irqs_off, need_resched, need_resched_lazy, + hardsoft_irq); if (entry->preempt_count) trace_seq_printf(s, "%x", entry->preempt_count); else trace_seq_putc(s, '.'); + if (entry->preempt_lazy_count) + trace_seq_printf(s, "%x", entry->preempt_lazy_count); + else + trace_seq_putc(s, '.'); + + if (entry->migrate_disable) + trace_seq_printf(s, "%x", entry->migrate_disable); + else + trace_seq_putc(s, '.'); + return !trace_seq_has_overflowed(s); } diff --git a/kernel/workqueue.c b/kernel/workqueue.c index af95ef023..a56c38ec4 100644 --- a/kernel/workqueue.c +++ b/kernel/workqueue.c @@ -4924,6 +4924,10 @@ static void unbind_workers(int cpu) pool->flags |= POOL_DISASSOCIATED; raw_spin_unlock_irq(&pool->lock); + + for_each_pool_worker(worker, pool) + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0); + mutex_unlock(&wq_pool_attach_mutex); /* diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug index f906df9db..c52d39d10 100644 --- a/lib/Kconfig.debug +++ b/lib/Kconfig.debug @@ -1366,7 +1366,7 @@ config DEBUG_ATOMIC_SLEEP config DEBUG_LOCKING_API_SELFTESTS bool "Locking API boot-time self-tests" - depends on DEBUG_KERNEL + depends on DEBUG_KERNEL && !PREEMPT_RT help Say Y here if you want the kernel to run a short self-test during bootup. The self-test checks whether common types of locking bugs diff --git a/lib/bug.c b/lib/bug.c index 4ab398a2d..9c681f29e 100644 --- a/lib/bug.c +++ b/lib/bug.c @@ -202,6 +202,7 @@ enum bug_trap_type report_bug(unsigned long bugaddr, struct pt_regs *regs) else pr_crit("Kernel BUG at %pB [verbose debug info unavailable]\n", (void *)bugaddr); + pr_flush(1000, true); return BUG_TRAP_TYPE_BUG; } diff --git a/lib/cpumask.c b/lib/cpumask.c index fb22fb266..c3c76b833 100644 --- a/lib/cpumask.c +++ b/lib/cpumask.c @@ -261,3 +261,21 @@ int cpumask_any_and_distribute(const struct cpumask *src1p, return next; } EXPORT_SYMBOL(cpumask_any_and_distribute); + +int cpumask_any_distribute(const struct cpumask *srcp) +{ + int next, prev; + + /* NOTE: our first selection will skip 0. */ + prev = __this_cpu_read(distribute_cpu_mask_prev); + + next = cpumask_next(prev, srcp); + if (next >= nr_cpu_ids) + next = cpumask_first(srcp); + + if (next < nr_cpu_ids) + __this_cpu_write(distribute_cpu_mask_prev, next); + + return next; +} +EXPORT_SYMBOL(cpumask_any_distribute); diff --git a/lib/debugobjects.c b/lib/debugobjects.c index 9e14ae023..083882a3c 100644 --- a/lib/debugobjects.c +++ b/lib/debugobjects.c @@ -557,7 +557,10 @@ __debug_object_init(void *addr, const struct debug_obj_descr *descr, int onstack struct debug_obj *obj; unsigned long flags; - fill_pool(); +#ifdef CONFIG_PREEMPT_RT + if (preempt_count() == 0 && !irqs_disabled()) +#endif + fill_pool(); db = get_bucket((unsigned long) addr); diff --git a/lib/dump_stack.c b/lib/dump_stack.c index a00ee6eed..f5a33b6f7 100644 --- a/lib/dump_stack.c +++ b/lib/dump_stack.c @@ -12,6 +12,7 @@ #include <linux/atomic.h> #include <linux/kexec.h> #include <linux/utsname.h> +#include <linux/stop_machine.h> static char dump_stack_arch_desc_str[128]; @@ -57,6 +58,7 @@ void dump_stack_print_info(const char *log_lvl) log_lvl, dump_stack_arch_desc_str); print_worker_info(log_lvl, current); + print_stop_info(log_lvl, current); } /** diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2f17b488d..7557bf7ec 100644 --- a/lib/irq_poll.c +++ b/lib/irq_poll.c @@ -37,6 +37,7 @@ void irq_poll_sched(struct irq_poll *iop) list_add_tail(&iop->list, this_cpu_ptr(&blk_cpu_iopoll)); raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(irq_poll_sched); @@ -72,6 +73,7 @@ void irq_poll_complete(struct irq_poll *iop) local_irq_save(flags); __irq_poll_complete(iop); local_irq_restore(flags); + preempt_check_resched_rt(); } EXPORT_SYMBOL(irq_poll_complete); @@ -96,6 +98,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) } local_irq_enable(); + preempt_check_resched_rt(); /* Even though interrupts have been re-enabled, this * access is safe because interrupts can only add new @@ -133,6 +136,7 @@ static void __latent_entropy irq_poll_softirq(struct softirq_action *h) __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); } /** @@ -196,6 +200,7 @@ static int irq_poll_cpu_dead(unsigned int cpu) this_cpu_ptr(&blk_cpu_iopoll)); __raise_softirq_irqoff(IRQ_POLL_SOFTIRQ); local_irq_enable(); + preempt_check_resched_rt(); return 0; } diff --git a/lib/locking-selftest.c b/lib/locking-selftest.c index 76c52b0b7..98c376b02 100644 --- a/lib/locking-selftest.c +++ b/lib/locking-selftest.c @@ -787,6 +787,8 @@ GENERATE_TESTCASE(init_held_rtmutex); #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_spin) +#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_hard_rlock) @@ -802,9 +804,12 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe1_soft_wlock) +#endif + #undef E1 #undef E2 +#ifndef CONFIG_PREEMPT_RT /* * Enabling hardirqs with a softirq-safe lock held: */ @@ -837,6 +842,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #undef E1 #undef E2 +#endif + /* * Enabling irqs with an irq-safe lock held: */ @@ -860,6 +867,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2A_rlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_spin) +#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_hard_rlock) @@ -875,6 +884,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) +#endif + #undef E1 #undef E2 @@ -906,6 +917,8 @@ GENERATE_PERMUTATIONS_2_EVENTS(irqsafe2B_soft_wlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_spin) +#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_hard_rlock) @@ -921,6 +934,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) +#endif + #undef E1 #undef E2 #undef E3 @@ -954,6 +969,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe3_soft_wlock) #include "locking-selftest-spin-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_spin) +#ifndef CONFIG_PREEMPT_RT + #include "locking-selftest-rlock-hardirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_hard_rlock) @@ -969,10 +986,14 @@ GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_rlock) #include "locking-selftest-wlock-softirq.h" GENERATE_PERMUTATIONS_3_EVENTS(irqsafe4_soft_wlock) +#endif + #undef E1 #undef E2 #undef E3 +#ifndef CONFIG_PREEMPT_RT + /* * read-lock / write-lock irq inversion. * @@ -1162,6 +1183,11 @@ GENERATE_PERMUTATIONS_3_EVENTS(W1W2_R2R3_R3W1) #undef E1 #undef E2 #undef E3 + +#endif + +#ifndef CONFIG_PREEMPT_RT + /* * read-lock / write-lock recursion that is actually safe. */ @@ -1208,6 +1234,8 @@ GENERATE_PERMUTATIONS_3_EVENTS(irq_read_recursion_soft_wlock) #undef E2 #undef E3 +#endif + /* * read-lock / write-lock recursion that is unsafe. */ @@ -2456,6 +2484,7 @@ void locking_selftest(void) printk(" --------------------------------------------------------------------------\n"); +#ifndef CONFIG_PREEMPT_RT /* * irq-context testcases: */ @@ -2470,6 +2499,28 @@ void locking_selftest(void) DO_TESTCASE_6x2x2RW("irq read-recursion #2", irq_read_recursion2); DO_TESTCASE_6x2x2RW("irq read-recursion #3", irq_read_recursion3); +#else + /* On -rt, we only do hardirq context test for raw spinlock */ + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 12); + DO_TESTCASE_1B("hard-irqs-on + irq-safe-A", irqsafe1_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 12); + DO_TESTCASE_1B("hard-safe-A + irqs-on", irqsafe2B_hard_spin, 21); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #1", irqsafe3_hard_spin, 321); + + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 123); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 132); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 213); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 231); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 312); + DO_TESTCASE_1B("hard-safe-A + unsafe-B #2", irqsafe4_hard_spin, 321); +#endif ww_tests(); force_read_lock_recursive = 0; diff --git a/lib/nmi_backtrace.c b/lib/nmi_backtrace.c index 8abe1870d..b09a490f5 100644 --- a/lib/nmi_backtrace.c +++ b/lib/nmi_backtrace.c @@ -75,12 +75,6 @@ void nmi_trigger_cpumask_backtrace(const cpumask_t *mask, touch_softlockup_watchdog(); } - /* - * Force flush any remote buffers that might be stuck in IRQ context - * and therefore could not run their irq_work. - */ - printk_safe_flush(); - clear_bit_unlock(0, &backtrace_flag); put_cpu(); } diff --git a/lib/scatterlist.c b/lib/scatterlist.c index a59778946..907f59045 100644 --- a/lib/scatterlist.c +++ b/lib/scatterlist.c @@ -892,7 +892,7 @@ void sg_miter_stop(struct sg_mapping_iter *miter) flush_kernel_dcache_page(miter->page); if (miter->__flags & SG_MITER_ATOMIC) { - WARN_ON_ONCE(preemptible()); + WARN_ON_ONCE(!pagefault_disabled()); kunmap_atomic(miter->addr); } else kunmap(miter->page); diff --git a/lib/smp_processor_id.c b/lib/smp_processor_id.c index 525222e4f..1c1dbd300 100644 --- a/lib/smp_processor_id.c +++ b/lib/smp_processor_id.c @@ -26,6 +26,11 @@ unsigned int check_preemption_disabled(const char *what1, const char *what2) if (current->nr_cpus_allowed == 1) goto out; +#ifdef CONFIG_SMP + if (current->migration_disabled) + goto out; +#endif + /* * It is valid to assume CPU-locality during early bootup: */ diff --git a/lib/test_lockup.c b/lib/test_lockup.c index f1a020bcc..864554e76 100644 --- a/lib/test_lockup.c +++ b/lib/test_lockup.c @@ -480,6 +480,21 @@ static int __init test_lockup_init(void) return -EINVAL; #ifdef CONFIG_DEBUG_SPINLOCK +#ifdef CONFIG_PREEMPT_RT + if (test_magic(lock_spinlock_ptr, + offsetof(spinlock_t, lock.wait_lock.magic), + SPINLOCK_MAGIC) || + test_magic(lock_rwlock_ptr, + offsetof(rwlock_t, rtmutex.wait_lock.magic), + SPINLOCK_MAGIC) || + test_magic(lock_mutex_ptr, + offsetof(struct mutex, lock.wait_lock.magic), + SPINLOCK_MAGIC) || + test_magic(lock_rwsem_ptr, + offsetof(struct rw_semaphore, rtmutex.wait_lock.magic), + SPINLOCK_MAGIC)) + return -EINVAL; +#else if (test_magic(lock_spinlock_ptr, offsetof(spinlock_t, rlock.magic), SPINLOCK_MAGIC) || @@ -493,6 +508,7 @@ static int __init test_lockup_init(void) offsetof(struct rw_semaphore, wait_lock.magic), SPINLOCK_MAGIC)) return -EINVAL; +#endif #endif if ((wait_state != TASK_RUNNING || diff --git a/mm/Kconfig b/mm/Kconfig index 4475bd9f8..9d225b5c2 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -404,7 +404,7 @@ config NOMMU_INITIAL_TRIM_EXCESS config TRANSPARENT_HUGEPAGE bool "Transparent Hugepage Support" - depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE + depends on HAVE_ARCH_TRANSPARENT_HUGEPAGE && !PREEMPT_RT select COMPACTION select XARRAY_MULTI help @@ -971,4 +971,7 @@ config MEMORY_RELIABLE source "mm/damon/Kconfig" +config KMAP_LOCAL + bool + endmenu diff --git a/mm/highmem.c b/mm/highmem.c index efe38ab47..16f3ecd4a 100644 --- a/mm/highmem.c +++ b/mm/highmem.c @@ -31,10 +31,6 @@ #include <asm/tlbflush.h> #include <linux/vmalloc.h> -#if defined(CONFIG_HIGHMEM) || defined(CONFIG_X86_32) -DEFINE_PER_CPU(int, __kmap_atomic_idx); -#endif - /* * Virtual_count is not a pure "count". * 0 means that it is not mapped, and has not been mapped @@ -108,9 +104,7 @@ static inline wait_queue_head_t *get_pkmap_wait_queue_head(unsigned int color) atomic_long_t _totalhigh_pages __read_mostly; EXPORT_SYMBOL(_totalhigh_pages); -EXPORT_PER_CPU_SYMBOL(__kmap_atomic_idx); - -unsigned int nr_free_highpages (void) +unsigned int __nr_free_highpages (void) { struct zone *zone; unsigned int pages = 0; @@ -147,7 +141,7 @@ pte_t * pkmap_page_table; do { spin_unlock(&kmap_lock); (void)(flags); } while (0) #endif -struct page *kmap_to_page(void *vaddr) +struct page *__kmap_to_page(void *vaddr) { unsigned long addr = (unsigned long)vaddr; @@ -158,7 +152,7 @@ struct page *kmap_to_page(void *vaddr) return virt_to_page(addr); } -EXPORT_SYMBOL(kmap_to_page); +EXPORT_SYMBOL(__kmap_to_page); static void flush_all_zero_pkmaps(void) { @@ -200,10 +194,7 @@ static void flush_all_zero_pkmaps(void) flush_tlb_kernel_range(PKMAP_ADDR(0), PKMAP_ADDR(LAST_PKMAP)); } -/** - * kmap_flush_unused - flush all unused kmap mappings in order to remove stray mappings - */ -void kmap_flush_unused(void) +void __kmap_flush_unused(void) { lock_kmap(); flush_all_zero_pkmaps(); @@ -367,7 +358,6 @@ void kunmap_high(struct page *page) if (need_wakeup) wake_up(pkmap_map_wait); } - EXPORT_SYMBOL(kunmap_high); #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -428,7 +418,249 @@ void zero_user_segments(struct page *page, unsigned start1, unsigned end1, } EXPORT_SYMBOL(zero_user_segments); #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ -#endif /* CONFIG_HIGHMEM */ +#endif /* CONFIG_HIGHMEM */ + +#ifdef CONFIG_KMAP_LOCAL + +#include <asm/kmap_size.h> + +/* + * With DEBUG_HIGHMEM the stack depth is doubled and every second + * slot is unused which acts as a guard page + */ +#ifdef CONFIG_DEBUG_HIGHMEM +# define KM_INCR 2 +#else +# define KM_INCR 1 +#endif + +static inline int kmap_local_idx_push(void) +{ + WARN_ON_ONCE(in_irq() && !irqs_disabled()); + current->kmap_ctrl.idx += KM_INCR; + BUG_ON(current->kmap_ctrl.idx >= KM_MAX_IDX); + return current->kmap_ctrl.idx - 1; +} + +static inline int kmap_local_idx(void) +{ + return current->kmap_ctrl.idx - 1; +} + +static inline void kmap_local_idx_pop(void) +{ + current->kmap_ctrl.idx -= KM_INCR; + BUG_ON(current->kmap_ctrl.idx < 0); +} + +#ifndef arch_kmap_local_post_map +# define arch_kmap_local_post_map(vaddr, pteval) do { } while (0) +#endif + +#ifndef arch_kmap_local_pre_unmap +# define arch_kmap_local_pre_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_post_unmap +# define arch_kmap_local_post_unmap(vaddr) do { } while (0) +#endif + +#ifndef arch_kmap_local_map_idx +#define arch_kmap_local_map_idx(idx, pfn) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_unmap_idx +#define arch_kmap_local_unmap_idx(idx, vaddr) kmap_local_calc_idx(idx) +#endif + +#ifndef arch_kmap_local_high_get +static inline void *arch_kmap_local_high_get(struct page *page) +{ + return NULL; +} +#endif + +/* Unmap a local mapping which was obtained by kmap_high_get() */ +static inline bool kmap_high_unmap_local(unsigned long vaddr) +{ +#ifdef ARCH_NEEDS_KMAP_HIGH_GET + if (vaddr >= PKMAP_ADDR(0) && vaddr < PKMAP_ADDR(LAST_PKMAP)) { + kunmap_high(pte_page(pkmap_page_table[PKMAP_NR(vaddr)])); + return true; + } +#endif + return false; +} + +static inline int kmap_local_calc_idx(int idx) +{ + return idx + KM_MAX_IDX * smp_processor_id(); +} + +static pte_t *__kmap_pte; + +static pte_t *kmap_get_pte(void) +{ + if (!__kmap_pte) + __kmap_pte = virt_to_kpte(__fix_to_virt(FIX_KMAP_BEGIN)); + return __kmap_pte; +} + +void *__kmap_local_pfn_prot(unsigned long pfn, pgprot_t prot) +{ + pte_t pteval, *kmap_pte = kmap_get_pte(); + unsigned long vaddr; + int idx; + + /* + * Disable migration so resulting virtual address is stable + * accross preemption. + */ + migrate_disable(); + preempt_disable(); + idx = arch_kmap_local_map_idx(kmap_local_idx_push(), pfn); + vaddr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + BUG_ON(!pte_none(*(kmap_pte - idx))); + pteval = pfn_pte(pfn, prot); + set_pte_at(&init_mm, vaddr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(vaddr, pteval); + current->kmap_ctrl.pteval[kmap_local_idx()] = pteval; + preempt_enable(); + + return (void *)vaddr; +} +EXPORT_SYMBOL_GPL(__kmap_local_pfn_prot); + +void *__kmap_local_page_prot(struct page *page, pgprot_t prot) +{ + void *kmap; + + if (!PageHighMem(page)) + return page_address(page); + + /* Try kmap_high_get() if architecture has it enabled */ + kmap = arch_kmap_local_high_get(page); + if (kmap) + return kmap; + + return __kmap_local_pfn_prot(page_to_pfn(page), prot); +} +EXPORT_SYMBOL(__kmap_local_page_prot); + +void kunmap_local_indexed(void *vaddr) +{ + unsigned long addr = (unsigned long) vaddr & PAGE_MASK; + pte_t *kmap_pte = kmap_get_pte(); + int idx; + + if (addr < __fix_to_virt(FIX_KMAP_END) || + addr > __fix_to_virt(FIX_KMAP_BEGIN)) { + /* + * Handle mappings which were obtained by kmap_high_get() + * first as the virtual address of such mappings is below + * PAGE_OFFSET. Warn for all other addresses which are in + * the user space part of the virtual address space. + */ + if (!kmap_high_unmap_local(addr)) + WARN_ON_ONCE(addr < PAGE_OFFSET); + return; + } + + preempt_disable(); + idx = arch_kmap_local_unmap_idx(kmap_local_idx(), addr); + WARN_ON_ONCE(addr != __fix_to_virt(FIX_KMAP_BEGIN + idx)); + + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + current->kmap_ctrl.pteval[kmap_local_idx()] = __pte(0); + kmap_local_idx_pop(); + preempt_enable(); + migrate_enable(); +} +EXPORT_SYMBOL(kunmap_local_indexed); + +/* + * Invoked before switch_to(). This is safe even when during or after + * clearing the maps an interrupt which needs a kmap_local happens because + * the task::kmap_ctrl.idx is not modified by the unmapping code so a + * nested kmap_local will use the next unused index and restore the index + * on unmap. The already cleared kmaps of the outgoing task are irrelevant + * because the interrupt context does not know about them. The same applies + * when scheduling back in for an interrupt which happens before the + * restore is complete. + */ +void __kmap_local_sched_out(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Clear kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* + * This is a horrible hack for XTENSA to calculate the + * coloured PTE index. Uses the PFN encoded into the pteval + * and the map index calculation because the actual mapped + * virtual address is not stored in task::kmap_ctrl. + * For any sane architecture this is optimized out. + */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + arch_kmap_local_pre_unmap(addr); + pte_clear(&init_mm, addr, kmap_pte - idx); + arch_kmap_local_post_unmap(addr); + } +} + +void __kmap_local_sched_in(void) +{ + struct task_struct *tsk = current; + pte_t *kmap_pte = kmap_get_pte(); + int i; + + /* Restore kmaps */ + for (i = 0; i < tsk->kmap_ctrl.idx; i++) { + pte_t pteval = tsk->kmap_ctrl.pteval[i]; + unsigned long addr; + int idx; + + /* With debug all even slots are unmapped and act as guard */ + if (IS_ENABLED(CONFIG_DEBUG_HIGHMEM) && !(i & 0x01)) { + WARN_ON_ONCE(!pte_none(pteval)); + continue; + } + if (WARN_ON_ONCE(pte_none(pteval))) + continue; + + /* See comment in __kmap_local_sched_out() */ + idx = arch_kmap_local_map_idx(i, pte_pfn(pteval)); + addr = __fix_to_virt(FIX_KMAP_BEGIN + idx); + set_pte_at(&init_mm, addr, kmap_pte - idx, pteval); + arch_kmap_local_post_map(addr, pteval); + } +} + +void kmap_local_fork(struct task_struct *tsk) +{ + if (WARN_ON_ONCE(tsk->kmap_ctrl.idx)) + memset(&tsk->kmap_ctrl, 0, sizeof(tsk->kmap_ctrl)); +} + +#endif #if defined(HASHED_PAGE_VIRTUAL) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2804fe9d3..45ea058a3 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -66,6 +66,7 @@ #include <net/sock.h> #include <net/ip.h> #include "slab.h" +#include <linux/local_lock.h> #include <linux/uaccess.h> @@ -100,6 +101,13 @@ static bool cgroup_memory_kswapd = false; DEFINE_STATIC_KEY_FALSE(memcg_kswapd_key); EXPORT_SYMBOL(memcg_kswapd_key); +struct event_lock { + local_lock_t l; +}; +static DEFINE_PER_CPU(struct event_lock, event_lock) = { + .l = INIT_LOCAL_LOCK(l), +}; + /* Whether legacy memory+swap accounting is active */ static bool do_memsw_account(void) { @@ -742,6 +750,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, pn = container_of(lruvec, struct mem_cgroup_per_node, lruvec); memcg = pn->memcg; + preempt_disable_rt(); /* Update memcg */ __this_cpu_add(memcg->vmstats_percpu->state[idx], val); @@ -749,6 +758,7 @@ void __mod_memcg_lruvec_state(struct lruvec *lruvec, enum node_stat_item idx, __this_cpu_add(pn->lruvec_stats_percpu->state[idx], val); memcg_rstat_updated(memcg); + preempt_enable_rt(); } /** @@ -2156,6 +2166,7 @@ void unlock_page_memcg(struct page *page) EXPORT_SYMBOL(unlock_page_memcg); struct memcg_stock_pcp { + local_lock_t lock; struct mem_cgroup *cached; /* this never be root cgroup */ unsigned int nr_pages; @@ -2207,7 +2218,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (nr_pages > MEMCG_CHARGE_BATCH) return ret; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.lock, flags); stock = this_cpu_ptr(&memcg_stock); if (memcg == stock->cached && stock->nr_pages >= nr_pages) { @@ -2215,7 +2226,7 @@ static bool consume_stock(struct mem_cgroup *memcg, unsigned int nr_pages) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.lock, flags); return ret; } @@ -2250,14 +2261,14 @@ static void drain_local_stock(struct work_struct *dummy) * The only protection from memory hotplug vs. drain_stock races is * that we always operate on local CPU stock here with IRQ disabled */ - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.lock, flags); stock = this_cpu_ptr(&memcg_stock); drain_obj_stock(stock); drain_stock(stock); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.lock, flags); } /* @@ -2269,7 +2280,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) struct memcg_stock_pcp *stock; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.lock, flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached != memcg) { /* reset if necessary */ @@ -2282,7 +2293,7 @@ static void refill_stock(struct mem_cgroup *memcg, unsigned int nr_pages) if (stock->nr_pages > MEMCG_CHARGE_BATCH) drain_stock(stock); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.lock, flags); } /* @@ -2302,7 +2313,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) * as well as workers from this path always operate on the local * per-cpu data. CPU up doesn't touch memcg_stock at all. */ - curcpu = get_cpu(); + curcpu = get_cpu_light(); for_each_online_cpu(cpu) { struct memcg_stock_pcp *stock = &per_cpu(memcg_stock, cpu); struct mem_cgroup *memcg; @@ -2325,7 +2336,7 @@ static void drain_all_stock(struct mem_cgroup *root_memcg) schedule_work_on(cpu, &stock->work); } } - put_cpu(); + put_cpu_light(); mutex_unlock(&percpu_charge_mutex); } @@ -3086,7 +3097,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) unsigned long flags; bool ret = false; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.lock, flags); stock = this_cpu_ptr(&memcg_stock); if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { @@ -3094,7 +3105,7 @@ static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) ret = true; } - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.lock, flags); return ret; } @@ -3150,7 +3161,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) struct memcg_stock_pcp *stock; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&memcg_stock.lock, flags); stock = this_cpu_ptr(&memcg_stock); if (stock->cached_objcg != objcg) { /* reset if necessary */ @@ -3164,7 +3175,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) if (stock->nr_bytes > PAGE_SIZE) drain_obj_stock(stock); - local_irq_restore(flags); + local_unlock_irqrestore(&memcg_stock.lock, flags); } int obj_cgroup_charge(struct obj_cgroup *objcg, gfp_t gfp, size_t size) @@ -5990,12 +6001,12 @@ static int mem_cgroup_move_account(struct page *page, ret = 0; - local_irq_disable(); + local_lock_irq(&event_lock.l); mem_cgroup_charge_statistics(to, page, nr_pages); memcg_check_events(to, page); mem_cgroup_charge_statistics(from, page, -nr_pages); memcg_check_events(from, page); - local_irq_enable(); + local_unlock_irq(&event_lock.l); out_unlock: unlock_page(page); out: @@ -6967,10 +6978,10 @@ int mem_cgroup_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask) css_get(&memcg->css); commit_charge(page, memcg); - local_irq_disable(); + local_lock_irq(&event_lock.l); mem_cgroup_charge_statistics(memcg, page, nr_pages); memcg_check_events(memcg, page); - local_irq_enable(); + local_unlock_irq(&event_lock.l); /* * Cgroup1's unified memory+swap counter has been charged with the @@ -7026,11 +7037,11 @@ static void uncharge_batch(const struct uncharge_gather *ug) memcg_oom_recover(ug->memcg); } - local_irq_save(flags); + local_lock_irqsave(&event_lock.l, flags); __count_memcg_events(ug->memcg, PGPGOUT, ug->pgpgout); __this_cpu_add(ug->memcg->vmstats_percpu->nr_page_events, ug->nr_memory); memcg_check_events(ug->memcg, ug->dummy_page); - local_irq_restore(flags); + local_unlock_irqrestore(&event_lock.l, flags); /* drop reference from uncharge_page */ css_put(&ug->memcg->css); @@ -7202,10 +7213,10 @@ void mem_cgroup_migrate(struct page *oldpage, struct page *newpage) css_get(&memcg->css); commit_charge(newpage, memcg); - local_irq_save(flags); + local_lock_irqsave(&event_lock.l, flags); mem_cgroup_charge_statistics(memcg, newpage, nr_pages); memcg_check_events(memcg, newpage); - local_irq_restore(flags); + local_unlock_irqrestore(&event_lock.l, flags); } DEFINE_STATIC_KEY_FALSE(memcg_sockets_enabled_key); @@ -7342,9 +7353,13 @@ static int __init mem_cgroup_init(void) cpuhp_setup_state_nocalls(CPUHP_MM_MEMCQ_DEAD, "mm/memctrl:dead", NULL, memcg_hotplug_cpu_dead); - for_each_possible_cpu(cpu) - INIT_WORK(&per_cpu_ptr(&memcg_stock, cpu)->work, - drain_local_stock); + for_each_possible_cpu(cpu) { + struct memcg_stock_pcp *stock; + + stock = per_cpu_ptr(&memcg_stock, cpu); + INIT_WORK(&stock->work, drain_local_stock); + local_lock_init(&stock->lock); + } for_each_node(node) { struct mem_cgroup_tree_per_node *rtpn; @@ -7393,6 +7408,7 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) struct mem_cgroup *memcg, *swap_memcg; unsigned int nr_entries; unsigned short oldid; + unsigned long flags; VM_BUG_ON_PAGE(PageLRU(page), page); VM_BUG_ON_PAGE(page_count(page), page); @@ -7438,9 +7454,13 @@ void mem_cgroup_swapout(struct page *page, swp_entry_t entry) * important here to have the interrupts disabled because it is the * only synchronisation we have for updating the per-CPU variables. */ + local_lock_irqsave(&event_lock.l, flags); +#ifndef CONFIG_PREEMPT_RT VM_BUG_ON(!irqs_disabled()); +#endif mem_cgroup_charge_statistics(memcg, page, -nr_entries); memcg_check_events(memcg, page); + local_unlock_irqrestore(&event_lock.l, flags); css_put(&memcg->css); } diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 64da39a16..bb39c242f 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -61,6 +61,7 @@ #include <linux/hugetlb.h> #include <linux/sched/rt.h> #include <linux/sched/mm.h> +#include <linux/local_lock.h> #include <linux/page_owner.h> #include <linux/kthread.h> #include <linux/memcontrol.h> @@ -389,6 +390,13 @@ EXPORT_SYMBOL(nr_node_ids); EXPORT_SYMBOL(nr_online_nodes); #endif +struct pa_lock { + local_lock_t l; +}; +static DEFINE_PER_CPU(struct pa_lock, pa_lock) = { + .l = INIT_LOCAL_LOCK(l), +}; + int page_group_by_mobility_disabled __read_mostly; #ifdef CONFIG_DEFERRED_STRUCT_PAGE_INIT @@ -1333,7 +1341,7 @@ static inline void prefetch_buddy(struct page *page) } /* - * Frees a number of pages from the PCP lists + * Frees a number of pages which have been collected from the pcp lists. * Assumes all pages on list are in same zone, and of same order. * count is the number of pages to free. * @@ -1343,15 +1351,56 @@ static inline void prefetch_buddy(struct page *page) * And clear the zone's pages_scanned counter, to hold off the "all pages are * pinned" detection logic. */ -static void free_pcppages_bulk(struct zone *zone, int count, - struct per_cpu_pages *pcp) +static void free_pcppages_bulk(struct zone *zone, struct list_head *head, + bool zone_retry) +{ + bool isolated_pageblocks; + struct page *page, *tmp; + unsigned long flags; + + spin_lock_irqsave(&zone->lock, flags); + isolated_pageblocks = has_isolate_pageblock(zone); + + /* + * Use safe version since after __free_one_page(), + * page->lru.next will not point to original list. + */ + list_for_each_entry_safe(page, tmp, head, lru) { + int mt = get_pcppage_migratetype(page); + + if (page_zone(page) != zone) { + /* + * free_unref_page_list() sorts pages by zone. If we end + * up with pages from a different NUMA nodes belonging + * to the same ZONE index then we need to redo with the + * correct ZONE pointer. Skip the page for now, redo it + * on the next iteration. + */ + WARN_ON_ONCE(zone_retry == false); + if (zone_retry) + continue; + } + + /* MIGRATE_ISOLATE page should not go to pcplists */ + VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); + /* Pageblock could have been isolated meanwhile */ + if (unlikely(isolated_pageblocks)) + mt = get_pageblock_migratetype(page); + + list_del(&page->lru); + __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); + trace_mm_page_pcpu_drain(page, 0, mt); + } + spin_unlock_irqrestore(&zone->lock, flags); +} + +static void isolate_pcp_pages(int count, struct per_cpu_pages *pcp, + struct list_head *dst) { int migratetype = 0; int batch_free = 0; int prefetch_nr = READ_ONCE(pcp->batch); - bool isolated_pageblocks; - struct page *page, *tmp; - LIST_HEAD(head); + struct page *page; /* * Ensure proper count is passed which otherwise would stuck in the @@ -1388,7 +1437,7 @@ static void free_pcppages_bulk(struct zone *zone, int count, if (bulkfree_pcp_prepare(page)) continue; - list_add_tail(&page->lru, &head); + list_add_tail(&page->lru, dst); /* * We are going to put the page back to the global @@ -1405,26 +1454,6 @@ static void free_pcppages_bulk(struct zone *zone, int count, } } while (--count && --batch_free && !list_empty(list)); } - - spin_lock(&zone->lock); - isolated_pageblocks = has_isolate_pageblock(zone); - - /* - * Use safe version since after __free_one_page(), - * page->lru.next will not point to original list. - */ - list_for_each_entry_safe(page, tmp, &head, lru) { - int mt = get_pcppage_migratetype(page); - /* MIGRATE_ISOLATE page should not go to pcplists */ - VM_BUG_ON_PAGE(is_migrate_isolate(mt), page); - /* Pageblock could have been isolated meanwhile */ - if (unlikely(isolated_pageblocks)) - mt = get_pageblock_migratetype(page); - - __free_one_page(page, page_to_pfn(page), zone, 0, mt, FPI_NONE); - trace_mm_page_pcpu_drain(page, 0, mt); - } - spin_unlock(&zone->lock); } static void free_one_page(struct zone *zone, @@ -1526,11 +1555,11 @@ static void __free_pages_ok(struct page *page, unsigned int order, return; migratetype = get_pfnblock_migratetype(page, pfn); - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); __count_vm_events(PGFREE, 1 << order); free_one_page(page_zone(page), page, pfn, order, migratetype, fpi_flags); - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); } void __free_pages_core(struct page *page, unsigned int order) @@ -2941,13 +2970,18 @@ void drain_zone_pages(struct zone *zone, struct per_cpu_pages *pcp) { unsigned long flags; int to_drain, batch; + LIST_HEAD(dst); - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); batch = READ_ONCE(pcp->batch); to_drain = min(pcp->count, batch); if (to_drain > 0) - free_pcppages_bulk(zone, to_drain, pcp); - local_irq_restore(flags); + isolate_pcp_pages(to_drain, pcp, &dst); + + local_unlock_irqrestore(&pa_lock.l, flags); + + if (to_drain > 0) + free_pcppages_bulk(zone, &dst, false); } #endif @@ -2963,14 +2997,21 @@ static void drain_pages_zone(unsigned int cpu, struct zone *zone) unsigned long flags; struct per_cpu_pageset *pset; struct per_cpu_pages *pcp; + LIST_HEAD(dst); + int count; - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); pset = per_cpu_ptr(zone->pageset, cpu); pcp = &pset->pcp; - if (pcp->count) - free_pcppages_bulk(zone, pcp->count, pcp); - local_irq_restore(flags); + count = pcp->count; + if (count) + isolate_pcp_pages(count, pcp, &dst); + + local_unlock_irqrestore(&pa_lock.l, flags); + + if (count) + free_pcppages_bulk(zone, &dst, false); } /* @@ -3018,9 +3059,9 @@ static void drain_local_pages_wq(struct work_struct *work) * cpu which is allright but we also have to make sure to not move to * a different one. */ - preempt_disable(); + migrate_disable(); drain_local_pages(drain->zone); - preempt_enable(); + migrate_enable(); } /* @@ -3190,7 +3231,8 @@ static bool free_unref_page_prepare(struct page *page, unsigned long pfn) return true; } -static void free_unref_page_commit(struct page *page, unsigned long pfn) +static void free_unref_page_commit(struct page *page, unsigned long pfn, + struct list_head *dst) { struct zone *zone = page_zone(page); struct per_cpu_pages *pcp; @@ -3218,8 +3260,11 @@ static void free_unref_page_commit(struct page *page, unsigned long pfn) pcp = &this_cpu_ptr(zone->pageset)->pcp; list_add(&page->lru, &pcp->lists[migratetype]); pcp->count++; - if (pcp->count >= READ_ONCE(pcp->high)) - free_pcppages_bulk(zone, READ_ONCE(pcp->batch), pcp); + if (pcp->count >= READ_ONCE(pcp->high)) { + unsigned long batch = READ_ONCE(pcp->batch); + + isolate_pcp_pages(batch, pcp, dst); + } } /* @@ -3229,6 +3274,8 @@ void free_unref_page(struct page *page) { unsigned long flags; unsigned long pfn = page_to_pfn(page); + struct zone *zone = page_zone(page); + LIST_HEAD(dst); /* Free dynamic hugetlb page */ if (free_page_to_dhugetlb_pool(page)) @@ -3237,9 +3284,11 @@ void free_unref_page(struct page *page) if (!free_unref_page_prepare(page, pfn)) return; - local_irq_save(flags); - free_unref_page_commit(page, pfn); - local_irq_restore(flags); + local_lock_irqsave(&pa_lock.l, flags); + free_unref_page_commit(page, pfn, &dst); + local_unlock_irqrestore(&pa_lock.l, flags); + if (!list_empty(&dst)) + free_pcppages_bulk(zone, &dst, false); } /* @@ -3250,6 +3299,11 @@ void free_unref_page_list(struct list_head *list) struct page *page, *next; unsigned long flags, pfn; int batch_count = 0; + struct list_head dsts[__MAX_NR_ZONES]; + int i; + + for (i = 0; i < __MAX_NR_ZONES; i++) + INIT_LIST_HEAD(&dsts[i]); /* Free dynamic hugetlb page list */ free_page_list_to_dhugetlb_pool(list); @@ -3262,25 +3316,42 @@ void free_unref_page_list(struct list_head *list) set_page_private(page, pfn); } - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); list_for_each_entry_safe(page, next, list, lru) { unsigned long pfn = page_private(page); + enum zone_type type; set_page_private(page, 0); trace_mm_page_free_batched(page); - free_unref_page_commit(page, pfn); + type = page_zonenum(page); + free_unref_page_commit(page, pfn, &dsts[type]); /* * Guard against excessive IRQ disabled times when we get * a large list of pages to free. */ if (++batch_count == SWAP_CLUSTER_MAX) { - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); batch_count = 0; - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); } } - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); + + for (i = 0; i < __MAX_NR_ZONES; ) { + struct page *page; + struct zone *zone; + + if (list_empty(&dsts[i])) { + i++; + continue; + } + + page = list_first_entry(&dsts[i], struct page, lru); + zone = page_zone(page); + + free_pcppages_bulk(zone, &dsts[i], true); + } } /* @@ -3437,7 +3508,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, struct page *page; unsigned long flags; - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); pcp = &this_cpu_ptr(zone->pageset)->pcp; list = &pcp->lists[migratetype]; page = __rmqueue_pcplist(zone, migratetype, alloc_flags, pcp, list); @@ -3445,7 +3516,7 @@ static struct page *rmqueue_pcplist(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1); zone_statistics(preferred_zone, zone); } - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); return page; } @@ -3479,7 +3550,8 @@ struct page *rmqueue(struct zone *preferred_zone, * allocate greater than order-1 page units with __GFP_NOFAIL. */ WARN_ON_ONCE((gfp_flags & __GFP_NOFAIL) && (order > 1)); - spin_lock_irqsave(&zone->lock, flags); + local_lock_irqsave(&pa_lock.l, flags); + spin_lock(&zone->lock); do { page = NULL; @@ -3505,7 +3577,7 @@ struct page *rmqueue(struct zone *preferred_zone, __count_zid_vm_events(PGALLOC, page_zonenum(page), 1 << order); zone_statistics(preferred_zone, zone); - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); out: /* Separate test+clear to avoid unnecessary atomics */ @@ -3518,7 +3590,7 @@ struct page *rmqueue(struct zone *preferred_zone, return page; failed: - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); return NULL; } @@ -9064,7 +9136,7 @@ void zone_pcp_reset(struct zone *zone) struct per_cpu_pageset *pset; /* avoid races with drain_pages() */ - local_irq_save(flags); + local_lock_irqsave(&pa_lock.l, flags); if (zone->pageset != &boot_pageset) { for_each_online_cpu(cpu) { pset = per_cpu_ptr(zone->pageset, cpu); @@ -9073,7 +9145,7 @@ void zone_pcp_reset(struct zone *zone) free_percpu(zone->pageset); zone->pageset = &boot_pageset; } - local_irq_restore(flags); + local_unlock_irqrestore(&pa_lock.l, flags); } #ifdef CONFIG_MEMORY_HOTREMOVE diff --git a/mm/shmem.c b/mm/shmem.c index 746e48454..d8f03bb23 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -307,10 +307,10 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) ino_t ino; if (!(sb->s_flags & SB_KERNMOUNT)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); if (sbinfo->max_inodes) { if (!sbinfo->free_inodes) { - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return -ENOSPC; } sbinfo->free_inodes--; @@ -333,7 +333,7 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) } *inop = ino; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } else if (inop) { /* * __shmem_file_setup, one of our callers, is lock-free: it @@ -348,13 +348,14 @@ static int shmem_reserve_inode(struct super_block *sb, ino_t *inop) * to worry about things like glibc compatibility. */ ino_t *next_ino; + next_ino = per_cpu_ptr(sbinfo->ino_batch, get_cpu()); ino = *next_ino; if (unlikely(ino % SHMEM_INO_BATCH == 0)) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); ino = sbinfo->next_ino; sbinfo->next_ino += SHMEM_INO_BATCH; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); if (unlikely(is_zero_ino(ino))) ino++; } @@ -370,9 +371,9 @@ static void shmem_free_inode(struct super_block *sb) { struct shmem_sb_info *sbinfo = SHMEM_SB(sb); if (sbinfo->max_inodes) { - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); sbinfo->free_inodes++; - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } } @@ -1480,10 +1481,10 @@ static struct mempolicy *shmem_get_sbmpol(struct shmem_sb_info *sbinfo) { struct mempolicy *mpol = NULL; if (sbinfo->mpol) { - spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ + raw_spin_lock(&sbinfo->stat_lock); /* prevent replace/use races */ mpol = sbinfo->mpol; mpol_get(mpol); - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); } return mpol; } @@ -3534,9 +3535,10 @@ static int shmem_reconfigure(struct fs_context *fc) struct shmem_options *ctx = fc->fs_private; struct shmem_sb_info *sbinfo = SHMEM_SB(fc->root->d_sb); unsigned long inodes; + struct mempolicy *mpol = NULL; const char *err; - spin_lock(&sbinfo->stat_lock); + raw_spin_lock(&sbinfo->stat_lock); inodes = sbinfo->max_inodes - sbinfo->free_inodes; if ((ctx->seen & SHMEM_SEEN_BLOCKS) && ctx->blocks) { if (!sbinfo->max_blocks) { @@ -3581,14 +3583,15 @@ static int shmem_reconfigure(struct fs_context *fc) * Preserve previous mempolicy unless mpol remount option was specified. */ if (ctx->mpol) { - mpol_put(sbinfo->mpol); + mpol = sbinfo->mpol; sbinfo->mpol = ctx->mpol; /* transfers initial ref */ ctx->mpol = NULL; } - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); + mpol_put(mpol); return 0; out: - spin_unlock(&sbinfo->stat_lock); + raw_spin_unlock(&sbinfo->stat_lock); return invalfc(fc, "%s", err); } @@ -3705,7 +3708,7 @@ static int shmem_fill_super(struct super_block *sb, struct fs_context *fc) sbinfo->mpol = ctx->mpol; ctx->mpol = NULL; - spin_lock_init(&sbinfo->stat_lock); + raw_spin_lock_init(&sbinfo->stat_lock); if (percpu_counter_init(&sbinfo->used_blocks, 0, GFP_KERNEL)) goto failed; spin_lock_init(&sbinfo->shrinklist_lock); diff --git a/mm/slab.c b/mm/slab.c index d152f910d..a3b51ad67 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -234,7 +234,7 @@ static void kmem_cache_node_init(struct kmem_cache_node *parent) parent->shared = NULL; parent->alien = NULL; parent->colour_next = 0; - spin_lock_init(&parent->list_lock); + raw_spin_lock_init(&parent->list_lock); parent->free_objects = 0; parent->free_touched = 0; } @@ -559,9 +559,9 @@ static noinline void cache_free_pfmemalloc(struct kmem_cache *cachep, page_node = page_to_nid(page); n = get_node(cachep, page_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, page_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } @@ -699,7 +699,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, struct kmem_cache_node *n = get_node(cachep, node); if (ac->avail) { - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); /* * Stuff objects into the remote nodes shared array first. * That way we could avoid the overhead of putting the objects @@ -710,7 +710,7 @@ static void __drain_alien_cache(struct kmem_cache *cachep, free_block(cachep, ac->entry, ac->avail, node, list); ac->avail = 0; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); } } @@ -783,9 +783,9 @@ static int __cache_free_alien(struct kmem_cache *cachep, void *objp, slabs_destroy(cachep, &list); } else { n = get_node(cachep, page_node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, &objp, 1, page_node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); slabs_destroy(cachep, &list); } return 1; @@ -826,10 +826,10 @@ static int init_cache_node(struct kmem_cache *cachep, int node, gfp_t gfp) */ n = get_node(cachep, node); if (n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); n->free_limit = (1 + nr_cpus_node(node)) * cachep->batchcount + cachep->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); return 0; } @@ -908,7 +908,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, goto fail; n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); if (n->shared && force_change) { free_block(cachep, n->shared->entry, n->shared->avail, node, &list); @@ -926,7 +926,7 @@ static int setup_kmem_cache_node(struct kmem_cache *cachep, new_alien = NULL; } - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); /* @@ -965,7 +965,7 @@ static void cpuup_canceled(long cpu) if (!n) continue; - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); /* Free limit for this kmem_cache_node */ n->free_limit -= cachep->batchcount; @@ -976,7 +976,7 @@ static void cpuup_canceled(long cpu) nc->avail = 0; if (!cpumask_empty(mask)) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto free_slab; } @@ -990,7 +990,7 @@ static void cpuup_canceled(long cpu) alien = n->alien; n->alien = NULL; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); kfree(shared); if (alien) { @@ -1174,7 +1174,7 @@ static void __init init_list(struct kmem_cache *cachep, struct kmem_cache_node * /* * Do not assume that spinlocks can be initialized via memcpy: */ - spin_lock_init(&ptr->list_lock); + raw_spin_lock_init(&ptr->list_lock); MAKE_ALL_LISTS(cachep, ptr, nodeid); cachep->node[nodeid] = ptr; @@ -1345,11 +1345,11 @@ slab_out_of_memory(struct kmem_cache *cachep, gfp_t gfpflags, int nodeid) for_each_kmem_cache_node(cachep, node, n) { unsigned long total_slabs, free_slabs, free_objs; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); total_slabs = n->total_slabs; free_slabs = n->free_slabs; free_objs = n->free_objects; - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); pr_warn(" node %d: slabs: %ld/%ld, objs: %ld/%ld\n", node, total_slabs - free_slabs, total_slabs, @@ -2106,7 +2106,7 @@ static void check_spinlock_acquired(struct kmem_cache *cachep) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); + assert_raw_spin_locked(&get_node(cachep, numa_mem_id())->list_lock); #endif } @@ -2114,7 +2114,7 @@ static void check_spinlock_acquired_node(struct kmem_cache *cachep, int node) { #ifdef CONFIG_SMP check_irq_off(); - assert_spin_locked(&get_node(cachep, node)->list_lock); + assert_raw_spin_locked(&get_node(cachep, node)->list_lock); #endif } @@ -2154,9 +2154,9 @@ static void do_drain(void *arg) check_irq_off(); ac = cpu_cache_get(cachep); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail = 0; slabs_destroy(cachep, &list); } @@ -2174,9 +2174,9 @@ static void drain_cpu_caches(struct kmem_cache *cachep) drain_alien_cache(cachep, n->alien); for_each_kmem_cache_node(cachep, node, n) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, n->shared, node, true, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -2198,10 +2198,10 @@ static int drain_freelist(struct kmem_cache *cache, nr_freed = 0; while (nr_freed < tofree && !list_empty(&n->slabs_free)) { - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); p = n->slabs_free.prev; if (p == &n->slabs_free) { - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); goto out; } @@ -2214,7 +2214,7 @@ static int drain_freelist(struct kmem_cache *cache, * to the cache. */ n->free_objects -= cache->num; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slab_destroy(cache, page); nr_freed++; } @@ -2650,7 +2650,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) INIT_LIST_HEAD(&page->slab_list); n = get_node(cachep, page_to_nid(page)); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); n->total_slabs++; if (!page->active) { list_add_tail(&page->slab_list, &n->slabs_free); @@ -2660,7 +2660,7 @@ static void cache_grow_end(struct kmem_cache *cachep, struct page *page) STATS_INC_GROWN(cachep); n->free_objects += cachep->num - page->active; - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); } @@ -2826,7 +2826,7 @@ static struct page *get_first_slab(struct kmem_cache_node *n, bool pfmemalloc) { struct page *page; - assert_spin_locked(&n->list_lock); + assert_raw_spin_locked(&n->list_lock); page = list_first_entry_or_null(&n->slabs_partial, struct page, slab_list); if (!page) { @@ -2853,10 +2853,10 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, if (!gfp_pfmemalloc_allowed(flags)) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); page = get_first_slab(n, true); if (!page) { - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return NULL; } @@ -2865,7 +2865,7 @@ static noinline void *cache_alloc_pfmemalloc(struct kmem_cache *cachep, fixup_slab_list(cachep, n, page, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; @@ -2924,7 +2924,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) if (!n->free_objects && (!shared || !shared->avail)) goto direct_grow; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); shared = READ_ONCE(n->shared); /* See if we can refill from the shared array */ @@ -2948,7 +2948,7 @@ static void *cache_alloc_refill(struct kmem_cache *cachep, gfp_t flags) must_grow: n->free_objects -= ac->avail; alloc_done: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); direct_grow: @@ -3173,7 +3173,7 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, BUG_ON(!n); check_irq_off(); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); page = get_first_slab(n, false); if (!page) goto must_grow; @@ -3191,12 +3191,12 @@ static void *____cache_alloc_node(struct kmem_cache *cachep, gfp_t flags, fixup_slab_list(cachep, n, page, &list); - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); fixup_objfreelist_debug(cachep, &list); return obj; must_grow: - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); page = cache_grow_begin(cachep, gfp_exact_node(flags), nodeid); if (page) { /* This slab isn't counted yet so don't update free_objects */ @@ -3384,7 +3384,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) check_irq_off(); n = get_node(cachep, node); - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); if (n->shared) { struct array_cache *shared_array = n->shared; int max = shared_array->limit - shared_array->avail; @@ -3413,7 +3413,7 @@ static void cache_flusharray(struct kmem_cache *cachep, struct array_cache *ac) STATS_SET_FREEABLE(cachep, i); } #endif - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); ac->avail -= batchcount; memmove(ac->entry, &(ac->entry[batchcount]), sizeof(void *)*ac->avail); slabs_destroy(cachep, &list); @@ -3848,9 +3848,9 @@ static int do_tune_cpucache(struct kmem_cache *cachep, int limit, node = cpu_to_mem(cpu); n = get_node(cachep, node); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); free_block(cachep, ac->entry, ac->avail, node, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } free_percpu(prev); @@ -3945,9 +3945,9 @@ static void drain_array(struct kmem_cache *cachep, struct kmem_cache_node *n, return; } - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); drain_array_locked(cachep, ac, node, false, &list); - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); slabs_destroy(cachep, &list); } @@ -4031,7 +4031,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) for_each_kmem_cache_node(cachep, node, n) { check_irq_on(); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); total_slabs += n->total_slabs; free_slabs += n->free_slabs; @@ -4040,7 +4040,7 @@ void get_slabinfo(struct kmem_cache *cachep, struct slabinfo *sinfo) if (n->shared) shared_avail += n->shared->avail; - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); } num_objs = total_slabs * cachep->num; active_slabs = total_slabs - free_slabs; diff --git a/mm/slab.h b/mm/slab.h index 8414c3451..d937f8673 100644 --- a/mm/slab.h +++ b/mm/slab.h @@ -523,7 +523,7 @@ static inline void slab_post_alloc_hook(struct kmem_cache *s, * The slab lists for all objects. */ struct kmem_cache_node { - spinlock_t list_lock; + raw_spinlock_t list_lock; #ifdef CONFIG_SLAB struct list_head slabs_partial; /* partial list first, better asm code */ diff --git a/mm/slub.c b/mm/slub.c index 7a7b0bf82..c0cc11f82 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -458,7 +458,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, #ifdef CONFIG_SLUB_DEBUG static unsigned long object_map[BITS_TO_LONGS(MAX_OBJS_PER_PAGE)]; -static DEFINE_SPINLOCK(object_map_lock); +static DEFINE_RAW_SPINLOCK(object_map_lock); /* * Determine a map of object in use on a page. @@ -474,7 +474,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) VM_BUG_ON(!irqs_disabled()); - spin_lock(&object_map_lock); + raw_spin_lock(&object_map_lock); bitmap_zero(object_map, page->objects); @@ -487,7 +487,7 @@ static unsigned long *get_map(struct kmem_cache *s, struct page *page) static void put_map(unsigned long *map) __releases(&object_map_lock) { VM_BUG_ON(map != object_map); - spin_unlock(&object_map_lock); + raw_spin_unlock(&object_map_lock); } static inline unsigned int size_from_object(struct kmem_cache *s) @@ -1238,7 +1238,7 @@ static noinline int free_debug_processing( unsigned long flags; int ret = 0; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); slab_lock(page); if (s->flags & SLAB_CONSISTENCY_CHECKS) { @@ -1273,7 +1273,7 @@ static noinline int free_debug_processing( bulk_cnt, cnt); slab_unlock(page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); if (!ret) slab_fix(s, "Object at 0x%p not freed", object); return ret; @@ -1521,6 +1521,12 @@ static bool freelist_corrupted(struct kmem_cache *s, struct page *page, } #endif /* CONFIG_SLUB_DEBUG */ +struct slub_free_list { + raw_spinlock_t lock; + struct list_head list; +}; +static DEFINE_PER_CPU(struct slub_free_list, slub_free_list); + /* * Hooks for other subsystems that check memory allocations. In a typical * production configuration these hooks all should produce no code at all. @@ -1776,10 +1782,18 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) void *start, *p, *next; int idx; bool shuffle; + bool enableirqs = false; flags &= gfp_allowed_mask; if (gfpflags_allow_blocking(flags)) + enableirqs = true; + +#ifdef CONFIG_PREEMPT_RT + if (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND) + enableirqs = true; +#endif + if (enableirqs) local_irq_enable(); flags |= s->allocflags; @@ -1838,7 +1852,7 @@ static struct page *allocate_slab(struct kmem_cache *s, gfp_t flags, int node) page->frozen = 1; out: - if (gfpflags_allow_blocking(flags)) + if (enableirqs) local_irq_disable(); if (!page) return NULL; @@ -1881,6 +1895,16 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } +static void free_delayed(struct list_head *h) +{ + while (!list_empty(h)) { + struct page *page = list_first_entry(h, struct page, lru); + + list_del(&page->lru); + __free_slab(page->slab_cache, page); + } +} + static void rcu_free_slab(struct rcu_head *h) { struct page *page = container_of(h, struct page, rcu_head); @@ -1892,6 +1916,12 @@ static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { call_rcu(&page->rcu_head, rcu_free_slab); + } else if (irqs_disabled()) { + struct slub_free_list *f = this_cpu_ptr(&slub_free_list); + + raw_spin_lock(&f->lock); + list_add(&page->lru, &f->list); + raw_spin_unlock(&f->lock); } else __free_slab(s, page); } @@ -1999,7 +2029,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, if (!n || !n->nr_partial) return NULL; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); list_for_each_entry_safe(page, page2, &n->partial, slab_list) { void *t; @@ -2024,7 +2054,7 @@ static void *get_partial_node(struct kmem_cache *s, struct kmem_cache_node *n, break; } - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); return object; } @@ -2267,7 +2297,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * that acquire_slab() will see a slab page that * is frozen */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } } else { m = M_FULL; @@ -2279,7 +2309,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, * slabs from diagnostic functions will not see * any frozen slabs. */ - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } #endif } @@ -2304,7 +2334,7 @@ static void deactivate_slab(struct kmem_cache *s, struct page *page, goto redo; if (lock) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); if (m == M_PARTIAL) stat(s, tail); @@ -2343,10 +2373,10 @@ static void unfreeze_partials(struct kmem_cache *s, n2 = get_node(s, page_to_nid(page)); if (n != n2) { if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); n = n2; - spin_lock(&n->list_lock); + raw_spin_lock(&n->list_lock); } do { @@ -2375,7 +2405,7 @@ static void unfreeze_partials(struct kmem_cache *s, } if (n) - spin_unlock(&n->list_lock); + raw_spin_unlock(&n->list_lock); while (discard_page) { page = discard_page; @@ -2412,14 +2442,21 @@ static void put_cpu_partial(struct kmem_cache *s, struct page *page, int drain) pobjects = oldpage->pobjects; pages = oldpage->pages; if (drain && pobjects > slub_cpu_partial(s)) { + struct slub_free_list *f; unsigned long flags; + LIST_HEAD(tofree); /* * partial array is full. Move the existing * set to the per node partial list. */ local_irq_save(flags); unfreeze_partials(s, this_cpu_ptr(s->cpu_slab)); + f = this_cpu_ptr(&slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock(&f->lock); local_irq_restore(flags); + free_delayed(&tofree); oldpage = NULL; pobjects = 0; pages = 0; @@ -2487,7 +2524,19 @@ static bool has_cpu_slab(int cpu, void *info) static void flush_all(struct kmem_cache *s) { + LIST_HEAD(tofree); + int cpu; + on_each_cpu_cond(has_cpu_slab, flush_cpu_slab, s, 1); + for_each_online_cpu(cpu) { + struct slub_free_list *f; + + f = &per_cpu(slub_free_list, cpu); + raw_spin_lock_irq(&f->lock); + list_splice_init(&f->list, &tofree); + raw_spin_unlock_irq(&f->lock); + free_delayed(&tofree); + } } /* @@ -2542,10 +2591,10 @@ static unsigned long count_partial(struct kmem_cache_node *n, unsigned long x = 0; struct page *page; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) x += get_count(page); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return x; } #endif /* CONFIG_SLUB_DEBUG || CONFIG_SYSFS */ @@ -2684,8 +2733,10 @@ static inline void *get_freelist(struct kmem_cache *s, struct page *page) * already disabled (which is the case for bulk allocation). */ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, - unsigned long addr, struct kmem_cache_cpu *c) + unsigned long addr, struct kmem_cache_cpu *c, + struct list_head *to_free) { + struct slub_free_list *f; void *freelist; struct page *page; @@ -2753,6 +2804,13 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, VM_BUG_ON(!c->page->frozen); c->freelist = get_freepointer(s, freelist); c->tid = next_tid(c->tid); + +out: + f = this_cpu_ptr(&slub_free_list); + raw_spin_lock(&f->lock); + list_splice_init(&f->list, to_free); + raw_spin_unlock(&f->lock); + return freelist; new_slab: @@ -2768,7 +2826,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, if (unlikely(!freelist)) { slab_out_of_memory(s, gfpflags, node); - return NULL; + goto out; } page = c->page; @@ -2781,7 +2839,7 @@ static void *___slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, goto new_slab; /* Slab failed checks. Next slab needed */ deactivate_slab(s, page, get_freepointer(s, freelist), c); - return freelist; + goto out; } /* @@ -2793,6 +2851,7 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, { void *p; unsigned long flags; + LIST_HEAD(tofree); local_irq_save(flags); #ifdef CONFIG_PREEMPTION @@ -2804,8 +2863,9 @@ static void *__slab_alloc(struct kmem_cache *s, gfp_t gfpflags, int node, c = this_cpu_ptr(s->cpu_slab); #endif - p = ___slab_alloc(s, gfpflags, node, addr, c); + p = ___slab_alloc(s, gfpflags, node, addr, c, &tofree); local_irq_restore(flags); + free_delayed(&tofree); return p; } @@ -2839,6 +2899,10 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, unsigned long tid; struct obj_cgroup *objcg = NULL; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) + WARN_ON_ONCE(!preemptible() && + (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); + s = slab_pre_alloc_hook(s, &objcg, 1, gfpflags); if (!s) return NULL; @@ -3013,7 +3077,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, do { if (unlikely(n)) { - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); n = NULL; } prior = page->freelist; @@ -3045,7 +3109,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, * Otherwise the list_lock will synchronize with * other processors updating the list of slabs. */ - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); } } @@ -3087,7 +3151,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, add_partial(n, page, DEACTIVATE_TO_TAIL); stat(s, FREE_ADD_PARTIAL); } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return; slab_empty: @@ -3102,7 +3166,7 @@ static void __slab_free(struct kmem_cache *s, struct page *page, remove_full(s, n, page); } - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); stat(s, FREE_SLAB); discard_slab(s, page); } @@ -3327,9 +3391,14 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, void **p) { struct kmem_cache_cpu *c; + LIST_HEAD(to_free); int i; struct obj_cgroup *objcg = NULL; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && IS_ENABLED(CONFIG_DEBUG_ATOMIC_SLEEP)) + WARN_ON_ONCE(!preemptible() && + (system_state > SYSTEM_BOOTING && system_state < SYSTEM_SUSPEND)); + /* memcg and kmem_cache debug support */ s = slab_pre_alloc_hook(s, &objcg, size, flags); if (unlikely(!s)) @@ -3366,7 +3435,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, * of re-populating per CPU c->freelist */ p[i] = ___slab_alloc(s, flags, NUMA_NO_NODE, - _RET_IP_, c); + _RET_IP_, c, &to_free); if (unlikely(!p[i])) goto error; @@ -3381,6 +3450,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, } c->tid = next_tid(c->tid); local_irq_enable(); + free_delayed(&to_free); /* Clear memory outside IRQ disabled fastpath loop */ if (unlikely(slab_want_init_on_alloc(flags, s))) { @@ -3395,6 +3465,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, return i; error: local_irq_enable(); + free_delayed(&to_free); slab_post_alloc_hook(s, objcg, flags, i, p); __kmem_cache_free_bulk(s, i, p); return 0; @@ -3530,7 +3601,7 @@ static void init_kmem_cache_node(struct kmem_cache_node *n) { n->nr_partial = 0; - spin_lock_init(&n->list_lock); + raw_spin_lock_init(&n->list_lock); INIT_LIST_HEAD(&n->partial); #ifdef CONFIG_SLUB_DEBUG atomic_long_set(&n->nr_slabs, 0); @@ -3925,7 +3996,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) struct page *page, *h; BUG_ON(irqs_disabled()); - spin_lock_irq(&n->list_lock); + raw_spin_lock_irq(&n->list_lock); list_for_each_entry_safe(page, h, &n->partial, slab_list) { if (!page->inuse) { remove_partial(n, page); @@ -3935,7 +4006,7 @@ static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n) "Objects remaining in %s on __kmem_cache_shutdown()"); } } - spin_unlock_irq(&n->list_lock); + raw_spin_unlock_irq(&n->list_lock); list_for_each_entry_safe(page, h, &discard, slab_list) discard_slab(s, page); @@ -4204,7 +4275,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) for (i = 0; i < SHRINK_PROMOTE_MAX; i++) INIT_LIST_HEAD(promote + i); - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); /* * Build lists of slabs to discard or promote. @@ -4235,7 +4306,7 @@ int __kmem_cache_shrink(struct kmem_cache *s) for (i = SHRINK_PROMOTE_MAX - 1; i >= 0; i--) list_splice(promote + i, &n->partial); - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); /* Release empty slabs */ list_for_each_entry_safe(page, t, &discard, slab_list) @@ -4411,6 +4482,12 @@ void __init kmem_cache_init(void) static __initdata struct kmem_cache boot_kmem_cache, boot_kmem_cache_node; int node; + int cpu; + + for_each_possible_cpu(cpu) { + raw_spin_lock_init(&per_cpu(slub_free_list, cpu).lock); + INIT_LIST_HEAD(&per_cpu(slub_free_list, cpu).list); + } if (debug_guardpage_minorder()) slub_max_order = 0; @@ -4609,7 +4686,7 @@ static int validate_slab_node(struct kmem_cache *s, struct page *page; unsigned long flags; - spin_lock_irqsave(&n->list_lock, flags); + raw_spin_lock_irqsave(&n->list_lock, flags); list_for_each_entry(page, &n->partial, slab_list) { validate_slab(s, page); @@ -4631,7 +4708,7 @@ static int validate_slab_node(struct kmem_cache *s, s->name, count, atomic_long_read(&n->nr_slabs)); out: - spin_unlock_irqrestore(&n->list_lock, flags); + raw_spin_unlock_irqrestore(&n->list_lock, flags); return count; } @@ -4682,6 +4759,9 @@ static int alloc_loc_track(struct loc_track *t, unsigned long max, gfp_t flags) struct location *l; int order; + if (IS_ENABLED(CONFIG_PREEMPT_RT) && flags == GFP_ATOMIC) + return 0; + order = get_order(sizeof(struct location) * max); l = (void *)__get_free_pages(flags, order); @@ -4810,12 +4890,12 @@ static int list_locations(struct kmem_cache *s, char *buf, if (!atomic_long_read(&n->nr_slabs)) continue; - spin_lock_irqsave(&n->list_lock,