diff options
Diffstat (limited to 'sys-kernel/linux-image-redcore-lts')
57 files changed, 55728 insertions, 42 deletions
diff --git a/sys-kernel/linux-image-redcore-lts/Manifest b/sys-kernel/linux-image-redcore-lts/Manifest index ce5c5ca8..89a7e6c1 100644 --- a/sys-kernel/linux-image-redcore-lts/Manifest +++ b/sys-kernel/linux-image-redcore-lts/Manifest @@ -1 +1,2 @@ -DIST linux-4.14.90.tar.xz 101040436 BLAKE2B 09a86e08150b3c01734078e3525a4cef16ee390ea7ba464152d4c886caafe078009a3fae265f626541240a542d7bc43944a102caa2ed067a002d5b687dde6f84 SHA512 df744c138d48f3a711a9abe9908da4cc115091c090f8a9cb13259677572a995361de42710052d6654a28dec0b104ca5c3e9a54dea98a961ec91217cd72dbe58e +DIST linux-4.14.95.tar.xz 101049552 BLAKE2B 48e55bc8b7844d2ede44254b884201f4a239afd45e680ac6c4fc5fae8550eb4a0ad714c4625421ddffea89b09f7841a347e1e032716178bda123abecbecaa28f SHA512 ab1e9c54a852adfa4b0fb451db8cdafecb3cf1adca9dcc0574c2c2a5e7edd9ba77a551ac27a4e16fcf23d89c8f278b04b5c522b7d7b4043ab0309084a3b35a89 +DIST linux-4.19.20.tar.xz 103142620 BLAKE2B a1dbb52aa6727906792741a80b49a26d62ecb40306e8545854967def16875fb1b8d5e09894dd310aa32060155006eecad5fe461c44074a8fecd90d4fc5dc47ce SHA512 1eac44b81c54f34faf782c9d6990703c463206d8c16716c4c3be5c7a7add3a8f4c5695f6191ffdf3a0ffdc549dda5f0ca154e6751fa024d2fae2684cc4e5e182 diff --git a/sys-kernel/linux-image-redcore-lts/files/0001-BFQ-v8r12-20171108.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch index db7d064b..db7d064b 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0001-BFQ-v8r12-20171108.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0001-BFQ-v8r12-20171108.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch index a81dbeac..a81dbeac 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0002-BFQ-v8r12-20180404.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch index 104325d6..104325d6 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0002-BFQ-v8r12-20180404.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0002-BFQ-v8r12-20180404.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0002-Make-preemptible-kernel-default.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0002-Make-preemptible-kernel-default.patch index 69abb373..69abb373 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0002-Make-preemptible-kernel-default.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0002-Make-preemptible-kernel-default.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch index b7897dbe..b7897dbe 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch index 3c182fbe..3c182fbe 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch index 3c889719..3c889719 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0006-Convert-msleep-to-use-hrtimers-when-active.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0006-Convert-msleep-to-use-hrtimers-when-active.patch index 2f065652..2f065652 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0006-Convert-msleep-to-use-hrtimers-when-active.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0006-Convert-msleep-to-use-hrtimers-when-active.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch index ff071da8..ff071da8 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch index f9f274ce..f9f274ce 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch index c910f3df..c910f3df 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch index 260bb98d..260bb98d 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch index 5ac20300..5ac20300 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch index 99b28d65..99b28d65 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch index 63ec9fdf..63ec9fdf 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0014-Swap-sucks.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0014-Swap-sucks.patch index 6bf5bcda..6bf5bcda 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0014-Swap-sucks.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0014-Swap-sucks.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch index bfa509a5..bfa509a5 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0016-unfuck-MuQSS-on-linux-4_14_15+.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch index f7dc1d1c..f7dc1d1c 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0016-unfuck-MuQSS-on-linux-4_14_15+.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0016-unfuck-MuQSS-on-linux-4_14_15+.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/0017-unfuck-MuQSS-on-linux-4_14_75+.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch index 1a1717bf..1a1717bf 100644 --- a/sys-kernel/linux-image-redcore-lts/files/0017-unfuck-MuQSS-on-linux-4_14_75+.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-0017-unfuck-MuQSS-on-linux-4_14_75+.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/Revert-ath10k-activate-user-space-firmware-loading.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-Revert-ath10k-activate-user-space-firmware-loading.patch index 28f9b2f6..28f9b2f6 100644 --- a/sys-kernel/linux-image-redcore-lts/files/Revert-ath10k-activate-user-space-firmware-loading.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-Revert-ath10k-activate-user-space-firmware-loading.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/introduce-NUMA-identity-node-sched-domain.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-introduce-NUMA-identity-node-sched-domain.patch index 2376edae..2376edae 100644 --- a/sys-kernel/linux-image-redcore-lts/files/introduce-NUMA-identity-node-sched-domain.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-introduce-NUMA-identity-node-sched-domain.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/k10temp-add-ZEN-support.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-k10temp-add-ZEN-support.patch index b1e8a9b0..b1e8a9b0 100644 --- a/sys-kernel/linux-image-redcore-lts/files/k10temp-add-ZEN-support.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-k10temp-add-ZEN-support.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-linux-hardened.patch index d07e831b..9280791e 100644 --- a/sys-kernel/linux-image-redcore-lts/files/linux-hardened.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-linux-hardened.patch @@ -1,5 +1,5 @@ diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 5f3d58142600..c5566972d058 100644 +index 7d8b17ce8804..7e4f071c3bf2 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -490,16 +490,6 @@ @@ -19,7 +19,7 @@ index 5f3d58142600..c5566972d058 100644 cio_ignore= [S390] See Documentation/s390/CommonIO for details. clk_ignore_unused -@@ -2981,6 +2971,11 @@ +@@ -2984,6 +2974,11 @@ the specified number of seconds. This is to be used if your oopses keep scrolling off the screen. @@ -71,7 +71,7 @@ index 694968c7523c..002d86416ef8 100644 The value in this file affects behavior of handling NMI. When the diff --git a/Makefile b/Makefile -index 280c7193e246..c869bc294766 100644 +index 70cc37cb3e99..edc3de99b3cd 100644 --- a/Makefile +++ b/Makefile @@ -714,6 +714,9 @@ endif @@ -536,7 +536,7 @@ index 3141e67ec24c..e93173193f60 100644 printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", start, start+size); diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c -index 642357aff216..8bbf93ce3cd2 100644 +index 624edfbff02d..54bb0705dd53 100644 --- a/arch/x86/mm/init_64.c +++ b/arch/x86/mm/init_64.c @@ -65,7 +65,7 @@ @@ -548,7 +548,7 @@ index 642357aff216..8bbf93ce3cd2 100644 EXPORT_SYMBOL_GPL(__supported_pte_mask); int force_personality32; -@@ -1185,7 +1185,7 @@ void __init mem_init(void) +@@ -1179,7 +1179,7 @@ void __init mem_init(void) mem_init_print_info(NULL); } @@ -557,7 +557,7 @@ index 642357aff216..8bbf93ce3cd2 100644 void set_kernel_text_rw(void) { -@@ -1234,9 +1234,8 @@ void mark_rodata_ro(void) +@@ -1228,9 +1228,8 @@ void mark_rodata_ro(void) printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", (end - start) >> 10); @@ -779,7 +779,7 @@ index b811442c5ce6..4f62a63cbcb1 100644 A pseudo terminal (PTY) is a software device consisting of two halves: a master and a slave. The slave device behaves identical to diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c -index 83376caa571b..4aa47ca17268 100644 +index 417b81c67fe9..4e9bb7851ab1 100644 --- a/drivers/tty/tty_io.c +++ b/drivers/tty/tty_io.c @@ -171,6 +171,7 @@ static void free_tty_struct(struct tty_struct *tty) @@ -790,7 +790,7 @@ index 83376caa571b..4aa47ca17268 100644 kfree(tty); } -@@ -2159,11 +2160,19 @@ static int tty_fasync(int fd, struct file *filp, int on) +@@ -2167,11 +2168,19 @@ static int tty_fasync(int fd, struct file *filp, int on) * FIXME: may race normal receive processing */ @@ -810,7 +810,7 @@ index 83376caa571b..4aa47ca17268 100644 if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (get_user(ch, p)) -@@ -2846,6 +2855,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) +@@ -2854,6 +2863,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) tty->index = idx; tty_line_name(driver, idx, tty->name); tty->dev = tty_get_device(tty); @@ -1584,7 +1584,7 @@ index 991af683ef9e..66f66b648707 100644 if (err) return err; diff --git a/kernel/fork.c b/kernel/fork.c -index 6a219fea4926..013703c020f6 100644 +index 6d6ce2c3a364..951a76b3dc32 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -102,6 +102,11 @@ @@ -1610,7 +1610,7 @@ index 6a219fea4926..013703c020f6 100644 /* * Thread groups must share signals as well, and detached threads * can only be started up within the thread group. -@@ -2348,6 +2357,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) +@@ -2357,6 +2366,12 @@ SYSCALL_DEFINE1(unshare, unsigned long, unshare_flags) if (unshare_flags & CLONE_NEWNS) unshare_flags |= CLONE_FS; @@ -1672,10 +1672,10 @@ index 710ce1d6b982..4013b634e820 100644 struct rcu_state *rsp; diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 7240bb4a4090..9adcec5bcbd9 100644 +index f33b24080b1c..99c5e423906f 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c -@@ -9005,7 +9005,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } +@@ -8982,7 +8982,7 @@ static void nohz_idle_balance(struct rq *this_rq, enum cpu_idle_type idle) { } * run_rebalance_domains is triggered when needed from the scheduler tick. * Also triggered for nohz idle balancing (with nohz_balancing_kick set). */ @@ -2581,7 +2581,7 @@ index f48fe6fc7e8c..d78c52835c08 100644 Normal TCP/IP networking is open to an attack known as "SYN flooding". This denial-of-service attack prevents legitimate remote diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c -index 957f6041dd79..7be404c9fb47 100644 +index 18bc8738e989..d2866f6dd736 100644 --- a/scripts/mod/modpost.c +++ b/scripts/mod/modpost.c @@ -37,6 +37,7 @@ static int vmlinux_section_warnings = 1; diff --git a/sys-kernel/linux-image-redcore-lts/files/mute-pps_state_mismatch.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-mute-pps_state_mismatch.patch index dc1d254b..dc1d254b 100644 --- a/sys-kernel/linux-image-redcore-lts/files/mute-pps_state_mismatch.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-mute-pps_state_mismatch.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/redcore-lts-amd64.config b/sys-kernel/linux-image-redcore-lts/files/4.14-redcore-lts-amd64.config index 23e35863..23e35863 100644 --- a/sys-kernel/linux-image-redcore-lts/files/redcore-lts-amd64.config +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-redcore-lts-amd64.config diff --git a/sys-kernel/linux-image-redcore-lts/files/restore-SD_PREFER_SIBLING-on-MC-domains.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-restore-SD_PREFER_SIBLING-on-MC-domains.patch index b6be46cc..b6be46cc 100644 --- a/sys-kernel/linux-image-redcore-lts/files/restore-SD_PREFER_SIBLING-on-MC-domains.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-restore-SD_PREFER_SIBLING-on-MC-domains.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/uksm-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/4.14-uksm-linux-hardened.patch index f0596117..f0596117 100644 --- a/sys-kernel/linux-image-redcore-lts/files/uksm-linux-hardened.patch +++ b/sys-kernel/linux-image-redcore-lts/files/4.14-uksm-linux-hardened.patch diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch new file mode 100644 index 00000000..ee298f6a --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch @@ -0,0 +1,10305 @@ +diff -Nur a/arch/powerpc/platforms/cell/spufs/sched.c b/arch/powerpc/platforms/cell/spufs/sched.c +--- a/arch/powerpc/platforms/cell/spufs/sched.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/arch/powerpc/platforms/cell/spufs/sched.c 2019-02-09 17:46:11.991297545 +0000 +@@ -65,11 +65,6 @@ + static struct timer_list spuloadavg_timer; + + /* +- * Priority of a normal, non-rt, non-niced'd process (aka nice level 0). +- */ +-#define NORMAL_PRIO 120 +- +-/* + * Frequency of the spu scheduler tick. By default we do one SPU scheduler + * tick for every 10 CPU scheduler ticks. + */ +diff -Nur a/arch/x86/Kconfig b/arch/x86/Kconfig +--- a/arch/x86/Kconfig 2019-02-09 17:20:30.461820549 +0000 ++++ b/arch/x86/Kconfig 2019-02-09 17:51:10.780941815 +0000 +@@ -1003,6 +1003,20 @@ + config SCHED_SMT + def_bool y if SMP + ++config SMT_NICE ++ bool "SMT (Hyperthreading) aware nice priority and policy support" ++ depends on SCHED_MUQSS && SCHED_SMT ++ default y ++ ---help--- ++ Enabling Hyperthreading on Intel CPUs decreases the effectiveness ++ of the use of 'nice' levels and different scheduling policies ++ (e.g. realtime) due to sharing of CPU power between hyperthreads. ++ SMT nice support makes each logical CPU aware of what is running on ++ its hyperthread siblings, maintaining appropriate distribution of ++ CPU according to nice levels and scheduling policies at the expense ++ of slightly increased overhead. ++ If unsure say Y here. ++ + config SCHED_MC + def_bool y + prompt "Multi-core scheduler support" +@@ -1033,6 +1047,80 @@ + + If unsure say Y here. + ++choice ++ prompt "CPU scheduler runqueue sharing" ++ default RQ_MC if SCHED_MUQSS ++ default RQ_NONE ++ ++config RQ_NONE ++ bool "No sharing" ++ help ++ This is the default behaviour where the CPU scheduler has one runqueue ++ per CPU, whether it is a physical or logical CPU (hyperthread). ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=none ++ ++ If unsure, say N. ++ ++config RQ_SMT ++ bool "SMT (hyperthread) siblings" ++ depends on SCHED_SMT && SCHED_MUQSS ++ ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by SMT (hyperthread) siblings. As these logical cores share ++ one physical core, sharing the runqueue resource can lead to decreased ++ overhead, lower latency and higher throughput. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smt ++ ++ If unsure, say N. ++ ++config RQ_MC ++ bool "Multicore siblings" ++ depends on SCHED_MC && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by multicore siblings in addition to any SMT siblings. ++ As these physical cores share caches, sharing the runqueue resource ++ will lead to lower latency, but its effects on overhead and throughput ++ are less predictable. As a general rule, 6 or fewer cores will likely ++ benefit from this, while larger CPUs will only derive a latency ++ benefit. If your workloads are primarily single threaded, this will ++ possibly worsen throughput. If you are only concerned about latency ++ then enable this regardless of how many cores you have. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=mc ++ ++ If unsure, say Y. ++ ++config RQ_SMP ++ bool "Symmetric Multi-Processing" ++ depends on SMP && SCHED_MUQSS ++ help ++ With this option enabled, the CPU scheduler will have one runqueue ++ shared by all physical CPUs unless they are on separate NUMA nodes. ++ As physical CPUs usually do not share resources, sharing the runqueue ++ will normally worsen throughput but improve latency. If you only ++ care about latency enable this. ++ ++ This can still be enabled runtime with the boot parameter ++ rqshare=smp ++ ++ If unsure, say N. ++endchoice ++ ++config SHARERQ ++ int ++ default 0 if RQ_NONE ++ default 1 if RQ_SMT ++ default 2 if RQ_MC ++ default 3 if RQ_SMP ++ ++ + config UP_LATE_INIT + def_bool y + depends on !SMP && X86_LOCAL_APIC +diff -Nur a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +--- a/Documentation/admin-guide/kernel-parameters.txt 2019-02-09 17:20:30.451820228 +0000 ++++ b/Documentation/admin-guide/kernel-parameters.txt 2019-02-09 17:46:11.981297222 +0000 +@@ -4003,6 +4003,14 @@ + Memory area to be used by remote processor image, + managed by CMA. + ++ rqshare= [X86] Select the MuQSS scheduler runqueue sharing type. ++ Format: <string> ++ smt -- Share SMT (hyperthread) sibling runqueues ++ mc -- Share MC (multicore) sibling runqueues ++ smp -- Share SMP runqueues ++ none -- So not share any runqueues ++ Default value is mc ++ + rw [KNL] Mount root device read-write on boot + + S [KNL] Run init in single mode +diff -Nur a/Documentation/scheduler/sched-BFS.txt b/Documentation/scheduler/sched-BFS.txt +--- a/Documentation/scheduler/sched-BFS.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/scheduler/sched-BFS.txt 2019-02-09 17:46:11.981297222 +0000 +@@ -0,0 +1,351 @@ ++BFS - The Brain Fuck Scheduler by Con Kolivas. ++ ++Goals. ++ ++The goal of the Brain Fuck Scheduler, referred to as BFS from here on, is to ++completely do away with the complex designs of the past for the cpu process ++scheduler and instead implement one that is very simple in basic design. ++The main focus of BFS is to achieve excellent desktop interactivity and ++responsiveness without heuristics and tuning knobs that are difficult to ++understand, impossible to model and predict the effect of, and when tuned to ++one workload cause massive detriment to another. ++ ++ ++Design summary. ++ ++BFS is best described as a single runqueue, O(n) lookup, earliest effective ++virtual deadline first design, loosely based on EEVDF (earliest eligible virtual ++deadline first) and my previous Staircase Deadline scheduler. Each component ++shall be described in order to understand the significance of, and reasoning for ++it. The codebase when the first stable version was released was approximately ++9000 lines less code than the existing mainline linux kernel scheduler (in ++2.6.31). This does not even take into account the removal of documentation and ++the cgroups code that is not used. ++ ++Design reasoning. ++ ++The single runqueue refers to the queued but not running processes for the ++entire system, regardless of the number of CPUs. The reason for going back to ++a single runqueue design is that once multiple runqueues are introduced, ++per-CPU or otherwise, there will be complex interactions as each runqueue will ++be responsible for the scheduling latency and fairness of the tasks only on its ++own runqueue, and to achieve fairness and low latency across multiple CPUs, any ++advantage in throughput of having CPU local tasks causes other disadvantages. ++This is due to requiring a very complex balancing system to at best achieve some ++semblance of fairness across CPUs and can only maintain relatively low latency ++for tasks bound to the same CPUs, not across them. To increase said fairness ++and latency across CPUs, the advantage of local runqueue locking, which makes ++for better scalability, is lost due to having to grab multiple locks. ++ ++A significant feature of BFS is that all accounting is done purely based on CPU ++used and nowhere is sleep time used in any way to determine entitlement or ++interactivity. Interactivity "estimators" that use some kind of sleep/run ++algorithm are doomed to fail to detect all interactive tasks, and to falsely tag ++tasks that aren't interactive as being so. The reason for this is that it is ++close to impossible to determine that when a task is sleeping, whether it is ++doing it voluntarily, as in a userspace application waiting for input in the ++form of a mouse click or otherwise, or involuntarily, because it is waiting for ++another thread, process, I/O, kernel activity or whatever. Thus, such an ++estimator will introduce corner cases, and more heuristics will be required to ++cope with those corner cases, introducing more corner cases and failed ++interactivity detection and so on. Interactivity in BFS is built into the design ++by virtue of the fact that tasks that are waking up have not used up their quota ++of CPU time, and have earlier effective deadlines, thereby making it very likely ++they will preempt any CPU bound task of equivalent nice level. See below for ++more information on the virtual deadline mechanism. Even if they do not preempt ++a running task, because the rr interval is guaranteed to have a bound upper ++limit on how long a task will wait for, it will be scheduled within a timeframe ++that will not cause visible interface jitter. ++ ++ ++Design details. ++ ++Task insertion. ++ ++BFS inserts tasks into each relevant queue as an O(1) insertion into a double ++linked list. On insertion, *every* running queue is checked to see if the newly ++queued task can run on any idle queue, or preempt the lowest running task on the ++system. This is how the cross-CPU scheduling of BFS achieves significantly lower ++latency per extra CPU the system has. In this case the lookup is, in the worst ++case scenario, O(n) where n is the number of CPUs on the system. ++ ++Data protection. ++ ++BFS has one single lock protecting the process local data of every task in the ++global queue. Thus every insertion, removal and modification of task data in the ++global runqueue needs to grab the global lock. However, once a task is taken by ++a CPU, the CPU has its own local data copy of the running process' accounting ++information which only that CPU accesses and modifies (such as during a ++timer tick) thus allowing the accounting data to be updated lockless. Once a ++CPU has taken a task to run, it removes it from the global queue. Thus the ++global queue only ever has, at most, ++ ++ (number of tasks requesting cpu time) - (number of logical CPUs) + 1 ++ ++tasks in the global queue. This value is relevant for the time taken to look up ++tasks during scheduling. This will increase if many tasks with CPU affinity set ++in their policy to limit which CPUs they're allowed to run on if they outnumber ++the number of CPUs. The +1 is because when rescheduling a task, the CPU's ++currently running task is put back on the queue. Lookup will be described after ++the virtual deadline mechanism is explained. ++ ++Virtual deadline. ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in BFS is entirely in the virtual deadline mechanism. The one ++tunable in BFS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in jiffies by this equation: ++ ++ jiffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. Once a task is descheduled, it is put back on the queue, and an ++O(n) lookup of all queued-but-not-running tasks is done to determine which has ++the earliest deadline and that task is chosen to receive CPU next. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (jiffies) is ++constantly moving. ++ ++Task lookup. ++ ++BFS has 103 priority queues. 100 of these are dedicated to the static priority ++of realtime tasks, and the remaining 3 are, in order of best to worst priority, ++SCHED_ISO (isochronous), SCHED_NORMAL, and SCHED_IDLEPRIO (idle priority ++scheduling). When a task of these priorities is queued, a bitmap of running ++priorities is set showing which of these priorities has tasks waiting for CPU ++time. When a CPU is made to reschedule, the lookup for the next task to get ++CPU time is performed in the following way: ++ ++First the bitmap is checked to see what static priority tasks are queued. If ++any realtime priorities are found, the corresponding queue is checked and the ++first task listed there is taken (provided CPU affinity is suitable) and lookup ++is complete. If the priority corresponds to a SCHED_ISO task, they are also ++taken in FIFO order (as they behave like SCHED_RR). If the priority corresponds ++to either SCHED_NORMAL or SCHED_IDLEPRIO, then the lookup becomes O(n). At this ++stage, every task in the runlist that corresponds to that priority is checked ++to see which has the earliest set deadline, and (provided it has suitable CPU ++affinity) it is taken off the runqueue and given the CPU. If a task has an ++expired deadline, it is taken and the rest of the lookup aborted (as they are ++chosen in FIFO order). ++ ++Thus, the lookup is O(n) in the worst case only, where n is as described ++earlier, as tasks may be chosen before the whole task list is looked over. ++ ++ ++Scalability. ++ ++The major limitations of BFS will be that of scalability, as the separate ++runqueue designs will have less lock contention as the number of CPUs rises. ++However they do not scale linearly even with separate runqueues as multiple ++runqueues will need to be locked concurrently on such designs to be able to ++achieve fair CPU balancing, to try and achieve some sort of nice-level fairness ++across CPUs, and to achieve low enough latency for tasks on a busy CPU when ++other CPUs would be more suited. BFS has the advantage that it requires no ++balancing algorithm whatsoever, as balancing occurs by proxy simply because ++all CPUs draw off the global runqueue, in priority and deadline order. Despite ++the fact that scalability is _not_ the prime concern of BFS, it both shows very ++good scalability to smaller numbers of CPUs and is likely a more scalable design ++at these numbers of CPUs. ++ ++It also has some very low overhead scalability features built into the design ++when it has been deemed their overhead is so marginal that they're worth adding. ++The first is the local copy of the running process' data to the CPU it's running ++on to allow that data to be updated lockless where possible. Then there is ++deference paid to the last CPU a task was running on, by trying that CPU first ++when looking for an idle CPU to use the next time it's scheduled. Finally there ++is the notion of cache locality beyond the last running CPU. The sched_domains ++information is used to determine the relative virtual "cache distance" that ++other CPUs have from the last CPU a task was running on. CPUs with shared ++caches, such as SMT siblings, or multicore CPUs with shared caches, are treated ++as cache local. CPUs without shared caches are treated as not cache local, and ++CPUs on different NUMA nodes are treated as very distant. This "relative cache ++distance" is used by modifying the virtual deadline value when doing lookups. ++Effectively, the deadline is unaltered between "cache local" CPUs, doubled for ++"cache distant" CPUs, and quadrupled for "very distant" CPUs. The reasoning ++behind the doubling of deadlines is as follows. The real cost of migrating a ++task from one CPU to another is entirely dependant on the cache footprint of ++the task, how cache intensive the task is, how long it's been running on that ++CPU to take up the bulk of its cache, how big the CPU cache is, how fast and ++how layered the CPU cache is, how fast a context switch is... and so on. In ++other words, it's close to random in the real world where we do more than just ++one sole workload. The only thing we can be sure of is that it's not free. So ++BFS uses the principle that an idle CPU is a wasted CPU and utilising idle CPUs ++is more important than cache locality, and cache locality only plays a part ++after that. Doubling the effective deadline is based on the premise that the ++"cache local" CPUs will tend to work on the same tasks up to double the number ++of cache local CPUs, and once the workload is beyond that amount, it is likely ++that none of the tasks are cache warm anywhere anyway. The quadrupling for NUMA ++is a value I pulled out of my arse. ++ ++When choosing an idle CPU for a waking task, the cache locality is determined ++according to where the task last ran and then idle CPUs are ranked from best ++to worst to choose the most suitable idle CPU based on cache locality, NUMA ++node locality and hyperthread sibling business. They are chosen in the ++following preference (if idle): ++ ++* Same core, idle or busy cache, idle threads ++* Other core, same cache, idle or busy cache, idle threads. ++* Same node, other CPU, idle cache, idle threads. ++* Same node, other CPU, busy cache, idle threads. ++* Same core, busy threads. ++* Other core, same cache, busy threads. ++* Same node, other CPU, busy threads. ++* Other node, other CPU, idle cache, idle threads. ++* Other node, other CPU, busy cache, idle threads. ++* Other node, other CPU, busy threads. ++ ++This shows the SMT or "hyperthread" awareness in the design as well which will ++choose a real idle core first before a logical SMT sibling which already has ++tasks on the physical CPU. ++ ++Early benchmarking of BFS suggested scalability dropped off at the 16 CPU mark. ++However this benchmarking was performed on an earlier design that was far less ++scalable than the current one so it's hard to know how scalable it is in terms ++of both CPUs (due to the global runqueue) and heavily loaded machines (due to ++O(n) lookup) at this stage. Note that in terms of scalability, the number of ++_logical_ CPUs matters, not the number of _physical_ CPUs. Thus, a dual (2x) ++quad core (4X) hyperthreaded (2X) machine is effectively a 16X. Newer benchmark ++results are very promising indeed, without needing to tweak any knobs, features ++or options. Benchmark contributions are most welcome. ++ ++ ++Features ++ ++As the initial prime target audience for BFS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are precisely 2 tunables, and 2 extra scheduling policies. The rr_interval ++and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO policies. In addition ++to this, BFS also uses sub-tick accounting. What BFS does _not_ now feature is ++support for CGROUPS. The average user should neither need to know what these ++are, nor should they need to be using them to have good desktop behaviour. ++ ++rr_interval ++ ++There is only one "scheduler" tunable, the round robin interval. This can be ++accessed in ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6 on a ++uniprocessor machine, and automatically set to a progressively higher value on ++multiprocessor machines. The reasoning behind increasing the value on more CPUs ++is that the effective latency is decreased by virtue of there being more CPUs on ++BFS (for reasons explained above), and increasing the value allows for less ++cache contention and more throughput. Valid values are from 1 to 1000 ++Decreasing the value will decrease latencies at the cost of decreasing ++throughput, while increasing it will improve throughput, but at the cost of ++worsening latencies. The accuracy of the rr interval is limited by HZ resolution ++of the kernel configuration. Thus, the worst case latencies are usually slightly ++higher than this actual value. The default value of 6 is not an arbitrary one. ++It is based on the fact that humans can detect jitter at approximately 7ms, so ++aiming for much lower latencies is pointless under most circumstances. It is ++worth noting this fact when comparing the latency performance of BFS to other ++schedulers. Worst case latencies being higher than 7ms are far worse than ++average latencies not being in the microsecond range. ++ ++Isochronous scheduling. ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of _total CPU_ available across the machine, configurable ++as a percentage in the following "resource handling" tunable (as opposed to a ++scheduler tunable): ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of BFS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++Because some applications constantly set their policy as well as their nice ++level, there is potential for them to undo the override specified by the user ++on the command line of setting the policy to SCHED_ISO. To counter this, once ++a task has been set to SCHED_ISO policy, it needs superuser privileges to set ++it back to SCHED_NORMAL. This will ensure the task remains ISO and all child ++processes and threads will also inherit the ISO policy. ++ ++Idleprio scheduling. ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start ++a video encode or so on without any slowdown of other tasks. To avoid this ++policy from grabbing shared resources and holding them indefinitely, if it ++detects a state where the task is waiting on I/O, the machine is about to ++suspend to ram and so on, it will transiently schedule them as SCHED_NORMAL. As ++per the Isochronous task management, once a task has been scheduled as IDLEPRIO, ++it cannot be put back to SCHED_NORMAL without superuser privileges. Tasks can ++be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++ schedtool -D -e ./mprime ++ ++Subtick accounting. ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the ++timer tick frequency (HZ) is lowered. It is possible to create an application ++which uses almost 100% CPU, yet by being descheduled at the right time, records ++zero CPU usage. While the main problem with this is that there are possible ++security implications, it is also difficult to determine how much CPU a task ++really does use. BFS tries to use the sub-tick accounting from the TSC clock, ++where possible, to determine real CPU usage. This is not entirely reliable, but ++is far more likely to produce accurate CPU usage data than the existing designs ++and will not show tasks as consuming no CPU usage when they actually are. Thus, ++the amount of CPU reported as being used by BFS will more accurately represent ++how much CPU the task itself is using (as is shown for example by the 'time' ++application), so the reported values may be quite different to other schedulers. ++Values reported as the 'load' are more prone to problems with this design, but ++per process values are closer to real usage. When comparing throughput of BFS ++to other designs, it is important to compare the actual completed work in terms ++of total wall clock time taken and total work done, rather than the reported ++"cpu usage". ++ ++ ++Con Kolivas <kernel@kolivas.org> Fri Aug 27 2010 +diff -Nur a/Documentation/scheduler/sched-MuQSS.txt b/Documentation/scheduler/sched-MuQSS.txt +--- a/Documentation/scheduler/sched-MuQSS.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/scheduler/sched-MuQSS.txt 2019-02-09 17:46:11.991297545 +0000 +@@ -0,0 +1,373 @@ ++MuQSS - The Multiple Queue Skiplist Scheduler by Con Kolivas. ++ ++MuQSS is a per-cpu runqueue variant of the original BFS scheduler with ++one 8 level skiplist per runqueue, and fine grained locking for much more ++scalability. ++ ++ ++Goals. ++ ++The goal of the Multiple Queue Skiplist Scheduler, referred to as MuQSS from ++here on (pronounced mux) is to completely do away with the complex designs of ++the past for the cpu process scheduler and instead implement one that is very ++simple in basic design. The main focus of MuQSS is to achieve excellent desktop ++interactivity and responsiveness without heuristics and tuning knobs that are ++difficult to understand, impossible to model and predict the effect of, and when ++tuned to one workload cause massive detriment to another, while still being ++scalable to many CPUs and processes. ++ ++ ++Design summary. ++ ++MuQSS is best described as per-cpu multiple runqueue, O(log n) insertion, O(1) ++lookup, earliest effective virtual deadline first tickless design, loosely based ++on EEVDF (earliest eligible virtual deadline first) and my previous Staircase ++Deadline scheduler, and evolved from the single runqueue O(n) BFS scheduler. ++Each component shall be described in order to understand the significance of, ++and reasoning for it. ++ ++ ++Design reasoning. ++ ++In BFS, the use of a single runqueue across all CPUs meant that each CPU would ++need to scan the entire runqueue looking for the process with the earliest ++deadline and schedule that next, regardless of which CPU it originally came ++from. This made BFS deterministic with respect to latency and provided ++guaranteed latencies dependent on number of processes and CPUs. The single ++runqueue, however, meant that all CPUs would compete for the single lock ++protecting it, which would lead to increasing lock contention as the number of ++CPUs rose and appeared to limit scalability of common workloads beyond 16 ++logical CPUs. Additionally, the O(n) lookup of the runqueue list obviously ++increased overhead proportionate to the number of queued proecesses and led to ++cache thrashing while iterating over the linked list. ++ ++MuQSS is an evolution of BFS, designed to maintain the same scheduling ++decision mechanism and be virtually deterministic without relying on the ++constrained design of the single runqueue by splitting out the single runqueue ++to be per-CPU and use skiplists instead of linked lists. ++ ++The original reason for going back to a single runqueue design for BFS was that ++once multiple runqueues are introduced, per-CPU or otherwise, there will be ++complex interactions as each runqueue will be responsible for the scheduling ++latency and fairness of the tasks only on its own runqueue, and to achieve ++fairness and low latency across multiple CPUs, any advantage in throughput of ++having CPU local tasks causes other disadvantages. This is due to requiring a ++very complex balancing system to at best achieve some semblance of fairness ++across CPUs and can only maintain relatively low latency for tasks bound to the ++same CPUs, not across them. To increase said fairness and latency across CPUs, ++the advantage of local runqueue locking, which makes for better scalability, is ++lost due to having to grab multiple locks. ++ ++MuQSS works around the problems inherent in multiple runqueue designs by ++making its skip lists priority ordered and through novel use of lockless ++examination of each other runqueue it can decide if it should take the earliest ++deadline task from another runqueue for latency reasons, or for CPU balancing ++reasons. It still does not have a balancing system, choosing to allow the ++next task scheduling decision and task wakeup CPU choice to allow balancing to ++happen by virtue of its choices. ++ ++As a further evolution of the design, MuQSS normally configures sharing of ++runqueues in a logical fashion for when CPU resources are shared for improved ++latency and throughput. By default it shares runqueues and locks between ++multicore siblings. Optionally it can be configured to run with sharing of ++SMT siblings only, all SMP packages or no sharing at all. Additionally it can ++be selected at boot time. ++ ++ ++Design details. ++ ++Custom skip list implementation: ++ ++To avoid the overhead of building up and tearing down skip list structures, ++the variant used by MuQSS has a number of optimisations making it specific for ++its use case in the scheduler. It uses static arrays of 8 'levels' instead of ++building up and tearing down structures dynamically. This makes each runqueue ++only scale O(log N) up to 64k tasks. However as there is one runqueue per CPU ++it means that it scales O(log N) up to 64k x number of logical CPUs which is ++far beyond the realistic task limits each CPU could handle. By being 8 levels ++it also makes the array exactly one cacheline in size. Additionally, each ++skip list node is bidirectional making insertion and removal amortised O(1), ++being O(k) where k is 1-8. Uniquely, we are only ever interested in the very ++first entry in each list at all times with MuQSS, so there is never a need to ++do a search and thus look up is always O(1). In interactive mode, the queues ++will be searched beyond their first entry if the first task is not suitable ++for affinity or SMT nice reasons. ++ ++Task insertion: ++ ++MuQSS inserts tasks into a per CPU runqueue as an O(log N) insertion into ++a custom skip list as described above (based on the original design by William ++Pugh). Insertion is ordered in such a way that there is never a need to do a ++search by ordering tasks according to static priority primarily, and then ++virtual deadline at the time of insertion. ++ ++Niffies: ++ ++Niffies are a monotonic forward moving timer not unlike the "jiffies" but are ++of nanosecond resolution. Niffies are calculated per-runqueue from the high ++resolution TSC timers, and in order to maintain fairness are synchronised ++between CPUs whenever both runqueues are locked concurrently. ++ ++Virtual deadline: ++ ++The key to achieving low latency, scheduling fairness, and "nice level" ++distribution in MuQSS is entirely in the virtual deadline mechanism. The one ++tunable in MuQSS is the rr_interval, or "round robin interval". This is the ++maximum time two SCHED_OTHER (or SCHED_NORMAL, the common scheduling policy) ++tasks of the same nice level will be running for, or looking at it the other ++way around, the longest duration two tasks of the same nice level will be ++delayed for. When a task requests cpu time, it is given a quota (time_slice) ++equal to the rr_interval and a virtual deadline. The virtual deadline is ++offset from the current time in niffies by this equation: ++ ++ niffies + (prio_ratio * rr_interval) ++ ++The prio_ratio is determined as a ratio compared to the baseline of nice -20 ++and increases by 10% per nice level. The deadline is a virtual one only in that ++no guarantee is placed that a task will actually be scheduled by this time, but ++it is used to compare which task should go next. There are three components to ++how a task is next chosen. First is time_slice expiration. If a task runs out ++of its time_slice, it is descheduled, the time_slice is refilled, and the ++deadline reset to that formula above. Second is sleep, where a task no longer ++is requesting CPU for whatever reason. The time_slice and deadline are _not_ ++adjusted in this case and are just carried over for when the task is next ++scheduled. Third is preemption, and that is when a newly waking task is deemed ++higher priority than a currently running task on any cpu by virtue of the fact ++that it has an earlier virtual deadline than the currently running task. The ++earlier deadline is the key to which task is next chosen for the first and ++second cases. ++ ++The CPU proportion of different nice tasks works out to be approximately the ++ ++ (prio_ratio difference)^2 ++ ++The reason it is squared is that a task's deadline does not change while it is ++running unless it runs out of time_slice. Thus, even if the time actually ++passes the deadline of another task that is queued, it will not get CPU time ++unless the current running task deschedules, and the time "base" (niffies) is ++constantly moving. ++ ++Task lookup: ++ ++As tasks are already pre-ordered according to anticipated scheduling order in ++the skip lists, lookup for the next suitable task per-runqueue is always a ++matter of simply selecting the first task in the 0th level skip list entry. ++In order to maintain optimal latency and fairness across CPUs, MuQSS does a ++novel examination of every other runqueue in cache locality order, choosing the ++best task across all runqueues. This provides near-determinism of how long any ++task across the entire system may wait before receiving CPU time. The other ++runqueues are first examine lockless and then trylocked to minimise the ++potential lock contention if they are likely to have a suitable better task. ++Each other runqueue lock is only held for as long as it takes to examine the ++entry for suitability. In "interactive" mode, the default setting, MuQSS will ++look for the best deadline task across all CPUs, while in !interactive mode, ++it will only select a better deadline task from another CPU if it is more ++heavily laden than the current one. ++ ++Lookup is therefore O(k) where k is number of CPUs. ++ ++ ++Latency. ++ ++Through the use of virtual deadlines to govern the scheduling order of normal ++tasks, queue-to-activation latency per runqueue is guaranteed to be bound by ++the rr_interval tunable which is set to 6ms by default. This means that the ++longest a CPU bound task will wait for more CPU is proportional to the number ++of running tasks and in the common case of 0-2 running tasks per CPU, will be ++under the 7ms threshold for human perception of jitter. Additionally, as newly ++woken tasks will have an early deadline from their previous runtime, the very ++tasks that are usually latency sensitive will have the shortest interval for ++activation, usually preempting any existing CPU bound tasks. ++ ++Tickless expiry: ++ ++A feature of MuQSS is that it is not tied to the resolution of the chosen tick ++rate in Hz, instead depending entirely on the high resolution timers where ++possible for sub-millisecond accuracy on timeouts regarless of the underlying ++tick rate. This allows MuQSS to be run with the low overhead of low Hz rates ++such as 100 by default, benefiting from the improved throughput and lower ++power usage it provides. Another advantage of this approach is that in ++combination with the Full No HZ option, which disables ticks on running task ++CPUs instead of just idle CPUs, the tick can be disabled at all times ++regardless of how many tasks are running instead of being limited to just one ++running task. Note that this option is NOT recommended for regular desktop ++users. ++ ++ ++Scalability and balancing. ++ ++Unlike traditional approaches where balancing is a combination of CPU selection ++at task wakeup and intermittent balancing based on a vast array of rules set ++according to architecture, busyness calculations and special case management, ++MuQSS indirectly balances on the fly at task wakeup and next task selection. ++During initialisation, MuQSS creates a cache coherency ordered list of CPUs for ++each logical CPU and uses this to aid task/CPU selection when CPUs are busy. ++Additionally it selects any idle CPUs, if they are available, at any time over ++busy CPUs according to the following preference: ++ ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ ++Mux is therefore SMT, MC and Numa aware without the need for extra ++intermittent balancing to maintain CPUs busy and make the most of cache ++coherency. ++ ++ ++Features ++ ++As the initial prime target audience for MuQSS was the average desktop user, it ++was designed to not need tweaking, tuning or have features set to obtain benefit ++from it. Thus the number of knobs and features has been kept to an absolute ++minimum and should not require extra user input for the vast majority of cases. ++There are 3 optional tunables, and 2 extra scheduling policies. The rr_interval, ++interactive, and iso_cpu tunables, and the SCHED_ISO and SCHED_IDLEPRIO ++policies. In addition to this, MuQSS also uses sub-tick accounting. What MuQSS ++does _not_ now feature is support for CGROUPS. The average user should neither ++need to know what these are, nor should they need to be using them to have good ++desktop behaviour. However since some applications refuse to work without ++cgroups, one can enable them with MuQSS as a stub and the filesystem will be ++created which will allow the applications to work. ++ ++rr_interval: ++ ++ /proc/sys/kernel/rr_interval ++ ++The value is in milliseconds, and the default value is set to 6. Valid values ++are from 1 to 1000 Decreasing the value will decrease latencies at the cost of ++decreasing throughput, while increasing it will improve throughput, but at the ++cost of worsening latencies. It is based on the fact that humans can detect ++jitter at approximately 7ms, so aiming for much lower latencies is pointless ++under most circumstances. It is worth noting this fact when comparing the ++latency performance of MuQSS to other schedulers. Worst case latencies being ++higher than 7ms are far worse than average latencies not being in the ++microsecond range. ++ ++interactive: ++ ++ /proc/sys/kernel/interactive ++ ++The value is a simple boolean of 1 for on and 0 for off and is set to on by ++default. Disabling this will disable the near-determinism of MuQSS when ++selecting the next task by not examining all CPUs for the earliest deadline ++task, or which CPU to wake to, instead prioritising CPU balancing for improved ++throughput. Latency will still be bound by rr_interval, but on a per-CPU basis ++instead of across the whole system. ++ ++Runqueue sharing. ++ ++By default MuQSS chooses to share runqueue resources (specifically the skip ++list and locking) between multicore siblings. It is configurable at build time ++to select between None, SMT, MC and SMP, corresponding to no sharing, sharing ++only between simultaneous mulithreading siblings, multicore siblings, or ++symmetric multiprocessing physical packages. Additionally it can be se at ++bootime with the use of the rqshare parameter. The reason for configurability ++is that some architectures have CPUs with many multicore siblings (>= 16) ++where it may be detrimental to throughput to share runqueues and another ++sharing option may be desirable. Additionally, more sharing than usual can ++improve latency on a system-wide level at the expense of throughput if desired. ++ ++The options are: ++none, smt, mc, smp ++ ++eg: ++ rqshare=mc ++ ++Isochronous scheduling: ++ ++Isochronous scheduling is a unique scheduling policy designed to provide ++near-real-time performance to unprivileged (ie non-root) users without the ++ability to starve the machine indefinitely. Isochronous tasks (which means ++"same time") are set using, for example, the schedtool application like so: ++ ++ schedtool -I -e amarok ++ ++This will start the audio application "amarok" as SCHED_ISO. How SCHED_ISO works ++is that it has a priority level between true realtime tasks and SCHED_NORMAL ++which would allow them to preempt all normal tasks, in a SCHED_RR fashion (ie, ++if multiple SCHED_ISO tasks are running, they purely round robin at rr_interval ++rate). However if ISO tasks run for more than a tunable finite amount of time, ++they are then demoted back to SCHED_NORMAL scheduling. This finite amount of ++time is the percentage of CPU available per CPU, configurable as a percentage in ++the following "resource handling" tunable (as opposed to a scheduler tunable): ++ ++iso_cpu: ++ ++ /proc/sys/kernel/iso_cpu ++ ++and is set to 70% by default. It is calculated over a rolling 5 second average ++Because it is the total CPU available, it means that on a multi CPU machine, it ++is possible to have an ISO task running as realtime scheduling indefinitely on ++just one CPU, as the other CPUs will be available. Setting this to 100 is the ++equivalent of giving all users SCHED_RR access and setting it to 0 removes the ++ability to run any pseudo-realtime tasks. ++ ++A feature of MuQSS is that it detects when an application tries to obtain a ++realtime policy (SCHED_RR or SCHED_FIFO) and the caller does not have the ++appropriate privileges to use those policies. When it detects this, it will ++give the task SCHED_ISO policy instead. Thus it is transparent to the user. ++ ++ ++Idleprio scheduling: ++ ++Idleprio scheduling is a scheduling policy designed to give out CPU to a task ++_only_ when the CPU would be otherwise idle. The idea behind this is to allow ++ultra low priority tasks to be run in the background that have virtually no ++effect on the foreground tasks. This is ideally suited to distributed computing ++clients (like setiathome, folding, mprime etc) but can also be used to start a ++video encode or so on without any slowdown of other tasks. To avoid this policy ++from grabbing shared resources and holding them indefinitely, if it detects a ++state where the task is waiting on I/O, the machine is about to suspend to ram ++and so on, it will transiently schedule them as SCHED_NORMAL. Once a task has ++been scheduled as IDLEPRIO, it cannot be put back to SCHED_NORMAL without ++superuser privileges since it is effectively a lower scheduling policy. Tasks ++can be set to start as SCHED_IDLEPRIO with the schedtool command like so: ++ ++schedtool -D -e ./mprime ++ ++Subtick accounting: ++ ++It is surprisingly difficult to get accurate CPU accounting, and in many cases, ++the accounting is done by simply determining what is happening at the precise ++moment a timer tick fires off. This becomes increasingly inaccurate as the timer ++tick frequency (HZ) is lowered. It is possible to create an application which ++uses almost 100% CPU, yet by being descheduled at the right time, records zero ++CPU usage. While the main problem with this is that there are possible security ++implications, it is also difficult to determine how much CPU a task really does ++use. Mux uses sub-tick accounting from the TSC clock to determine real CPU ++usage. Thus, the amount of CPU reported as being used by MuQSS will more ++accurately represent how much CPU the task itself is using (as is shown for ++example by the 'time' application), so the reported values may be quite ++different to other schedulers. When comparing throughput of MuQSS to other ++designs, it is important to compare the actual completed work in terms of total ++wall clock time taken and total work done, rather than the reported "cpu usage". ++ ++Symmetric MultiThreading (SMT) aware nice: ++ ++SMT, a.k.a. hyperthreading, is a very common feature on modern CPUs. While the ++logical CPU count rises by adding thread units to each CPU core, allowing more ++than one task to be run simultaneously on the same core, the disadvantage of it ++is that the CPU power is shared between the tasks, not summating to the power ++of two CPUs. The practical upshot of this is that two tasks running on ++separate threads of the same core run significantly slower than if they had one ++core each to run on. While smart CPU selection allows each task to have a core ++to itself whenever available (as is done on MuQSS), it cannot offset the ++slowdown that occurs when the cores are all loaded and only a thread is left. ++Most of the time this is harmless as the CPU is effectively overloaded at this ++point and the extra thread is of benefit. However when running a niced task in ++the presence of an un-niced task (say nice 19 v nice 0), the nice task gets ++precisely the same amount of CPU power as the unniced one. MuQSS has an ++optional configuration feature known as SMT-NICE which selectively idles the ++secondary niced thread for a period proportional to the nice difference, ++allowing CPU distribution according to nice level to be maintained, at the ++expense of a small amount of extra overhead. If this is configured in on a ++machine without SMT threads, the overhead is minimal. ++ ++ ++Con Kolivas <kernel@kolivas.org> Sat, 29th October 2016 +diff -Nur a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt +--- a/Documentation/sysctl/kernel.txt 2019-02-09 17:20:30.451820228 +0000 ++++ b/Documentation/sysctl/kernel.txt 2019-02-09 17:46:11.991297545 +0000 +@@ -41,6 +41,7 @@ + - hung_task_check_interval_secs + - hung_task_warnings + - hyperv_record_panic_msg ++- iso_cpu + - kexec_load_disabled + - kptr_restrict + - l2cr [ PPC only ] +@@ -76,6 +77,7 @@ + - randomize_va_space + - real-root-dev ==> Documentation/admin-guide/initrd.rst + - reboot-cmd [ SPARC only ] ++- rr_interval + - rtsig-max + - rtsig-nr + - seccomp/ ==> Documentation/userspace-api/seccomp_filter.rst +@@ -98,6 +100,7 @@ + - unknown_nmi_panic + - watchdog + - watchdog_thresh ++- yield_type + - version + + ============================================================== +@@ -436,6 +439,16 @@ + + ============================================================== + ++iso_cpu: (MuQSS CPU scheduler only). ++ ++This sets the percentage cpu that the unprivileged SCHED_ISO tasks can ++run effectively at realtime priority, averaged over a rolling five ++seconds over the -whole- system, meaning all cpus. ++ ++Set to 70 (percent) by default. ++ ++============================================================== ++ + l2cr: (PPC only) + + This flag controls the L2 cache of G3 processor boards. If +@@ -863,6 +876,20 @@ + + ============================================================== + ++rr_interval: (MuQSS CPU scheduler only) ++ ++This is the smallest duration that any cpu process scheduling unit ++will run for. Increasing this value can increase throughput of cpu ++bound tasks substantially but at the expense of increased latencies ++overall. Conversely decreasing it will decrease average and maximum ++latencies but at the expense of throughput. This value is in ++milliseconds and the default value chosen depends on the number of ++cpus available at scheduler initialisation with a minimum of 6. ++ ++Valid values are from 1-1000. ++ ++============================================================== ++ + rtsig-max & rtsig-nr: + + The file rtsig-max can be used to tune the maximum number +@@ -1123,3 +1150,13 @@ + tunable to zero will disable lockup detection altogether. + + ============================================================== ++ ++yield_type: (MuQSS CPU scheduler only) ++ ++This determines what type of yield calls to sched_yield will perform. ++ ++ 0: No yield. ++ 1: Yield only to better priority/deadline tasks. (default) ++ 2: Expire timeslice and recalculate deadline. ++ ++============================================================== +diff -Nur a/fs/proc/base.c b/fs/proc/base.c +--- a/fs/proc/base.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/fs/proc/base.c 2019-02-09 17:46:11.991297545 +0000 +@@ -459,7 +459,7 @@ + seq_printf(m, "0 0 0\n"); + else + seq_printf(m, "%llu %llu %lu\n", +- (unsigned long long)task->se.sum_exec_runtime, ++ (unsigned long long)tsk_seruntime(task), + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); + +diff -Nur a/include/linux/init_task.h b/include/linux/init_task.h +--- a/include/linux/init_task.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/init_task.h 2019-02-09 17:46:11.991297545 +0000 +@@ -46,7 +46,11 @@ + #define INIT_CPU_TIMERS(s) + #endif + ++#ifdef CONFIG_SCHED_MUQSS ++#define INIT_TASK_COMM "MuQSS" ++#else + #define INIT_TASK_COMM "swapper" ++#endif + + /* Attach to the init_task data structure for proper alignment */ + #ifdef CONFIG_ARCH_TASK_STRUCT_ON_STACK +diff -Nur a/include/linux/ioprio.h b/include/linux/ioprio.h +--- a/include/linux/ioprio.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/ioprio.h 2019-02-09 17:46:11.991297545 +0000 +@@ -53,6 +53,8 @@ + */ + static inline int task_nice_ioprio(struct task_struct *task) + { ++ if (iso_task(task)) ++ return 0; + return (task_nice(task) + 20) / 5; + } + +diff -Nur a/include/linux/sched/nohz.h b/include/linux/sched/nohz.h +--- a/include/linux/sched/nohz.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/nohz.h 2019-02-09 17:46:11.991297545 +0000 +@@ -6,7 +6,7 @@ + * This is the interface between the scheduler and nohz/dynticks: + */ + +-#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) ++#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + extern void cpu_load_update_nohz_start(void); + extern void cpu_load_update_nohz_stop(void); + #else +@@ -21,7 +21,7 @@ + static inline void nohz_balance_enter_idle(int cpu) { } + #endif + +-#ifdef CONFIG_NO_HZ_COMMON ++#if defined(CONFIG_NO_HZ_COMMON) && !defined(CONFIG_SCHED_MUQSS) + void calc_load_nohz_start(void); + void calc_load_nohz_stop(void); + #else +diff -Nur a/include/linux/sched/prio.h b/include/linux/sched/prio.h +--- a/include/linux/sched/prio.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/prio.h 2019-02-09 17:46:11.991297545 +0000 +@@ -20,8 +20,20 @@ + */ + + #define MAX_USER_RT_PRIO 100 ++ ++#ifdef CONFIG_SCHED_MUQSS ++/* Note different MAX_RT_PRIO */ ++#define MAX_RT_PRIO (MAX_USER_RT_PRIO + 1) ++ ++#define ISO_PRIO (MAX_RT_PRIO) ++#define NORMAL_PRIO (MAX_RT_PRIO + 1) ++#define IDLE_PRIO (MAX_RT_PRIO + 2) ++#define PRIO_LIMIT ((IDLE_PRIO) + 1) ++#else /* CONFIG_SCHED_MUQSS */ + #define MAX_RT_PRIO MAX_USER_RT_PRIO + ++#endif /* CONFIG_SCHED_MUQSS */ ++ + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) + +diff -Nur a/include/linux/sched/rt.h b/include/linux/sched/rt.h +--- a/include/linux/sched/rt.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/rt.h 2019-02-09 17:46:11.991297545 +0000 +@@ -24,8 +24,10 @@ + + if (policy == SCHED_FIFO || policy == SCHED_RR) + return true; ++#ifndef CONFIG_SCHED_MUQSS + if (policy == SCHED_DEADLINE) + return true; ++#endif + return false; + } + +diff -Nur a/include/linux/sched/task.h b/include/linux/sched/task.h +--- a/include/linux/sched/task.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched/task.h 2019-02-09 17:46:11.991297545 +0000 +@@ -80,7 +80,7 @@ + extern void free_task(struct task_struct *tsk); + + /* sched_exec is called by processes performing an exec */ +-#ifdef CONFIG_SMP ++#if defined(CONFIG_SMP) && !defined(CONFIG_SCHED_MUQSS) + extern void sched_exec(void); + #else + #define sched_exec() {} +diff -Nur a/include/linux/sched.h b/include/linux/sched.h +--- a/include/linux/sched.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/sched.h 2019-02-09 17:46:11.991297545 +0000 +@@ -28,6 +28,9 @@ + #include <linux/mm_types_task.h> + #include <linux/task_io_accounting.h> + #include <linux/rseq.h> ++#ifdef CONFIG_SCHED_MUQSS ++#include <linux/skip_list.h> ++#endif + + /* task_struct member predeclarations (sorted alphabetically): */ + struct audit_context; +@@ -613,9 +616,11 @@ + unsigned int flags; + unsigned int ptrace; + ++#if defined(CONFIG_SMP) || defined(CONFIG_SCHED_MUQSS) ++ int on_cpu; ++#endif + #ifdef CONFIG_SMP + struct llist_node wake_entry; +- int on_cpu; + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* Current CPU: */ + unsigned int cpu; +@@ -640,10 +645,25 @@ + int static_prio; + int normal_prio; + unsigned int rt_priority; ++#ifdef CONFIG_SCHED_MUQSS ++ int time_slice; ++ u64 deadline; ++ skiplist_node node; /* Skip list node */ ++ u64 last_ran; ++ u64 sched_time; /* sched_clock time spent running */ ++#ifdef CONFIG_SMT_NICE ++ int smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++#ifdef CONFIG_HOTPLUG_CPU ++ bool zerobound; /* Bound to CPU0 for hotplug */ ++#endif ++ unsigned long rt_timeout; ++#else /* CONFIG_SCHED_MUQSS */ + + const struct sched_class *sched_class; + struct sched_entity se; + struct sched_rt_entity rt; ++#endif + #ifdef CONFIG_CGROUP_SCHED + struct task_group *sched_task_group; + #endif +@@ -798,6 +818,10 @@ + u64 utimescaled; + u64 stimescaled; + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ /* Unbanked cpu time */ ++ unsigned long utime_ns, stime_ns; ++#endif + u64 gtime; + struct prev_cputime prev_cputime; + #ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN +@@ -1210,6 +1234,40 @@ + */ + }; + ++#ifdef CONFIG_SCHED_MUQSS ++#define tsk_seruntime(t) ((t)->sched_time) ++#define tsk_rttimeout(t) ((t)->rt_timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++} ++ ++void print_scheduler_version(void); ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return (p->policy == SCHED_ISO); ++} ++#else /* CFS */ ++#define tsk_seruntime(t) ((t)->se.sum_exec_runtime) ++#define tsk_rttimeout(t) ((t)->rt.timeout) ++ ++static inline void tsk_cpus_current(struct task_struct *p) ++{ ++ p->nr_cpus_allowed = current->nr_cpus_allowed; ++} ++ ++static inline void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "CFS CPU scheduler.\n"); ++} ++ ++static inline bool iso_task(struct task_struct *p) ++{ ++ return false; ++} ++#endif /* CONFIG_SCHED_MUQSS */ ++ + static inline struct pid *task_pid(struct task_struct *task) + { + return task->thread_pid; +diff -Nur a/include/linux/skip_list.h b/include/linux/skip_list.h +--- a/include/linux/skip_list.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/skip_list.h 2019-02-09 17:46:11.991297545 +0000 +@@ -0,0 +1,33 @@ ++#ifndef _LINUX_SKIP_LISTS_H ++#define _LINUX_SKIP_LISTS_H ++typedef u64 keyType; ++typedef void *valueType; ++ ++typedef struct nodeStructure skiplist_node; ++ ++struct nodeStructure { ++ int level; /* Levels in this structure */ ++ keyType key; ++ valueType value; ++ skiplist_node *next[8]; ++ skiplist_node *prev[8]; ++}; ++ ++typedef struct listStructure { ++ int entries; ++ int level; /* Maximum level of the list ++ (1 more than the number of levels in the list) */ ++ skiplist_node *header; /* pointer to header */ ++} skiplist; ++ ++void skiplist_init(skiplist_node *slnode); ++skiplist *new_skiplist(skiplist_node *slnode); ++void free_skiplist(skiplist *l); ++void skiplist_node_init(skiplist_node *node); ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed); ++void skiplist_delete(skiplist *l, skiplist_node *node); ++ ++static inline bool skiplist_node_empty(skiplist_node *node) { ++ return (!node->next[0]); ++} ++#endif /* _LINUX_SKIP_LISTS_H */ +diff -Nur a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h +--- a/include/uapi/linux/sched.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/uapi/linux/sched.h 2019-02-09 17:46:11.991297545 +0000 +@@ -37,9 +37,16 @@ + #define SCHED_FIFO 1 + #define SCHED_RR 2 + #define SCHED_BATCH 3 +-/* SCHED_ISO: reserved but not implemented yet */ ++/* SCHED_ISO: Implemented on MuQSS only */ + #define SCHED_IDLE 5 ++#ifdef CONFIG_SCHED_MUQSS ++#define SCHED_ISO 4 ++#define SCHED_IDLEPRIO SCHED_IDLE ++#define SCHED_MAX (SCHED_IDLEPRIO) ++#define SCHED_RANGE(policy) ((policy) <= SCHED_MAX) ++#else /* CONFIG_SCHED_MUQSS */ + #define SCHED_DEADLINE 6 ++#endif /* CONFIG_SCHED_MUQSS */ + + /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ + #define SCHED_RESET_ON_FORK 0x40000000 +diff -Nur a/init/init_task.c b/init/init_task.c +--- a/init/init_task.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/init/init_task.c 2019-02-09 17:46:11.991297545 +0000 +@@ -67,9 +67,17 @@ + .stack = init_stack, + .usage = ATOMIC_INIT(2), + .flags = PF_KTHREAD, ++#ifdef CONFIG_SCHED_MUQSS ++ .prio = NORMAL_PRIO, ++ .static_prio = MAX_PRIO-20, ++ .normal_prio = NORMAL_PRIO, ++ .deadline = 0, ++ .time_slice = 1000000, ++#else + .prio = MAX_PRIO - 20, + .static_prio = MAX_PRIO - 20, + .normal_prio = MAX_PRIO - 20, ++#endif + .policy = SCHED_NORMAL, + .cpus_allowed = CPU_MASK_ALL, + .nr_cpus_allowed= NR_CPUS, +@@ -78,6 +86,7 @@ + .restart_block = { + .fn = do_no_restart_syscall, + }, ++#ifndef CONFIG_SCHED_MUQSS + .se = { + .group_node = LIST_HEAD_INIT(init_task.se.group_node), + }, +@@ -85,6 +94,7 @@ + .run_list = LIST_HEAD_INIT(init_task.rt.run_list), + .time_slice = RR_TIMESLICE, + }, ++#endif + .tasks = LIST_HEAD_INIT(init_task.tasks), + #ifdef CONFIG_SMP + .pushable_tasks = PLIST_NODE_INIT(init_task.pushable_tasks, MAX_PRIO), +diff -Nur a/init/Kconfig b/init/Kconfig +--- a/init/Kconfig 2019-02-09 17:20:30.481821193 +0000 ++++ b/init/Kconfig 2019-02-09 17:46:11.991297545 +0000 +@@ -45,6 +45,18 @@ + + menu "General setup" + ++config SCHED_MUQSS ++ bool "MuQSS cpu scheduler" ++ select HIGH_RES_TIMERS ++ ---help--- ++ The Multiple Queue Skiplist Scheduler for excellent interactivity and ++ responsiveness on the desktop and highly scalable deterministic ++ low latency on any hardware. ++ ++ Say Y here. ++ default y ++ ++ + config BROKEN + bool + +@@ -653,6 +665,7 @@ + depends on ARCH_SUPPORTS_NUMA_BALANCING + depends on !ARCH_WANT_NUMA_VARIABLE_LOCALITY + depends on SMP && NUMA && MIGRATION ++ depends on !SCHED_MUQSS + help + This option adds support for automatic NUMA aware memory/task placement. + The mechanism is quite primitive and is based on migrating memory when +@@ -760,9 +773,13 @@ + help + This feature lets CPU scheduler recognize task groups and control CPU + bandwidth allocation to such task groups. It uses cgroups to group +- tasks. ++ tasks. In combination with MuQSS this is purely a STUB to create the ++ files associated with the CPU controller cgroup but most of the ++ controls do nothing. This is useful for working in environments and ++ with applications that will only work if this control group is ++ present. + +-if CGROUP_SCHED ++if CGROUP_SCHED && !SCHED_MUQSS + config FAIR_GROUP_SCHED + bool "Group scheduling for SCHED_OTHER" + depends on CGROUP_SCHED +@@ -869,6 +886,7 @@ + + config CGROUP_CPUACCT + bool "Simple CPU accounting controller" ++ depends on !SCHED_MUQSS + help + Provides a simple controller for monitoring the + total CPU consumed by the tasks in a cgroup. +@@ -987,6 +1005,7 @@ + + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" ++ depends on !SCHED_MUQSS + select CGROUPS + select CGROUP_SCHED + select FAIR_GROUP_SCHED +diff -Nur a/init/main.c b/init/main.c +--- a/init/main.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/init/main.c 2019-02-09 17:46:11.991297545 +0000 +@@ -1079,6 +1079,8 @@ + + rcu_end_inkernel_boot(); + ++ print_scheduler_version(); ++ + if (ramdisk_execute_command) { + ret = run_init_process(ramdisk_execute_command); + if (!ret) +diff -Nur a/kernel/delayacct.c b/kernel/delayacct.c +--- a/kernel/delayacct.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/delayacct.c 2019-02-09 17:46:11.991297545 +0000 +@@ -115,7 +115,7 @@ + */ + t1 = tsk->sched_info.pcount; + t2 = tsk->sched_info.run_delay; +- t3 = tsk->se.sum_exec_runtime; ++ t3 = tsk_seruntime(tsk); + + d->cpu_count += t1; + +diff -Nur a/kernel/exit.c b/kernel/exit.c +--- a/kernel/exit.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/exit.c 2019-02-09 17:46:11.991297545 +0000 +@@ -130,7 +130,7 @@ + sig->curr_target = next_thread(tsk); + } + +- add_device_randomness((const void*) &tsk->se.sum_exec_runtime, ++ add_device_randomness((const void*) &tsk_seruntime(tsk), + sizeof(unsigned long long)); + + /* +@@ -151,7 +151,7 @@ + sig->inblock += task_io_get_inblock(tsk); + sig->oublock += task_io_get_oublock(tsk); + task_io_accounting_add(&sig->ioac, &tsk->ioac); +- sig->sum_sched_runtime += tsk->se.sum_exec_runtime; ++ sig->sum_sched_runtime += tsk_seruntime(tsk); + sig->nr_threads--; + __unhash_process(tsk, group_dead); + write_sequnlock(&sig->stats_lock); +diff -Nur a/kernel/kthread.c b/kernel/kthread.c +--- a/kernel/kthread.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/kthread.c 2019-02-09 17:46:11.991297545 +0000 +@@ -424,6 +424,34 @@ + } + EXPORT_SYMBOL(kthread_bind); + ++#if defined(CONFIG_SCHED_MUQSS) && defined(CONFIG_SMP) ++extern void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask); ++ ++/* ++ * new_kthread_bind is a special variant of __kthread_bind_mask. ++ * For new threads to work on muqss we want to call do_set_cpus_allowed ++ * without the task_cpu being set and the task rescheduled until they're ++ * rescheduled on their own so we call __do_set_cpus_allowed directly which ++ * only changes the cpumask. This is particularly important for smpboot threads ++ * to work. ++ */ ++static void new_kthread_bind(struct task_struct *p, unsigned int cpu) ++{ ++ unsigned long flags; ++ ++ if (WARN_ON(!wait_task_inactive(p, TASK_UNINTERRUPTIBLE))) ++ return; ++ ++ /* It's safe because the task is inactive. */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ __do_set_cpus_allowed(p, cpumask_of(cpu)); ++ p->flags |= PF_NO_SETAFFINITY; ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++#else ++#define new_kthread_bind(p, cpu) kthread_bind(p, cpu) ++#endif ++ + /** + * kthread_create_on_cpu - Create a cpu bound kthread + * @threadfn: the function to run until signal_pending(current). +@@ -445,7 +473,7 @@ + cpu); + if (IS_ERR(p)) + return p; +- kthread_bind(p, cpu); ++ new_kthread_bind(p, cpu); + /* CPU hotplug need to bind once again when unparking the thread. */ + set_bit(KTHREAD_IS_PER_CPU, &to_kthread(p)->flags); + to_kthread(p)->cpu = cpu; +diff -Nur a/kernel/livepatch/transition.c b/kernel/livepatch/transition.c +--- a/kernel/livepatch/transition.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/livepatch/transition.c 2019-02-09 17:46:11.991297545 +0000 +@@ -290,6 +290,12 @@ + return 0; + } + ++#ifdef CONFIG_SCHED_MUQSS ++typedef unsigned long rq_flags_t; ++#else ++typedef struct rq_flags rq_flag_t; ++#endif ++ + /* + * Try to safely switch a task to the target patch state. If it's currently + * running, or it's sleeping on a to-be-patched or to-be-unpatched function, or +@@ -298,7 +304,7 @@ + static bool klp_try_switch_task(struct task_struct *task) + { + struct rq *rq; +- struct rq_flags flags; ++ rq_flags_t flags; + int ret; + bool success = false; + char err_buf[STACK_ERR_BUF_SIZE]; +diff -Nur a/kernel/Makefile b/kernel/Makefile +--- a/kernel/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/Makefile 2019-02-09 17:46:11.991297545 +0000 +@@ -10,7 +10,7 @@ + extable.o params.o \ + kthread.o sys_ni.o nsproxy.o \ + notifier.o ksysfs.o cred.o reboot.o \ +- async.o range.o smpboot.o ucount.o ++ async.o range.o smpboot.o ucount.o skip_list.o + + obj-$(CONFIG_MODULES) += kmod.o + obj-$(CONFIG_MULTIUSER) += groups.o +diff -Nur a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +--- a/kernel/rcu/Kconfig 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/rcu/Kconfig 2019-02-09 17:46:11.991297545 +0000 +@@ -93,7 +93,7 @@ + config CONTEXT_TRACKING_FORCE + bool "Force context tracking" + depends on CONTEXT_TRACKING +- default y if !NO_HZ_FULL ++ default y if !NO_HZ_FULL && !SCHED_MUQSS + help + The major pre-requirement for full dynticks to work is to + support the context tracking subsystem. But there are also +diff -Nur a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +--- a/kernel/sched/cpufreq_schedutil.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/cpufreq_schedutil.c 2019-02-09 17:46:12.001297867 +0000 +@@ -177,6 +177,12 @@ + return cpufreq_driver_resolve_freq(policy, freq); + } + ++#ifdef CONFIG_SCHED_MUQSS ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(rq) ++#else ++#define rt_rq_runnable(rq_rt) rt_rq_is_runnable(&rq->rt) ++#endif ++ + /* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. +@@ -205,7 +211,7 @@ + sg_cpu->max = max = arch_scale_cpu_capacity(NULL, sg_cpu->cpu); + sg_cpu->bw_dl = cpu_bw_dl(rq); + +- if (rt_rq_is_runnable(&rq->rt)) ++ if (rt_rq_runnable(rq)) + return max; + + /* +@@ -626,7 +632,11 @@ + struct task_struct *thread; + struct sched_attr attr = { + .size = sizeof(struct sched_attr), ++#ifdef CONFIG_SCHED_MUQSS ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, ++#endif + .sched_flags = SCHED_FLAG_SUGOV, + .sched_nice = 0, + .sched_priority = 0, +diff -Nur a/kernel/sched/cpupri.h b/kernel/sched/cpupri.h +--- a/kernel/sched/cpupri.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/cpupri.h 2019-02-09 17:46:12.001297867 +0000 +@@ -17,9 +17,11 @@ + int *cpu_to_pri; + }; + ++#ifndef CONFIG_SCHED_MUQSS + #ifdef CONFIG_SMP + int cpupri_find(struct cpupri *cp, struct task_struct *p, struct cpumask *lowest_mask); + void cpupri_set(struct cpupri *cp, int cpu, int pri); + int cpupri_init(struct cpupri *cp); + void cpupri_cleanup(struct cpupri *cp); + #endif ++#endif +diff -Nur a/kernel/sched/cputime.c b/kernel/sched/cputime.c +--- a/kernel/sched/cputime.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/cputime.c 2019-02-09 17:46:12.001297867 +0000 +@@ -265,26 +265,6 @@ + return accounted; + } + +-#ifdef CONFIG_64BIT +-static inline u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- return t->se.sum_exec_runtime; +-} +-#else +-static u64 read_sum_exec_runtime(struct task_struct *t) +-{ +- u64 ns; +- struct rq_flags rf; +- struct rq *rq; +- +- rq = task_rq_lock(t, &rf); +- ns = t->se.sum_exec_runtime; +- task_rq_unlock(rq, t, &rf); +- +- return ns; +-} +-#endif +- + /* + * Accumulate raw cputime values of dead tasks (sig->[us]time) and live + * tasks (sum on group iteration) belonging to @tsk's group. +@@ -662,7 +642,7 @@ + void task_cputime_adjusted(struct task_struct *p, u64 *ut, u64 *st) + { + struct task_cputime cputime = { +- .sum_exec_runtime = p->se.sum_exec_runtime, ++ .sum_exec_runtime = tsk_seruntime(p), + }; + + task_cputime(p, &cputime.utime, &cputime.stime); +diff -Nur a/kernel/sched/idle.c b/kernel/sched/idle.c +--- a/kernel/sched/idle.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/idle.c 2019-02-09 17:46:12.001297867 +0000 +@@ -224,6 +224,8 @@ + static void do_idle(void) + { + int cpu = smp_processor_id(); ++ bool pending = false; ++ + /* + * If the arch has a polling bit, we maintain an invariant: + * +@@ -234,7 +236,10 @@ + */ + + __current_set_polling(); +- tick_nohz_idle_enter(); ++ if (unlikely(softirq_pending(cpu))) ++ pending = true; ++ else ++ tick_nohz_idle_enter(); + + while (!need_resched()) { + check_pgt_cache(); +@@ -272,7 +277,8 @@ + * an IPI to fold the state for us. + */ + preempt_set_need_resched(); +- tick_nohz_idle_exit(); ++ if (!pending) ++ tick_nohz_idle_exit(); + __current_clr_polling(); + + /* +@@ -368,6 +374,7 @@ + do_idle(); + } + ++#ifndef CONFIG_SCHED_MUQSS + /* + * idle-task scheduling class. + */ +@@ -480,3 +487,4 @@ + .switched_to = switched_to_idle, + .update_curr = update_curr_idle, + }; ++#endif /* CONFIG_SCHED_MUQSS */ +diff -Nur a/kernel/sched/Makefile b/kernel/sched/Makefile +--- a/kernel/sched/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/Makefile 2019-02-09 17:46:11.991297545 +0000 +@@ -16,6 +16,17 @@ + CFLAGS_core.o := $(PROFILING) -fno-omit-frame-pointer + endif + ++ifdef CONFIG_SCHED_MUQSS ++obj-y += MuQSS.o clock.o cputime.o ++obj-y += idle.o ++obj-y += wait.o wait_bit.o swait.o completion.o ++ ++obj-$(CONFIG_SMP) += topology.o ++obj-$(CONFIG_SCHEDSTATS) += stats.o ++obj-$(CONFIG_CPU_FREQ) += cpufreq.o ++obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o ++obj-$(CONFIG_CPU_ISOLATION) += isolation.o ++else + obj-y += core.o loadavg.o clock.o cputime.o + obj-y += idle.o fair.o rt.o deadline.o + obj-y += wait.o wait_bit.o swait.o completion.o +@@ -29,3 +40,4 @@ + obj-$(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) += cpufreq_schedutil.o + obj-$(CONFIG_MEMBARRIER) += membarrier.o + obj-$(CONFIG_CPU_ISOLATION) += isolation.o ++endif +diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +--- a/kernel/sched/MuQSS.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/sched/MuQSS.c 2019-02-09 17:46:12.001297867 +0000 +@@ -0,0 +1,7366 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * kernel/sched/MuQSS.c, was kernel/sched.c ++ * ++ * Kernel scheduler and related syscalls ++ * ++ * Copyright (C) 1991-2002 Linus Torvalds ++ * ++ * 1996-12-23 Modified by Dave Grothe to fix bugs in semaphores and ++ * make semaphores SMP safe ++ * 1998-11-19 Implemented schedule_timeout() and related stuff ++ * by Andrea Arcangeli ++ * 2002-01-04 New ultra-scalable O(1) scheduler by Ingo Molnar: ++ * hybrid priority-list and round-robin design with ++ * an array-switch method of distributing timeslices ++ * and per-CPU runqueues. Cleanups and useful suggestions ++ * by Davide Libenzi, preemptible kernel bits by Robert Love. ++ * 2003-09-03 Interactivity tuning by Con Kolivas. ++ * 2004-04-02 Scheduler domains code by Nick Piggin ++ * 2007-04-15 Work begun on replacing all interactivity tuning with a ++ * fair scheduling design by Con Kolivas. ++ * 2007-05-05 Load balancing (smp-nice) and other improvements ++ * by Peter Williams ++ * 2007-05-06 Interactivity improvements to CFS by Mike Galbraith ++ * 2007-07-01 Group scheduling enhancements by Srivatsa Vaddagiri ++ * 2007-11-29 RT balancing improvements by Steven Rostedt, Gregory Haskins, ++ * Thomas Gleixner, Mike Kravetz ++ * 2009-08-13 Brainfuck deadline scheduling policy by Con Kolivas deletes ++ * a whole lot of those previous things. ++ * 2016-10-01 Multiple Queue Skiplist Scheduler scalable evolution of BFS ++ * scheduler by Con Kolivas. ++ */ ++ ++#include <linux/sched/isolation.h> ++#include <linux/sched/loadavg.h> ++ ++#include <linux/binfmts.h> ++#include <linux/blkdev.h> ++#include <linux/compat.h> ++#include <linux/context_tracking.h> ++#include <linux/cpuset.h> ++#include <linux/delayacct.h> ++#include <linux/init_task.h> ++#include <linux/kcov.h> ++#include <linux/kprobes.h> ++#include <linux/mmu_context.h> ++#include <linux/module.h> ++#include <linux/nmi.h> ++#include <linux/prefetch.h> ++#include <linux/profile.h> ++#include <linux/rcupdate_wait.h> ++#include <linux/sched.h> ++#include <linux/security.h> ++#include <linux/skip_list.h> ++#include <linux/syscalls.h> ++#include <linux/tick.h> ++#include <linux/wait_bit.h> ++ ++#include <asm/irq_regs.h> ++#include <asm/switch_to.h> ++#include <asm/tlb.h> ++ ++#include "../workqueue_internal.h" ++#include "../smpboot.h" ++ ++#define CREATE_TRACE_POINTS ++#include <trace/events/sched.h> ++ ++#include "MuQSS.h" ++ ++#define rt_prio(prio) unlikely((prio) < MAX_RT_PRIO) ++#define rt_task(p) rt_prio((p)->prio) ++#define batch_task(p) (unlikely((p)->policy == SCHED_BATCH)) ++#define is_rt_policy(policy) ((policy) == SCHED_FIFO || \ ++ (policy) == SCHED_RR) ++#define has_rt_policy(p) unlikely(is_rt_policy((p)->policy)) ++ ++#define is_idle_policy(policy) ((policy) == SCHED_IDLEPRIO) ++#define idleprio_task(p) unlikely(is_idle_policy((p)->policy)) ++#define task_running_idle(p) unlikely((p)->prio == IDLE_PRIO) ++ ++#define is_iso_policy(policy) ((policy) == SCHED_ISO) ++#define iso_task(p) unlikely(is_iso_policy((p)->policy)) ++#define task_running_iso(p) unlikely((p)->prio == ISO_PRIO) ++ ++#define rq_idle(rq) ((rq)->rq_prio == PRIO_LIMIT) ++ ++#define ISO_PERIOD (5 * HZ) ++ ++#define STOP_PRIO (MAX_RT_PRIO - 1) ++ ++/* ++ * Some helpers for converting to/from various scales. Use shifts to get ++ * approximate multiples of ten for less overhead. ++ */ ++#define APPROX_NS_PS (1073741824) /* Approximate ns per second */ ++#define JIFFIES_TO_NS(TIME) ((TIME) * (APPROX_NS_PS / HZ)) ++#define JIFFY_NS (APPROX_NS_PS / HZ) ++#define JIFFY_US (1048576 / HZ) ++#define NS_TO_JIFFIES(TIME) ((TIME) / JIFFY_NS) ++#define HALF_JIFFY_NS (APPROX_NS_PS / HZ / 2) ++#define HALF_JIFFY_US (1048576 / HZ / 2) ++#define MS_TO_NS(TIME) ((TIME) << 20) ++#define MS_TO_US(TIME) ((TIME) << 10) ++#define NS_TO_MS(TIME) ((TIME) >> 20) ++#define NS_TO_US(TIME) ((TIME) >> 10) ++#define US_TO_NS(TIME) ((TIME) << 10) ++#define TICK_APPROX_NS ((APPROX_NS_PS+HZ/2)/HZ) ++ ++#define RESCHED_US (100) /* Reschedule if less than this many μs left */ ++ ++void print_scheduler_version(void) ++{ ++ printk(KERN_INFO "MuQSS CPU scheduler v0.180 by Con Kolivas.\n"); ++} ++ ++#define RQSHARE_NONE 0 ++#define RQSHARE_SMT 1 ++#define RQSHARE_MC 2 ++#define RQSHARE_SMP 3 ++ ++/* ++ * This determines what level of runqueue sharing will be done and is ++ * configurable at boot time with the bootparam rqshare = ++ */ ++static int rqshare __read_mostly = CONFIG_SHARERQ; /* Default RQSHARE_MC */ ++ ++static int __init set_rqshare(char *str) ++{ ++ if (!strncmp(str, "none", 4)) { ++ rqshare = RQSHARE_NONE; ++ return 0; ++ } ++ if (!strncmp(str, "smt", 3)) { ++ rqshare = RQSHARE_SMT; ++ return 0; ++ } ++ if (!strncmp(str, "mc", 2)) { ++ rqshare = RQSHARE_MC; ++ return 0; ++ } ++ if (!strncmp(str, "smp", 2)) { ++ rqshare = RQSHARE_SMP; ++ return 0; ++ } ++ return 1; ++} ++__setup("rqshare=", set_rqshare); ++ ++/* ++ * This is the time all tasks within the same priority round robin. ++ * Value is in ms and set to a minimum of 6ms. ++ * Tunable via /proc interface. ++ */ ++int rr_interval __read_mostly = 6; ++ ++/* ++ * Tunable to choose whether to prioritise latency or throughput, simple ++ * binary yes or no ++ */ ++int sched_interactive __read_mostly = 1; ++ ++/* ++ * sched_iso_cpu - sysctl which determines the cpu percentage SCHED_ISO tasks ++ * are allowed to run five seconds as real time tasks. This is the total over ++ * all online cpus. ++ */ ++int sched_iso_cpu __read_mostly = 70; ++ ++/* ++ * sched_yield_type - Choose what sort of yield sched_yield will perform. ++ * 0: No yield. ++ * 1: Yield only to better priority/deadline tasks. (default) ++ * 2: Expire timeslice and recalculate deadline. ++ */ ++int sched_yield_type __read_mostly = 1; ++ ++/* ++ * The relative length of deadline for each priority(nice) level. ++ */ ++static int prio_ratios[NICE_WIDTH] __read_mostly; ++ ++ ++/* ++ * The quota handed out to tasks of all priority levels when refilling their ++ * time_slice. ++ */ ++static inline int timeslice(void) ++{ ++ return MS_TO_US(rr_interval); ++} ++ ++DEFINE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++ ++#ifdef CONFIG_SMP ++/* ++ * Total number of runqueues. Equals number of CPUs when there is no runqueue ++ * sharing but is usually less with SMT/MC sharing of runqueues. ++ */ ++static int total_runqueues __read_mostly = 1; ++ ++static cpumask_t cpu_idle_map ____cacheline_aligned_in_smp; ++ ++struct rq *cpu_rq(int cpu) ++{ ++ return &per_cpu(runqueues, (cpu)); ++} ++#define cpu_curr(cpu) (cpu_rq(cpu)->curr) ++ ++/* ++ * For asym packing, by default the lower numbered cpu has higher priority. ++ */ ++int __weak arch_asym_cpu_priority(int cpu) ++{ ++ return -cpu; ++} ++ ++int __weak arch_sd_sibling_asym_packing(void) ++{ ++ return 0*SD_ASYM_PACKING; ++} ++#else ++struct rq *uprq; ++#endif /* CONFIG_SMP */ ++ ++#include "stats.h" ++ ++/* ++ * All common locking functions performed on rq->lock. rq->clock is local to ++ * the CPU accessing it so it can be modified just with interrupts disabled ++ * when we're not updating niffies. ++ * Looking up task_rq must be done under rq->lock to be safe. ++ */ ++ ++/* ++ * RQ-clock updating methods: ++ */ ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static void update_irq_load_avg(struct rq *rq, long delta); ++#else ++static inline void update_irq_load_avg(struct rq *rq, long delta) {} ++#endif ++ ++static void update_rq_clock_task(struct rq *rq, s64 delta) ++{ ++/* ++ * In theory, the compile should just see 0 here, and optimize out the call ++ * to sched_rt_avg_update. But I don't trust it... ++ */ ++#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) ++ s64 steal = 0, irq_delta = 0; ++#endif ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time; ++ ++ /* ++ * Since irq_time is only updated on {soft,}irq_exit, we might run into ++ * this case when a previous update_rq_clock() happened inside a ++ * {soft,}irq region. ++ * ++ * When this happens, we stop ->clock_task and only update the ++ * prev_irq_time stamp to account for the part that fit, so that a next ++ * update will consume the rest. This ensures ->clock_task is ++ * monotonic. ++ * ++ * It does however cause some slight miss-attribution of {soft,}irq ++ * time, a more accurate solution would be to update the irq_time using ++ * the current rq->clock timestamp, except that would require using ++ * atomic ops. ++ */ ++ if (irq_delta > delta) ++ irq_delta = delta; ++ ++ rq->prev_irq_time += irq_delta; ++ delta -= irq_delta; ++#endif ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ if (static_key_false((¶virt_steal_rq_enabled))) { ++ steal = paravirt_steal_clock(cpu_of(rq)); ++ steal -= rq->prev_steal_time_rq; ++ ++ if (unlikely(steal > delta)) ++ steal = delta; ++ ++ rq->prev_steal_time_rq += steal; ++ delta -= steal; ++ } ++#endif ++ rq->clock_task += delta; ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++ if (irq_delta + steal) ++ update_irq_load_avg(rq, irq_delta + steal); ++#endif ++} ++ ++static inline void update_rq_clock(struct rq *rq) ++{ ++ s64 delta = sched_clock_cpu(cpu_of(rq)) - rq->clock; ++ ++ if (unlikely(delta < 0)) ++ return; ++ rq->clock += delta; ++ update_rq_clock_task(rq, delta); ++} ++ ++/* ++ * Niffies are a globally increasing nanosecond counter. They're only used by ++ * update_load_avg and time_slice_expired, however deadlines are based on them ++ * across CPUs. Update them whenever we will call one of those functions, and ++ * synchronise them across CPUs whenever we hold both runqueue locks. ++ */ ++static inline void update_clocks(struct rq *rq) ++{ ++ s64 ndiff, minndiff; ++ long jdiff; ++ ++ update_rq_clock(rq); ++ ndiff = rq->clock - rq->old_clock; ++ rq->old_clock = rq->clock; ++ jdiff = jiffies - rq->last_jiffy; ++ ++ /* Subtract any niffies added by balancing with other rqs */ ++ ndiff -= rq->niffies - rq->last_niffy; ++ minndiff = JIFFIES_TO_NS(jdiff) - rq->niffies + rq->last_jiffy_niffies; ++ if (minndiff < 0) ++ minndiff = 0; ++ ndiff = max(ndiff, minndiff); ++ rq->niffies += ndiff; ++ rq->last_niffy = rq->niffies; ++ if (jdiff) { ++ rq->last_jiffy += jdiff; ++ rq->last_jiffy_niffies = rq->niffies; ++ } ++} ++ ++static inline int task_on_rq_queued(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_QUEUED; ++} ++ ++static inline int task_on_rq_migrating(struct task_struct *p) ++{ ++ return p->on_rq == TASK_ON_RQ_MIGRATING; ++} ++ ++/* ++ * Any time we have two runqueues locked we use that as an opportunity to ++ * synchronise niffies to the highest value as idle ticks may have artificially ++ * kept niffies low on one CPU and the truth can only be later. ++ */ ++static inline void synchronise_niffies(struct rq *rq1, struct rq *rq2) ++{ ++ if (rq1->niffies > rq2->niffies) ++ rq2->niffies = rq1->niffies; ++ else ++ rq1->niffies = rq2->niffies; ++} ++ ++/* ++ * double_rq_lock - safely lock two runqueues ++ * ++ * Note this does not disable interrupts like task_rq_lock, ++ * you need to do so manually before calling. ++ */ ++ ++/* For when we know rq1 != rq2 */ ++static inline void __double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ if (rq1 < rq2) { ++ raw_spin_lock(rq1->lock); ++ raw_spin_lock_nested(rq2->lock, SINGLE_DEPTH_NESTING); ++ } else { ++ raw_spin_lock(rq2->lock); ++ raw_spin_lock_nested(rq1->lock, SINGLE_DEPTH_NESTING); ++ } ++} ++ ++static inline void double_rq_lock(struct rq *rq1, struct rq *rq2) ++ __acquires(rq1->lock) ++ __acquires(rq2->lock) ++{ ++ BUG_ON(!irqs_disabled()); ++ if (rq1->lock == rq2->lock) { ++ raw_spin_lock(rq1->lock); ++ __acquire(rq2->lock); /* Fake it out ;) */ ++ } else ++ __double_rq_lock(rq1, rq2); ++ synchronise_niffies(rq1, rq2); ++} ++ ++/* ++ * double_rq_unlock - safely unlock two runqueues ++ * ++ * Note this does not restore interrupts like task_rq_unlock, ++ * you need to do so manually after calling. ++ */ ++static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) ++ __releases(rq1->lock) ++ __releases(rq2->lock) ++{ ++ raw_spin_unlock(rq1->lock); ++ if (rq1->lock != rq2->lock) ++ raw_spin_unlock(rq2->lock); ++ else ++ __release(rq2->lock); ++} ++ ++static inline void lock_all_rqs(void) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_lock(rq->lock); ++ } ++} ++ ++static inline void unlock_all_rqs(void) ++{ ++ int cpu; ++ ++ for_each_possible_cpu(cpu) { ++ struct rq *rq = cpu_rq(cpu); ++ ++ do_raw_spin_unlock(rq->lock); ++ } ++ preempt_enable(); ++} ++ ++/* Specially nest trylock an rq */ ++static inline bool trylock_rq(struct rq *this_rq, struct rq *rq) ++{ ++ if (unlikely(!do_raw_spin_trylock(rq->lock))) ++ return false; ++ spin_acquire(rq->lock.dep_map, SINGLE_DEPTH_NESTING, 1, _RET_IP_); ++ synchronise_niffies(this_rq, rq); ++ return true; ++} ++ ++/* Unlock a specially nested trylocked rq */ ++static inline void unlock_rq(struct rq *rq) ++{ ++ spin_release(rq->lock.dep_map, 1, _RET_IP_); ++ do_raw_spin_unlock(rq->lock); ++} ++ ++/* ++ * cmpxchg based fetch_or, macro so it works for different integer types ++ */ ++#define fetch_or(ptr, mask) \ ++ ({ \ ++ typeof(ptr) _ptr = (ptr); \ ++ typeof(mask) _mask = (mask); \ ++ typeof(*_ptr) _old, _val = *_ptr; \ ++ \ ++ for (;;) { \ ++ _old = cmpxchg(_ptr, _val, _val | _mask); \ ++ if (_old == _val) \ ++ break; \ ++ _val = _old; \ ++ } \ ++ _old; \ ++}) ++ ++#if defined(CONFIG_SMP) && defined(TIF_POLLING_NRFLAG) ++/* ++ * Atomically set TIF_NEED_RESCHED and test for TIF_POLLING_NRFLAG, ++ * this avoids any races wrt polling state changes and thereby avoids ++ * spurious IPIs. ++ */ ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++} ++ ++/* ++ * Atomically set TIF_NEED_RESCHED if TIF_POLLING_NRFLAG is set. ++ * ++ * If this returns true, then the idle task promises to call ++ * sched_ttwu_pending() and reschedule soon. ++ */ ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ struct thread_info *ti = task_thread_info(p); ++ typeof(ti->flags) old, val = READ_ONCE(ti->flags); ++ ++ for (;;) { ++ if (!(val & _TIF_POLLING_NRFLAG)) ++ return false; ++ if (val & _TIF_NEED_RESCHED) ++ return true; ++ old = cmpxchg(&ti->flags, val, val | _TIF_NEED_RESCHED); ++ if (old == val) ++ break; ++ val = old; ++ } ++ return true; ++} ++ ++#else ++static bool set_nr_and_not_polling(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++static bool set_nr_if_polling(struct task_struct *p) ++{ ++ return false; ++} ++#endif ++#endif ++ ++void wake_q_add(struct wake_q_head *head, struct task_struct *task) ++{ ++ struct wake_q_node *node = &task->wake_q; ++ ++ /* ++ * Atomically grab the task, if ->wake_q is !nil already it means ++ * its already queued (either by us or someone else) and will get the ++ * wakeup due to that. ++ * ++ * This cmpxchg() executes a full barrier, which pairs with the full ++ * barrier executed by the wakeup in wake_up_q(). ++ */ ++ if (cmpxchg(&node->next, NULL, WAKE_Q_TAIL)) ++ return; ++ ++ get_task_struct(task); ++ ++ /* ++ * The head is context local, there can be no concurrency. ++ */ ++ *head->lastp = node; ++ head->lastp = &node->next; ++} ++ ++void wake_up_q(struct wake_q_head *head) ++{ ++ struct wake_q_node *node = head->first; ++ ++ while (node != WAKE_Q_TAIL) { ++ struct task_struct *task; ++ ++ task = container_of(node, struct task_struct, wake_q); ++ BUG_ON(!task); ++ /* Task can safely be re-inserted now */ ++ node = node->next; ++ task->wake_q.next = NULL; ++ ++ /* ++ * wake_up_process() executes a full barrier, which pairs with ++ * the queueing in wake_q_add() so as not to miss wakeups. ++ */ ++ wake_up_process(task); ++ put_task_struct(task); ++ } ++} ++ ++static inline void smp_sched_reschedule(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ smp_send_reschedule(cpu); ++} ++ ++/* ++ * resched_task - mark a task 'to be rescheduled now'. ++ * ++ * On UP this means the setting of the need_resched flag, on SMP it ++ * might also involve a cross-CPU call to trigger the scheduler on ++ * the target CPU. ++ */ ++void resched_task(struct task_struct *p) ++{ ++ int cpu; ++#ifdef CONFIG_LOCKDEP ++ /* Kernel threads call this when creating workqueues while still ++ * inactive from __kthread_bind_mask, holding only the pi_lock */ ++ if (!(p->flags & PF_KTHREAD)) { ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(rq->lock); ++ } ++#endif ++ if (test_tsk_need_resched(p)) ++ return; ++ ++ cpu = task_cpu(p); ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(p)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++/* ++ * A task that is not running or queued will not have a node set. ++ * A task that is queued but not running will have a node set. ++ * A task that is currently running will have ->on_cpu set but no node set. ++ */ ++static inline bool task_queued(struct task_struct *p) ++{ ++ return !skiplist_node_empty(&p->node); ++} ++ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags); ++static inline void resched_if_idle(struct rq *rq); ++ ++/* Dodgy workaround till we figure out where the softirqs are going */ ++static inline void do_pending_softirq(struct rq *rq, struct task_struct *next) ++{ ++ if (unlikely(next == rq->idle && local_softirq_pending() && !in_interrupt())) ++ do_softirq_own_stack(); ++} ++ ++static inline bool deadline_before(u64 deadline, u64 time) ++{ ++ return (deadline < time); ++} ++ ++/* ++ * Deadline is "now" in niffies + (offset by priority). Setting the deadline ++ * is the key to everything. It distributes cpu fairly amongst tasks of the ++ * same nice value, it proportions cpu according to nice level, it means the ++ * task that last woke up the longest ago has the earliest deadline, thus ++ * ensuring that interactive tasks get low latency on wake up. The CPU ++ * proportion works out to the square of the virtual deadline difference, so ++ * this equation will give nice 19 3% CPU compared to nice 0. ++ */ ++static inline u64 prio_deadline_diff(int user_prio) ++{ ++ return (prio_ratios[user_prio] * rr_interval * (MS_TO_NS(1) / 128)); ++} ++ ++static inline u64 task_deadline_diff(struct task_struct *p) ++{ ++ return prio_deadline_diff(TASK_USER_PRIO(p)); ++} ++ ++static inline u64 static_deadline_diff(int static_prio) ++{ ++ return prio_deadline_diff(USER_PRIO(static_prio)); ++} ++ ++static inline int longest_deadline_diff(void) ++{ ++ return prio_deadline_diff(39); ++} ++ ++static inline int ms_longest_deadline_diff(void) ++{ ++ return NS_TO_MS(longest_deadline_diff()); ++} ++ ++static inline bool rq_local(struct rq *rq); ++ ++#ifndef SCHED_CAPACITY_SCALE ++#define SCHED_CAPACITY_SCALE 1024 ++#endif ++ ++static inline int rq_load(struct rq *rq) ++{ ++ return rq->nr_running; ++} ++ ++/* ++ * Update the load average for feeding into cpu frequency governors. Use a ++ * rough estimate of a rolling average with ~ time constant of 32ms. ++ * 80/128 ~ 0.63. * 80 / 32768 / 128 == * 5 / 262144 ++ * Make sure a call to update_clocks has been made before calling this to get ++ * an updated rq->niffies. ++ */ ++static void update_load_avg(struct rq *rq, unsigned int flags) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = rq_load(rq); ++ load = rq->load_avg - (rq->load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->load_avg = load; ++ ++ rq->load_update = rq->niffies; ++ update_irq_load_avg(rq, 0); ++ if (likely(rq_local(rq))) ++ cpufreq_trigger(rq, flags); ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++/* ++ * IRQ variant of update_load_avg below. delta is actually time in nanoseconds ++ * here so we scale curload to how long it's been since the last update. ++ */ ++static void update_irq_load_avg(struct rq *rq, long delta) ++{ ++ long us_interval, load; ++ unsigned long curload; ++ ++ us_interval = NS_TO_US(rq->niffies - rq->irq_load_update); ++ if (unlikely(us_interval <= 0)) ++ return; ++ ++ curload = NS_TO_US(delta) / us_interval; ++ load = rq->irq_load_avg - (rq->irq_load_avg * us_interval * 5 / 262144); ++ if (unlikely(load < 0)) ++ load = 0; ++ load += curload * curload * SCHED_CAPACITY_SCALE * us_interval * 5 / 262144; ++ rq->irq_load_avg = load; ++ ++ rq->irq_load_update = rq->niffies; ++} ++#endif ++ ++/* ++ * Removing from the runqueue. Enter with rq locked. Deleting a task ++ * from the skip list is done via the stored node reference in the task struct ++ * and does not require a full look up. Thus it occurs in O(k) time where k ++ * is the "level" of the list the task was stored at - usually < 4, max 8. ++ */ ++static void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ skiplist_delete(rq->sl, &p->node); ++ rq->best_key = rq->node->next[0]->key; ++ update_clocks(rq); ++ ++ if (!(flags & DEQUEUE_SAVE)) ++ sched_info_dequeued(task_rq(p), p); ++ rq->nr_running--; ++ if (rt_task(p)) ++ rq->rt_nr_running--; ++ update_load_avg(rq, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_RCU ++static bool rcu_read_critical(struct task_struct *p) ++{ ++ return p->rcu_read_unlock_special.b.blocked; ++} ++#else /* CONFIG_PREEMPT_RCU */ ++#define rcu_read_critical(p) (false) ++#endif /* CONFIG_PREEMPT_RCU */ ++ ++/* ++ * To determine if it's safe for a task of SCHED_IDLEPRIO to actually run as ++ * an idle task, we ensure none of the following conditions are met. ++ */ ++static bool idleprio_suitable(struct task_struct *p) ++{ ++ return (!(task_contributes_to_load(p)) && !(p->flags & (PF_EXITING)) && ++ !signal_pending(p) && !rcu_read_critical(p) && !freezing(p)); ++} ++ ++/* ++ * To determine if a task of SCHED_ISO can run in pseudo-realtime, we check ++ * that the iso_refractory flag is not set. ++ */ ++static inline bool isoprio_suitable(struct rq *rq) ++{ ++ return !rq->iso_refractory; ++} ++ ++/* ++ * Adding to the runqueue. Enter with rq locked. ++ */ ++static void enqueue_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ unsigned int randseed, cflags = 0; ++ u64 sl_id; ++ ++ if (!rt_task(p)) { ++ /* Check it hasn't gotten rt from PI */ ++ if ((idleprio_task(p) && idleprio_suitable(p)) || ++ (iso_task(p) && isoprio_suitable(rq))) ++ p->prio = p->normal_prio; ++ else ++ p->prio = NORMAL_PRIO; ++ } else ++ rq->rt_nr_running++; ++ /* ++ * The sl_id key passed to the skiplist generates a sorted list. ++ * Realtime and sched iso tasks run FIFO so they only need be sorted ++ * according to priority. The skiplist will put tasks of the same ++ * key inserted later in FIFO order. Tasks of sched normal, batch ++ * and idleprio are sorted according to their deadlines. Idleprio ++ * tasks are offset by an impossibly large deadline value ensuring ++ * they get sorted into last positions, but still according to their ++ * own deadlines. This creates a "landscape" of skiplists running ++ * from priority 0 realtime in first place to the lowest priority ++ * idleprio tasks last. Skiplist insertion is an O(log n) process. ++ */ ++ if (p->prio <= ISO_PRIO) { ++ sl_id = p->prio; ++ } else { ++ sl_id = p->deadline; ++ if (idleprio_task(p)) { ++ if (p->prio == IDLE_PRIO) ++ sl_id |= 0xF000000000000000; ++ else ++ sl_id += longest_deadline_diff(); ++ } ++ } ++ /* ++ * Some architectures don't have better than microsecond resolution ++ * so mask out ~microseconds as the random seed for skiplist insertion. ++ */ ++ update_clocks(rq); ++ if (!(flags & ENQUEUE_RESTORE)) ++ sched_info_queued(rq, p); ++ randseed = (rq->niffies >> 10) & 0xFFFFFFFF; ++ skiplist_insert(rq->sl, &p->node, sl_id, p, randseed); ++ rq->best_key = rq->node->next[0]->key; ++ if (p->in_iowait) ++ cflags |= SCHED_CPUFREQ_IOWAIT; ++ rq->nr_running++; ++ update_load_avg(rq, cflags); ++} ++ ++/* ++ * Returns the relative length of deadline all compared to the shortest ++ * deadline which is that of nice -20. ++ */ ++static inline int task_prio_ratio(struct task_struct *p) ++{ ++ return prio_ratios[TASK_USER_PRIO(p)]; ++} ++ ++/* ++ * task_timeslice - all tasks of all priorities get the exact same timeslice ++ * length. CPU distribution is handled by giving different deadlines to ++ * tasks of different priorities. Use 128 as the base value for fast shifts. ++ */ ++static inline int task_timeslice(struct task_struct *p) ++{ ++ return (rr_interval * task_prio_ratio(p) / 128); ++} ++ ++#ifdef CONFIG_SMP ++/* Entered with rq locked */ ++static inline void resched_if_idle(struct rq *rq) ++{ ++ if (rq_idle(rq)) ++ resched_task(rq->curr); ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return (rq->cpu == smp_processor_id()); ++} ++#ifdef CONFIG_SMT_NICE ++static const cpumask_t *thread_cpumask(int cpu); ++ ++/* Find the best real time priority running on any SMT siblings of cpu and if ++ * none are running, the static priority of the best deadline task running. ++ * The lookups to the other runqueues is done lockless as the occasional wrong ++ * value would be harmless. */ ++static int best_smt_bias(struct rq *this_rq) ++{ ++ int other_cpu, best_bias = 0; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq = cpu_rq(other_cpu); ++ ++ if (rq_idle(rq)) ++ continue; ++ if (unlikely(!rq->online)) ++ continue; ++ if (!rq->rq_mm) ++ continue; ++ if (likely(rq->rq_smt_bias > best_bias)) ++ best_bias = rq->rq_smt_bias; ++ } ++ return best_bias; ++} ++ ++static int task_prio_bias(struct task_struct *p) ++{ ++ if (rt_task(p)) ++ return 1 << 30; ++ else if (task_running_iso(p)) ++ return 1 << 29; ++ else if (task_running_idle(p)) ++ return 0; ++ return MAX_PRIO - p->static_prio; ++} ++ ++static bool smt_always_schedule(struct task_struct __maybe_unused *p, struct rq __maybe_unused *this_rq) ++{ ++ return true; ++} ++ ++static bool (*smt_schedule)(struct task_struct *p, struct rq *this_rq) = &smt_always_schedule; ++ ++/* We've already decided p can run on CPU, now test if it shouldn't for SMT ++ * nice reasons. */ ++static bool smt_should_schedule(struct task_struct *p, struct rq *this_rq) ++{ ++ int best_bias, task_bias; ++ ++ /* Kernel threads always run */ ++ if (unlikely(!p->mm)) ++ return true; ++ if (rt_task(p)) ++ return true; ++ if (!idleprio_suitable(p)) ++ return true; ++ best_bias = best_smt_bias(this_rq); ++ /* The smt siblings are all idle or running IDLEPRIO */ ++ if (best_bias < 1) ++ return true; ++ task_bias = task_prio_bias(p); ++ if (task_bias < 1) ++ return false; ++ if (task_bias >= best_bias) ++ return true; ++ /* Dither 25% cpu of normal tasks regardless of nice difference */ ++ if (best_bias % 4 == 1) ++ return true; ++ /* Sorry, you lose */ ++ return false; ++} ++#else /* CONFIG_SMT_NICE */ ++#define smt_schedule(p, this_rq) (true) ++#endif /* CONFIG_SMT_NICE */ ++ ++static inline void atomic_set_cpu(int cpu, cpumask_t *cpumask) ++{ ++ set_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++/* ++ * The cpu_idle_map stores a bitmap of all the CPUs currently idle to ++ * allow easy lookup of whether any suitable idle CPUs are available. ++ * It's cheaper to maintain a binary yes/no if there are any idle CPUs on the ++ * idle_cpus variable than to do a full bitmask check when we are busy. The ++ * bits are set atomically but read locklessly as occasional false positive / ++ * negative is harmless. ++ */ ++static inline void set_cpuidle_map(int cpu) ++{ ++ if (likely(cpu_online(cpu))) ++ atomic_set_cpu(cpu, &cpu_idle_map); ++} ++ ++static inline void atomic_clear_cpu(int cpu, cpumask_t *cpumask) ++{ ++ clear_bit(cpu, (volatile unsigned long *)cpumask); ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++ atomic_clear_cpu(cpu, &cpu_idle_map); ++} ++ ++static bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return (cpumask_intersects(&p->cpus_allowed, &cpu_idle_map)); ++} ++ ++/* ++ * Resched current on rq. We don't know if rq is local to this CPU nor if it ++ * is locked so we do not use an intermediate variable for the task to avoid ++ * having it dereferenced. ++ */ ++static void resched_curr(struct rq *rq) ++{ ++ int cpu; ++ ++ if (test_tsk_need_resched(rq->curr)) ++ return; ++ ++ rq->preempt = rq->curr; ++ cpu = rq->cpu; ++ ++ /* We're doing this without holding the rq lock if it's not task_rq */ ++ ++ if (cpu == smp_processor_id()) { ++ set_tsk_need_resched(rq->curr); ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ if (set_nr_and_not_polling(rq->curr)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++#define CPUIDLE_DIFF_THREAD (1) ++#define CPUIDLE_DIFF_CORE (2) ++#define CPUIDLE_CACHE_BUSY (4) ++#define CPUIDLE_DIFF_CPU (8) ++#define CPUIDLE_THREAD_BUSY (16) ++#define CPUIDLE_DIFF_NODE (32) ++ ++/* ++ * The best idle CPU is chosen according to the CPUIDLE ranking above where the ++ * lowest value would give the most suitable CPU to schedule p onto next. The ++ * order works out to be the following: ++ * ++ * Same thread, idle or busy cache, idle or busy threads ++ * Other core, same cache, idle or busy cache, idle threads. ++ * Same node, other CPU, idle cache, idle threads. ++ * Same node, other CPU, busy cache, idle threads. ++ * Other core, same cache, busy threads. ++ * Same node, other CPU, busy threads. ++ * Other node, other CPU, idle cache, idle threads. ++ * Other node, other CPU, busy cache, idle threads. ++ * Other node, other CPU, busy threads. ++ */ ++static int best_mask_cpu(int best_cpu, struct rq *rq, cpumask_t *tmpmask) ++{ ++ int best_ranking = CPUIDLE_DIFF_NODE | CPUIDLE_THREAD_BUSY | ++ CPUIDLE_DIFF_CPU | CPUIDLE_CACHE_BUSY | CPUIDLE_DIFF_CORE | ++ CPUIDLE_DIFF_THREAD; ++ int cpu_tmp; ++ ++ if (cpumask_test_cpu(best_cpu, tmpmask)) ++ goto out; ++ ++ for_each_cpu(cpu_tmp, tmpmask) { ++ int ranking, locality; ++ struct rq *tmp_rq; ++ ++ ranking = 0; ++ tmp_rq = cpu_rq(cpu_tmp); ++ ++ locality = rq->cpu_locality[cpu_tmp]; ++#ifdef CONFIG_NUMA ++ if (locality > 3) ++ ranking |= CPUIDLE_DIFF_NODE; ++ else ++#endif ++ if (locality > 2) ++ ranking |= CPUIDLE_DIFF_CPU; ++#ifdef CONFIG_SCHED_MC ++ else if (locality == 2) ++ ranking |= CPUIDLE_DIFF_CORE; ++ else if (!(tmp_rq->cache_idle(tmp_rq))) ++ ranking |= CPUIDLE_CACHE_BUSY; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ if (locality == 1) ++ ranking |= CPUIDLE_DIFF_THREAD; ++ if (!(tmp_rq->siblings_idle(tmp_rq))) ++ ranking |= CPUIDLE_THREAD_BUSY; ++#endif ++ if (ranking < best_ranking) { ++ best_cpu = cpu_tmp; ++ best_ranking = ranking; ++ } ++ } ++out: ++ return best_cpu; ++} ++ ++bool cpus_share_cache(int this_cpu, int that_cpu) ++{ ++ struct rq *this_rq = cpu_rq(this_cpu); ++ ++ return (this_rq->cpu_locality[that_cpu] < 3); ++} ++ ++/* As per resched_curr but only will resched idle task */ ++static inline void resched_idle(struct rq *rq) ++{ ++ if (test_tsk_need_resched(rq->idle)) ++ return; ++ ++ rq->preempt = rq->idle; ++ ++ set_tsk_need_resched(rq->idle); ++ ++ if (rq_local(rq)) { ++ set_preempt_need_resched(); ++ return; ++ } ++ ++ smp_sched_reschedule(rq->cpu); ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ cpumask_t tmpmask; ++ struct rq *rq; ++ int best_cpu; ++ ++ cpumask_and(&tmpmask, &p->cpus_allowed, &cpu_idle_map); ++ best_cpu = best_mask_cpu(cpu, task_rq(p), &tmpmask); ++ rq = cpu_rq(best_cpu); ++ if (!smt_schedule(p, rq)) ++ return NULL; ++ rq->preempt = p; ++ resched_idle(rq); ++ return rq; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq->rq_order[cpu]; ++} ++#else /* CONFIG_SMP */ ++static inline void set_cpuidle_map(int cpu) ++{ ++} ++ ++static inline void clear_cpuidle_map(int cpu) ++{ ++} ++ ++static inline bool suitable_idle_cpus(struct task_struct *p) ++{ ++ return uprq->curr == uprq->idle; ++} ++ ++static inline void resched_suitable_idle(struct task_struct *p) ++{ ++} ++ ++static inline void resched_curr(struct rq *rq) ++{ ++ resched_task(rq->curr); ++} ++ ++static inline void resched_if_idle(struct rq *rq) ++{ ++} ++ ++static inline bool rq_local(struct rq *rq) ++{ ++ return true; ++} ++ ++static inline struct rq *rq_order(struct rq *rq, int cpu) ++{ ++ return rq; ++} ++ ++static inline bool smt_schedule(struct task_struct *p, struct rq *rq) ++{ ++ return true; ++} ++#endif /* CONFIG_SMP */ ++ ++static inline int normal_prio(struct task_struct *p) ++{ ++ if (has_rt_policy(p)) ++ return MAX_RT_PRIO - 1 - p->rt_priority; ++ if (idleprio_task(p)) ++ return IDLE_PRIO; ++ if (iso_task(p)) ++ return ISO_PRIO; ++ return NORMAL_PRIO; ++} ++ ++/* ++ * Calculate the current priority, i.e. the priority ++ * taken into account by the scheduler. This value might ++ * be boosted by RT tasks as it will be RT if the task got ++ * RT-boosted. If not then it returns p->normal_prio. ++ */ ++static int effective_prio(struct task_struct *p) ++{ ++ p->normal_prio = normal_prio(p); ++ /* ++ * If we are RT tasks or we were boosted to RT priority, ++ * keep the priority unchanged. Otherwise, update priority ++ * to the normal priority: ++ */ ++ if (!rt_prio(p->prio)) ++ return p->normal_prio; ++ return p->prio; ++} ++ ++/* ++ * activate_task - move a task to the runqueue. Enter with rq locked. ++ */ ++static void activate_task(struct task_struct *p, struct rq *rq) ++{ ++ resched_if_idle(rq); ++ ++ /* ++ * Sleep time is in units of nanosecs, so shift by 20 to get a ++ * milliseconds-range estimation of the amount of time that the task ++ * spent sleeping: ++ */ ++ if (unlikely(prof_on == SLEEP_PROFILING)) { ++ if (p->state == TASK_UNINTERRUPTIBLE) ++ profile_hits(SLEEP_PROFILING, (void *)get_wchan(p), ++ (rq->niffies - p->last_ran) >> 20); ++ } ++ ++ p->prio = effective_prio(p); ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible--; ++ ++ enqueue_task(rq, p, 0); ++ p->on_rq = TASK_ON_RQ_QUEUED; ++} ++ ++/* ++ * deactivate_task - If it's running, it's not on the runqueue and we can just ++ * decrement the nr_running. Enter with rq locked. ++ */ ++static inline void deactivate_task(struct task_struct *p, struct rq *rq) ++{ ++ if (task_contributes_to_load(p)) ++ rq->nr_uninterruptible++; ++ ++ p->on_rq = 0; ++ sched_info_dequeued(rq, p); ++} ++ ++#ifdef CONFIG_SMP ++void set_task_cpu(struct task_struct *p, unsigned int new_cpu) ++{ ++ struct rq *rq; ++ ++ if (task_cpu(p) == new_cpu) ++ return; ++ ++ /* Do NOT call set_task_cpu on a currently queued task as we will not ++ * be reliably holding the rq lock after changing CPU. */ ++ BUG_ON(task_queued(p)); ++ rq = task_rq(p); ++ ++#ifdef CONFIG_LOCKDEP ++ /* ++ * The caller should hold either p->pi_lock or rq->lock, when changing ++ * a task's CPU. ->pi_lock for waking tasks, rq->lock for runnable tasks. ++ * ++ * Furthermore, all task_rq users should acquire both locks, see ++ * task_rq_lock(). ++ */ ++ WARN_ON_ONCE(debug_locks && !(lockdep_is_held(&p->pi_lock) || ++ lockdep_is_held(rq->lock))); ++#endif ++ ++ trace_sched_migrate_task(p, new_cpu); ++ rseq_migrate(p); ++ perf_event_task_migrate(p); ++ ++ /* ++ * After ->cpu is set up to a new value, task_rq_lock(p, ...) can be ++ * successfully executed on another CPU. We must ensure that updates of ++ * per-task data have been completed by this moment. ++ */ ++ smp_wmb(); ++ ++ p->wake_cpu = new_cpu; ++ ++ if (task_running(rq, p)) { ++ /* ++ * We should only be calling this on a running task if we're ++ * holding rq lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ ++ /* ++ * We can't change the task_thread_info CPU on a running task ++ * as p will still be protected by the rq lock of the CPU it ++ * is still running on so we only set the wake_cpu for it to be ++ * lazily updated once off the CPU. ++ */ ++ return; ++ } ++ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ p->cpu = new_cpu; ++#else ++ task_thread_info(p)->cpu = new_cpu; ++#endif ++ /* We're no longer protecting p after this point since we're holding ++ * the wrong runqueue lock. */ ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Move a task off the runqueue and take it to a cpu for it will ++ * become the running task. ++ */ ++static inline void take_task(struct rq *rq, int cpu, struct task_struct *p) ++{ ++ struct rq *p_rq = task_rq(p); ++ ++ dequeue_task(p_rq, p, DEQUEUE_SAVE); ++ if (p_rq != rq) { ++ sched_info_dequeued(p_rq, p); ++ sched_info_queued(rq, p); ++ } ++ set_task_cpu(p, cpu); ++} ++ ++/* ++ * Returns a descheduling task to the runqueue unless it is being ++ * deactivated. ++ */ ++static inline void return_task(struct task_struct *p, struct rq *rq, ++ int cpu, bool deactivate) ++{ ++ if (deactivate) ++ deactivate_task(p, rq); ++ else { ++#ifdef CONFIG_SMP ++ /* ++ * set_task_cpu was called on the running task that doesn't ++ * want to deactivate so it has to be enqueued to a different ++ * CPU and we need its lock. Tag it to be moved with as the ++ * lock is dropped in finish_lock_switch. ++ */ ++ if (unlikely(p->wake_cpu != cpu)) ++ p->on_rq = TASK_ON_RQ_MIGRATING; ++ else ++#endif ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ } ++} ++ ++/* Enter with rq lock held. We know p is on the local cpu */ ++static inline void __set_tsk_resched(struct task_struct *p) ++{ ++ set_tsk_need_resched(p); ++ set_preempt_need_resched(); ++} ++ ++/** ++ * task_curr - is this task currently executing on a CPU? ++ * @p: the task in question. ++ * ++ * Return: 1 if the task is currently executing. 0 otherwise. ++ */ ++inline int task_curr(const struct task_struct *p) ++{ ++ return cpu_curr(task_cpu(p)) == p; ++} ++ ++#ifdef CONFIG_SMP ++/* ++ * wait_task_inactive - wait for a thread to unschedule. ++ * ++ * If @match_state is nonzero, it's the @p->state value just checked and ++ * not expected to change. If it changes, i.e. @p might have woken up, ++ * then return zero. When we succeed in waiting for @p to be off its CPU, ++ * we return a positive number (its total switch count). If a second call ++ * a short while later returns the same number, the caller can be sure that ++ * @p has remained unscheduled the whole time. ++ * ++ * The caller must ensure that the task *will* unschedule sometime soon, ++ * else this function might spin for a *long* time. This function can't ++ * be called with interrupts off, or it may introduce deadlock with ++ * smp_call_function() if an IPI is sent by the same process we are ++ * waiting to become inactive. ++ */ ++unsigned long wait_task_inactive(struct task_struct *p, long match_state) ++{ ++ int running, queued; ++ unsigned long flags; ++ unsigned long ncsw; ++ struct rq *rq; ++ ++ for (;;) { ++ rq = task_rq(p); ++ ++ /* ++ * If the task is actively running on another CPU ++ * still, just relax and busy-wait without holding ++ * any locks. ++ * ++ * NOTE! Since we don't hold any locks, it's not ++ * even sure that "rq" stays as the right runqueue! ++ * But we don't care, since this will return false ++ * if the runqueue has changed and p is actually now ++ * running somewhere else! ++ */ ++ while (task_running(rq, p)) { ++ if (match_state && unlikely(p->state != match_state)) ++ return 0; ++ cpu_relax(); ++ } ++ ++ /* ++ * Ok, time to look more closely! We need the rq ++ * lock now, to be *sure*. If we're wrong, we'll ++ * just go back and repeat. ++ */ ++ rq = task_rq_lock(p, &flags); ++ trace_sched_wait_task(p); ++ running = task_running(rq, p); ++ queued = task_on_rq_queued(p); ++ ncsw = 0; ++ if (!match_state || p->state == match_state) ++ ncsw = p->nvcsw | LONG_MIN; /* sets MSB */ ++ task_rq_unlock(rq, p, &flags); ++ ++ /* ++ * If it changed from the expected state, bail out now. ++ */ ++ if (unlikely(!ncsw)) ++ break; ++ ++ /* ++ * Was it really running after all now that we ++ * checked with the proper locks actually held? ++ * ++ * Oops. Go back and try again.. ++ */ ++ if (unlikely(running)) { ++ cpu_relax(); ++ continue; ++ } ++ ++ /* ++ * It's not enough that it's not actively running, ++ * it must be off the runqueue _entirely_, and not ++ * preempted! ++ * ++ * So if it was still runnable (but just not actively ++ * running right now), it's preempted, and we should ++ * yield - it could be a while. ++ */ ++ if (unlikely(queued)) { ++ ktime_t to = NSEC_PER_SEC / HZ; ++ ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ schedule_hrtimeout(&to, HRTIMER_MODE_REL); ++ continue; ++ } ++ ++ /* ++ * Ahh, all good. It wasn't running, and it wasn't ++ * runnable, which means that it will never become ++ * running in the future either. We're all done! ++ */ ++ break; ++ } ++ ++ return ncsw; ++} ++ ++/*** ++ * kick_process - kick a running thread to enter/exit the kernel ++ * @p: the to-be-kicked thread ++ * ++ * Cause a process which is running on another CPU to enter ++ * kernel-mode, without any delay. (to get signals handled.) ++ * ++ * NOTE: this function doesn't have to take the runqueue lock, ++ * because all it wants to ensure is that the remote task enters ++ * the kernel. If the IPI races and the task has been migrated ++ * to another CPU then no harm is done and the purpose has been ++ * achieved as well. ++ */ ++void kick_process(struct task_struct *p) ++{ ++ int cpu; ++ ++ preempt_disable(); ++ cpu = task_cpu(p); ++ if ((cpu != smp_processor_id()) && task_curr(p)) ++ smp_sched_reschedule(cpu); ++ preempt_enable(); ++} ++EXPORT_SYMBOL_GPL(kick_process); ++#endif ++ ++/* ++ * RT tasks preempt purely on priority. SCHED_NORMAL tasks preempt on the ++ * basis of earlier deadlines. SCHED_IDLEPRIO don't preempt anything else or ++ * between themselves, they cooperatively multitask. An idle rq scores as ++ * prio PRIO_LIMIT so it is always preempted. ++ */ ++static inline bool ++can_preempt(struct task_struct *p, int prio, u64 deadline) ++{ ++ /* Better static priority RT task or better policy preemption */ ++ if (p->prio < prio) ++ return true; ++ if (p->prio > prio) ++ return false; ++ if (p->policy == SCHED_BATCH) ++ return false; ++ /* SCHED_NORMAL and ISO will preempt based on deadline */ ++ if (!deadline_before(p->deadline, deadline)) ++ return false; ++ return true; ++} ++ ++#ifdef CONFIG_SMP ++ ++static inline bool is_per_cpu_kthread(struct task_struct *p) ++{ ++ if (!(p->flags & PF_KTHREAD)) ++ return false; ++ ++ if (p->nr_cpus_allowed != 1) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Per-CPU kthreads are allowed to run on !active && online CPUs, see ++ * __set_cpus_allowed_ptr(). ++ */ ++static inline bool is_cpu_allowed(struct task_struct *p, int cpu) ++{ ++ if (!cpumask_test_cpu(cpu, &p->cpus_allowed)) ++ return false; ++ ++ if (is_per_cpu_kthread(p)) ++ return cpu_online(cpu); ++ ++ return cpu_active(cpu); ++} ++ ++/* ++ * Check to see if p can run on cpu, and if not, whether there are any online ++ * CPUs it can run on instead. This only happens with the hotplug threads that ++ * bring up the CPUs. ++ */ ++static inline bool sched_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (likely(cpumask_test_cpu(cpu, &p->cpus_allowed))) ++ return false; ++ if (p->nr_cpus_allowed == 1) { ++ cpumask_t valid_mask; ++ ++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_online_mask); ++ if (unlikely(cpumask_empty(&valid_mask))) ++ return false; ++ } ++ return true; ++} ++ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ if (cpumask_test_cpu(cpu, &p->cpus_allowed)) ++ return false; ++ return true; ++} ++ ++#define cpu_online_map (*(cpumask_t *)cpu_online_mask) ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ int i, this_entries = rq_load(this_rq); ++ cpumask_t tmp; ++ ++ if (suitable_idle_cpus(p) && resched_best_idle(p, task_cpu(p))) ++ return; ++ ++ /* IDLEPRIO tasks never preempt anything but idle */ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ ++ cpumask_and(&tmp, &cpu_online_map, &p->cpus_allowed); ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ struct rq *rq = this_rq->cpu_order[i]; ++ ++ if (!cpumask_test_cpu(rq->cpu, &tmp)) ++ continue; ++ ++ if (!sched_interactive && rq != this_rq && rq_load(rq) <= this_entries) ++ continue; ++ if (smt_schedule(p, rq) && can_preempt(p, rq->rq_prio, rq->rq_deadline)) { ++ /* We set rq->preempting lockless, it's a hint only */ ++ rq->preempting = p; ++ resched_curr(rq); ++ return; ++ } ++ } ++} ++ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check); ++#else /* CONFIG_SMP */ ++static inline bool needs_other_cpu(struct task_struct *p, int cpu) ++{ ++ return false; ++} ++ ++static void try_preempt(struct task_struct *p, struct rq *this_rq) ++{ ++ if (p->policy == SCHED_IDLEPRIO) ++ return; ++ if (can_preempt(p, uprq->rq_prio, uprq->rq_deadline)) ++ resched_curr(uprq); ++} ++ ++static inline int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ return set_cpus_allowed_ptr(p, new_mask); ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * wake flags ++ */ ++#define WF_SYNC 0x01 /* waker goes to sleep after wakeup */ ++#define WF_FORK 0x02 /* child wakeup after fork */ ++#define WF_MIGRATED 0x04 /* internal use, task got migrated */ ++ ++static void ++ttwu_stat(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq; ++ ++ if (!schedstat_enabled()) ++ return; ++ ++ rq = this_rq(); ++ ++#ifdef CONFIG_SMP ++ if (cpu == rq->cpu) { ++ __schedstat_inc(rq->ttwu_local); ++ } else { ++ struct sched_domain *sd; ++ ++ rcu_read_lock(); ++ for_each_domain(rq->cpu, sd) { ++ if (cpumask_test_cpu(cpu, sched_domain_span(sd))) { ++ __schedstat_inc(sd->ttwu_wake_remote); ++ break; ++ } ++ } ++ rcu_read_unlock(); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ __schedstat_inc(rq->ttwu_count); ++} ++ ++static inline void ttwu_activate(struct rq *rq, struct task_struct *p) ++{ ++ activate_task(p, rq); ++ ++ /* if a worker is waking up, notify the workqueue */ ++ if (p->flags & PF_WQ_WORKER) ++ wq_worker_waking_up(p, cpu_of(rq)); ++} ++ ++/* ++ * Mark the task runnable and perform wakeup-preemption. ++ */ ++static void ttwu_do_wakeup(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ /* ++ * Sync wakeups (i.e. those types of wakeups where the waker ++ * has indicated that it will leave the CPU in short order) ++ * don't trigger a preemption if there are no idle cpus, ++ * instead waiting for current to deschedule. ++ */ ++ if (wake_flags & WF_SYNC) ++ resched_suitable_idle(p); ++ else ++ try_preempt(p, rq); ++ p->state = TASK_RUNNING; ++ trace_sched_wakeup(p); ++} ++ ++static void ++ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags) ++{ ++ lockdep_assert_held(rq->lock); ++ ++#ifdef CONFIG_SMP ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible--; ++#endif ++ ++ ttwu_activate(rq, p); ++ ttwu_do_wakeup(rq, p, wake_flags); ++} ++ ++/* ++ * Called in case the task @p isn't fully descheduled from its runqueue, ++ * in this case we must do a remote wakeup. Its a 'light' wakeup though, ++ * since all we need to do is flip p->state to TASK_RUNNING, since ++ * the task is still ->on_rq. ++ */ ++static int ttwu_remote(struct task_struct *p, int wake_flags) ++{ ++ struct rq *rq; ++ int ret = 0; ++ ++ rq = __task_rq_lock(p); ++ if (likely(task_on_rq_queued(p))) { ++ ttwu_do_wakeup(rq, p, wake_flags); ++ ret = 1; ++ } ++ __task_rq_unlock(rq); ++ ++ return ret; ++} ++ ++#ifdef CONFIG_SMP ++void sched_ttwu_pending(void) ++{ ++ struct rq *rq = this_rq(); ++ struct llist_node *llist = llist_del_all(&rq->wake_list); ++ struct task_struct *p, *t; ++ unsigned long flags; ++ ++ if (!llist) ++ return; ++ ++ rq_lock_irqsave(rq, &flags); ++ ++ llist_for_each_entry_safe(p, t, llist, wake_entry) ++ ttwu_do_activate(rq, p, 0); ++ ++ rq_unlock_irqrestore(rq, &flags); ++} ++ ++void scheduler_ipi(void) ++{ ++ /* ++ * Fold TIF_NEED_RESCHED into the preempt_count; anybody setting ++ * TIF_NEED_RESCHED remotely (for the first time) will also send ++ * this IPI. ++ */ ++ preempt_fold_need_resched(); ++ ++ if (llist_empty(&this_rq()->wake_list) && (!idle_cpu(smp_processor_id()) || need_resched())) ++ return; ++ ++ /* ++ * Not all reschedule IPI handlers call irq_enter/irq_exit, since ++ * traditionally all their work was done from the interrupt return ++ * path. Now that we actually do some work, we need to make sure ++ * we do call them. ++ * ++ * Some archs already do call them, luckily irq_enter/exit nest ++ * properly. ++ * ++ * Arguably we should visit all archs and update all handlers, ++ * however a fair share of IPIs are still resched only so this would ++ * somewhat pessimize the simple resched case. ++ */ ++ irq_enter(); ++ sched_ttwu_pending(); ++ irq_exit(); ++} ++ ++static void ttwu_queue_remote(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (llist_add(&p->wake_entry, &cpu_rq(cpu)->wake_list)) { ++ if (!set_nr_if_polling(rq->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void wake_up_if_idle(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rcu_read_lock(); ++ ++ if (!is_idle_task(rcu_dereference(rq->curr))) ++ goto out; ++ ++ if (set_nr_if_polling(rq->idle)) { ++ trace_sched_wake_idle_without_ipi(cpu); ++ } else { ++ rq_lock_irqsave(rq, &flags); ++ if (likely(is_idle_task(rq->curr))) ++ smp_sched_reschedule(cpu); ++ /* Else cpu is not in idle, do nothing here */ ++ rq_unlock_irqrestore(rq, &flags); ++ } ++ ++out: ++ rcu_read_unlock(); ++} ++ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ cpumask_t valid_mask; ++ ++ if (p->flags & PF_KTHREAD) ++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_all_mask); ++ else ++ cpumask_and(&valid_mask, &p->cpus_allowed, cpu_active_mask); ++ ++ if (unlikely(!cpumask_weight(&valid_mask))) { ++ /* We shouldn't be hitting this any more */ ++ printk(KERN_WARNING "SCHED: No cpumask for %s/%d weight %d\n", p->comm, ++ p->pid, cpumask_weight(&p->cpus_allowed)); ++ return cpumask_any(&p->cpus_allowed); ++ } ++ return cpumask_any(&valid_mask); ++} ++ ++/* ++ * For a task that's just being woken up we have a valuable balancing ++ * opportunity so choose the nearest cache most lightly loaded runqueue. ++ * Entered with rq locked and returns with the chosen runqueue locked. ++ */ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ unsigned int idlest = ~0U; ++ struct rq *rq = NULL; ++ int i; ++ ++ if (suitable_idle_cpus(p)) { ++ int cpu = task_cpu(p); ++ ++ if (unlikely(needs_other_cpu(p, cpu))) ++ cpu = valid_task_cpu(p); ++ rq = resched_best_idle(p, cpu); ++ if (likely(rq)) ++ return rq->cpu; ++ } ++ ++ for (i = 0; i < num_possible_cpus(); i++) { ++ struct rq *other_rq = task_rq(p)->cpu_order[i]; ++ int entries; ++ ++ if (!other_rq->online) ++ continue; ++ if (needs_other_cpu(p, other_rq->cpu)) ++ continue; ++ entries = rq_load(other_rq); ++ if (entries >= idlest) ++ continue; ++ idlest = entries; ++ rq = other_rq; ++ } ++ if (unlikely(!rq)) ++ return task_cpu(p); ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static int valid_task_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static inline int select_best_cpu(struct task_struct *p) ++{ ++ return 0; ++} ++ ++static struct rq *resched_best_idle(struct task_struct *p, int cpu) ++{ ++ return NULL; ++} ++#endif /* CONFIG_SMP */ ++ ++static void ttwu_queue(struct task_struct *p, int cpu, int wake_flags) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ ++#if defined(CONFIG_SMP) ++ if (!cpus_share_cache(smp_processor_id(), cpu)) { ++ sched_clock_cpu(cpu); /* Sync clocks across CPUs */ ++ ttwu_queue_remote(p, cpu, wake_flags); ++ return; ++ } ++#endif ++ rq_lock(rq); ++ ttwu_do_activate(rq, p, wake_flags); ++ rq_unlock(rq); ++} ++ ++/*** ++ * try_to_wake_up - wake up a thread ++ * @p: the thread to be awakened ++ * @state: the mask of task states that can be woken ++ * @wake_flags: wake modifier flags (WF_*) ++ * ++ * Put it on the run-queue if it's not already there. The "current" ++ * thread is always on the run-queue (except when the actual ++ * re-schedule is in progress), and as such you're allowed to do ++ * the simpler "current->state = TASK_RUNNING" to mark yourself ++ * runnable without the overhead of this. ++ * ++ * Return: %true if @p was woken up, %false if it was already running. ++ * or @state didn't match @p's state. ++ */ ++static int ++try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) ++{ ++ unsigned long flags; ++ int cpu, success = 0; ++ ++ /* ++ * If we are going to wake up a thread waiting for CONDITION we ++ * need to ensure that CONDITION=1 done by the caller can not be ++ * reordered with p->state check below. This pairs with mb() in ++ * set_current_state() the waiting thread does. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ smp_mb__after_spinlock(); ++ /* state is a volatile long, どうして、分からない */ ++ if (!((unsigned int)p->state & state)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ /* We're going to change ->state: */ ++ success = 1; ++ cpu = task_cpu(p); ++ ++ /* ++ * Ensure we load p->on_rq _after_ p->state, otherwise it would ++ * be possible to, falsely, observe p->on_rq == 0 and get stuck ++ * in smp_cond_load_acquire() below. ++ * ++ * sched_ttwu_pending() try_to_wake_up() ++ * STORE p->on_rq = 1 LOAD p->state ++ * UNLOCK rq->lock ++ * ++ * __schedule() (switch to task 'p') ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * UNLOCK rq->lock ++ * ++ * [task p] ++ * STORE p->state = UNINTERRUPTIBLE LOAD p->on_rq ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ if (p->on_rq && ttwu_remote(p, wake_flags)) ++ goto stat; ++ ++#ifdef CONFIG_SMP ++ /* ++ * Ensure we load p->on_cpu _after_ p->on_rq, otherwise it would be ++ * possible to, falsely, observe p->on_cpu == 0. ++ * ++ * One must be running (->on_cpu == 1) in order to remove oneself ++ * from the runqueue. ++ * ++ * __schedule() (switch to task 'p') try_to_wake_up() ++ * STORE p->on_cpu = 1 LOAD p->on_rq ++ * UNLOCK rq->lock ++ * ++ * __schedule() (put 'p' to sleep) ++ * LOCK rq->lock smp_rmb(); ++ * smp_mb__after_spinlock(); ++ * STORE p->on_rq = 0 LOAD p->on_cpu ++ * ++ * Pairs with the LOCK+smp_mb__after_spinlock() on rq->lock in ++ * __schedule(). See the comment for smp_mb__after_spinlock(). ++ */ ++ smp_rmb(); ++ ++ /* ++ * If the owning (remote) CPU is still in the middle of schedule() with ++ * this task as prev, wait until its done referencing the task. ++ * ++ * Pairs with the smp_store_release() in finish_task(). ++ * ++ * This ensures that tasks getting woken will be fully ordered against ++ * their previous state and preserve Program Order. ++ */ ++ smp_cond_load_acquire(&p->on_cpu, !VAL); ++ ++ p->sched_contributes_to_load = !!task_contributes_to_load(p); ++ p->state = TASK_WAKING; ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++ cpu = select_best_cpu(p); ++ if (task_cpu(p) != cpu) ++ set_task_cpu(p, cpu); ++ ++#else /* CONFIG_SMP */ ++ ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&task_rq(p)->nr_iowait); ++ } ++ ++#endif /* CONFIG_SMP */ ++ ++ ttwu_queue(p, cpu, wake_flags); ++stat: ++ ttwu_stat(p, cpu, wake_flags); ++out: ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++ return success; ++} ++ ++/** ++ * try_to_wake_up_local - try to wake up a local task with rq lock held ++ * @p: the thread to be awakened ++ * ++ * Put @p on the run-queue if it's not already there. The caller must ++ * ensure that rq is locked and, @p is not the current task. ++ * rq stays locked over invocation. ++ */ ++static void try_to_wake_up_local(struct task_struct *p) ++{ ++ struct rq *rq = task_rq(p); ++ ++ if (WARN_ON_ONCE(rq != this_rq()) || ++ WARN_ON_ONCE(p == current)) ++ return; ++ ++ lockdep_assert_held(rq->lock); ++ ++ if (!raw_spin_trylock(&p->pi_lock)) { ++ /* ++ * This is OK, because current is on_cpu, which avoids it being ++ * picked for load-balance and preemption/IRQs are still ++ * disabled avoiding further scheduler activity on it and we've ++ * not yet picked a replacement task. ++ */ ++ rq_unlock(rq); ++ raw_spin_lock(&p->pi_lock); ++ rq_lock(rq); ++ } ++ ++ if (!(p->state & TASK_NORMAL)) ++ goto out; ++ ++ trace_sched_waking(p); ++ ++ if (!task_on_rq_queued(p)) { ++ if (p->in_iowait) { ++ delayacct_blkio_end(p); ++ atomic_dec(&rq->nr_iowait); ++ } ++ ttwu_activate(rq, p); ++ } ++ ++ ttwu_do_wakeup(rq, p, 0); ++ ttwu_stat(p, smp_processor_id(), 0); ++out: ++ raw_spin_unlock(&p->pi_lock); ++} ++ ++/** ++ * wake_up_process - Wake up a specific process ++ * @p: The process to be woken up. ++ * ++ * Attempt to wake up the nominated process and move it to the set of runnable ++ * processes. ++ * ++ * Return: 1 if the process was woken up, 0 if it was already running. ++ * ++ * This function executes a full memory barrier before accessing the task state. ++ */ ++int wake_up_process(struct task_struct *p) ++{ ++ return try_to_wake_up(p, TASK_NORMAL, 0); ++} ++EXPORT_SYMBOL(wake_up_process); ++ ++int wake_up_state(struct task_struct *p, unsigned int state) ++{ ++ return try_to_wake_up(p, state, 0); ++} ++ ++static void time_slice_expired(struct task_struct *p, struct rq *rq); ++ ++/* ++ * Perform scheduler related setup for a newly forked process p. ++ * p is forked by current. ++ */ ++int sched_fork(unsigned long __maybe_unused clone_flags, struct task_struct *p) ++{ ++ unsigned long flags; ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ INIT_HLIST_HEAD(&p->preempt_notifiers); ++#endif ++ /* ++ * We mark the process as NEW here. This guarantees that ++ * nobody will actually run it, and a signal or other external ++ * event cannot wake it up and insert it on the runqueue either. ++ */ ++ p->state = TASK_NEW; ++ ++ /* ++ * The process state is set to the same value of the process executing ++ * do_fork() code. That is running. This guarantees that nobody will ++ * actually run it, and a signal or other external event cannot wake ++ * it up and insert it on the runqueue either. ++ */ ++ ++ /* Should be reset in fork.c but done here for ease of MuQSS patching */ ++ p->on_cpu = ++ p->on_rq = ++ p->utime = ++ p->stime = ++ p->sched_time = ++ p->stime_ns = ++ p->utime_ns = 0; ++ skiplist_node_init(&p->node); ++ ++ /* ++ * Revert to default priority/policy on fork if requested. ++ */ ++ if (unlikely(p->sched_reset_on_fork)) { ++ if (p->policy == SCHED_FIFO || p->policy == SCHED_RR) { ++ p->policy = SCHED_NORMAL; ++ p->normal_prio = normal_prio(p); ++ } ++ ++ if (PRIO_TO_NICE(p->static_prio) < 0) { ++ p->static_prio = NICE_TO_PRIO(0); ++ p->normal_prio = p->static_prio; ++ } ++ ++ /* ++ * We don't need the reset flag anymore after the fork. It has ++ * fulfilled its duty: ++ */ ++ p->sched_reset_on_fork = 0; ++ } ++ ++ /* ++ * Silence PROVE_RCU. ++ */ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ set_task_cpu(p, smp_processor_id()); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++#ifdef CONFIG_SCHED_INFO ++ if (unlikely(sched_info_on())) ++ memset(&p->sched_info, 0, sizeof(p->sched_info)); ++#endif ++ init_task_preempt_count(p); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SCHEDSTATS ++ ++DEFINE_STATIC_KEY_FALSE(sched_schedstats); ++static bool __initdata __sched_schedstats = false; ++ ++static void set_schedstats(bool enabled) ++{ ++ if (enabled) ++ static_branch_enable(&sched_schedstats); ++ else ++ static_branch_disable(&sched_schedstats); ++} ++ ++void force_schedstat_enabled(void) ++{ ++ if (!schedstat_enabled()) { ++ pr_info("kernel profiling enabled schedstats, disable via kernel.sched_schedstats.\n"); ++ static_branch_enable(&sched_schedstats); ++ } ++} ++ ++static int __init setup_schedstats(char *str) ++{ ++ int ret = 0; ++ if (!str) ++ goto out; ++ ++ /* ++ * This code is called before jump labels have been set up, so we can't ++ * change the static branch directly just yet. Instead set a temporary ++ * variable so init_schedstats() can do it later. ++ */ ++ if (!strcmp(str, "enable")) { ++ __sched_schedstats = true; ++ ret = 1; ++ } else if (!strcmp(str, "disable")) { ++ __sched_schedstats = false; ++ ret = 1; ++ } ++out: ++ if (!ret) ++ pr_warn("Unable to parse schedstats=\n"); ++ ++ return ret; ++} ++__setup("schedstats=", setup_schedstats); ++ ++static void __init init_schedstats(void) ++{ ++ set_schedstats(__sched_schedstats); ++} ++ ++#ifdef CONFIG_PROC_SYSCTL ++int sysctl_schedstats(struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ struct ctl_table t; ++ int err; ++ int state = static_branch_likely(&sched_schedstats); ++ ++ if (write && !capable(CAP_SYS_ADMIN)) ++ return -EPERM; ++ ++ t = *table; ++ t.data = &state; ++ err = proc_dointvec_minmax(&t, write, buffer, lenp, ppos); ++ if (err < 0) ++ return err; ++ if (write) ++ set_schedstats(state); ++ return err; ++} ++#endif /* CONFIG_PROC_SYSCTL */ ++#else /* !CONFIG_SCHEDSTATS */ ++static inline void init_schedstats(void) {} ++#endif /* CONFIG_SCHEDSTATS */ ++ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p); ++ ++static void account_task_cpu(struct rq *rq, struct task_struct *p) ++{ ++ update_clocks(rq); ++ /* This isn't really a context switch but accounting is the same */ ++ update_cpu_clock_switch(rq, p); ++ p->last_ran = rq->niffies; ++} ++ ++bool sched_smp_initialized __read_mostly; ++ ++static inline int hrexpiry_enabled(struct rq *rq) ++{ ++ if (unlikely(!cpu_active(cpu_of(rq)) || !sched_smp_initialized)) ++ return 0; ++ return hrtimer_is_hres_active(&rq->hrexpiry_timer); ++} ++ ++/* ++ * Use HR-timers to deliver accurate preemption points. ++ */ ++static inline void hrexpiry_clear(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (hrtimer_active(&rq->hrexpiry_timer)) ++ hrtimer_cancel(&rq->hrexpiry_timer); ++} ++ ++/* ++ * High-resolution time_slice expiry. ++ * Runs from hardirq context with interrupts disabled. ++ */ ++static enum hrtimer_restart hrexpiry(struct hrtimer *timer) ++{ ++ struct rq *rq = container_of(timer, struct rq, hrexpiry_timer); ++ struct task_struct *p; ++ ++ /* This can happen during CPU hotplug / resume */ ++ if (unlikely(cpu_of(rq) != smp_processor_id())) ++ goto out; ++ ++ /* ++ * We're doing this without the runqueue lock but this should always ++ * be run on the local CPU. Time slice should run out in __schedule ++ * but we set it to zero here in case niffies is slightly less. ++ */ ++ p = rq->curr; ++ p->time_slice = 0; ++ __set_tsk_resched(p); ++out: ++ return HRTIMER_NORESTART; ++} ++ ++/* ++ * Called to set the hrexpiry timer state. ++ * ++ * called with irqs disabled from the local CPU only ++ */ ++static void hrexpiry_start(struct rq *rq, u64 delay) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ ++ hrtimer_start(&rq->hrexpiry_timer, ns_to_ktime(delay), ++ HRTIMER_MODE_REL_PINNED); ++} ++ ++static void init_rq_hrexpiry(struct rq *rq) ++{ ++ hrtimer_init(&rq->hrexpiry_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ rq->hrexpiry_timer.function = hrexpiry; ++} ++ ++static inline int rq_dither(struct rq *rq) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return HALF_JIFFY_US; ++ return 0; ++} ++ ++/* ++ * wake_up_new_task - wake up a newly created task for the first time. ++ * ++ * This function will do some initial scheduler statistics housekeeping ++ * that must be done for every newly created context, then puts the task ++ * on the runqueue and wakes it. ++ */ ++void wake_up_new_task(struct task_struct *p) ++{ ++ struct task_struct *parent, *rq_curr; ++ struct rq *rq, *new_rq; ++ unsigned long flags; ++ ++ parent = p->parent; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ p->state = TASK_RUNNING; ++ /* Task_rq can't change yet on a new task */ ++ new_rq = rq = task_rq(p); ++ if (unlikely(needs_other_cpu(p, task_cpu(p)))) { ++ set_task_cpu(p, valid_task_cpu(p)); ++ new_rq = task_rq(p); ++ } ++ ++ double_rq_lock(rq, new_rq); ++ rq_curr = rq->curr; ++ ++ /* ++ * Make sure we do not leak PI boosting priority to the child. ++ */ ++ p->prio = rq_curr->normal_prio; ++ ++ trace_sched_wakeup_new(p); ++ ++ /* ++ * Share the timeslice between parent and child, thus the ++ * total amount of pending timeslices in the system doesn't change, ++ * resulting in more scheduling fairness. If it's negative, it won't ++ * matter since that's the same as being 0. rq->rq_deadline is only ++ * modified within schedule() so it is always equal to ++ * current->deadline. ++ */ ++ account_task_cpu(rq, rq_curr); ++ p->last_ran = rq_curr->last_ran; ++ if (likely(rq_curr->policy != SCHED_FIFO)) { ++ rq_curr->time_slice /= 2; ++ if (rq_curr->time_slice < RESCHED_US) { ++ /* ++ * Forking task has run out of timeslice. Reschedule it and ++ * start its child with a new time slice and deadline. The ++ * child will end up running first because its deadline will ++ * be slightly earlier. ++ */ ++ __set_tsk_resched(rq_curr); ++ time_slice_expired(p, new_rq); ++ if (suitable_idle_cpus(p)) ++ resched_best_idle(p, task_cpu(p)); ++ else if (unlikely(rq != new_rq)) ++ try_preempt(p, new_rq); ++ } else { ++ p->time_slice = rq_curr->time_slice; ++ if (rq_curr == parent && rq == new_rq && !suitable_idle_cpus(p)) { ++ /* ++ * The VM isn't cloned, so we're in a good position to ++ * do child-runs-first in anticipation of an exec. This ++ * usually avoids a lot of COW overhead. ++ */ ++ __set_tsk_resched(rq_curr); ++ } else { ++ /* ++ * Adjust the hrexpiry since rq_curr will keep ++ * running and its timeslice has been shortened. ++ */ ++ hrexpiry_start(rq, US_TO_NS(rq_curr->time_slice)); ++ try_preempt(p, new_rq); ++ } ++ } ++ } else { ++ time_slice_expired(p, new_rq); ++ try_preempt(p, new_rq); ++ } ++ activate_task(p, new_rq); ++ double_rq_unlock(rq, new_rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++} ++ ++#ifdef CONFIG_PREEMPT_NOTIFIERS ++ ++static DEFINE_STATIC_KEY_FALSE(preempt_notifier_key); ++ ++void preempt_notifier_inc(void) ++{ ++ static_branch_inc(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_inc); ++ ++void preempt_notifier_dec(void) ++{ ++ static_branch_dec(&preempt_notifier_key); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_dec); ++ ++/** ++ * preempt_notifier_register - tell me when current is being preempted & rescheduled ++ * @notifier: notifier struct to register ++ */ ++void preempt_notifier_register(struct preempt_notifier *notifier) ++{ ++ if (!static_branch_unlikely(&preempt_notifier_key)) ++ WARN(1, "registering preempt_notifier while notifiers disabled\n"); ++ ++ hlist_add_head(¬ifier->link, ¤t->preempt_notifiers); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_register); ++ ++/** ++ * preempt_notifier_unregister - no longer interested in preemption notifications ++ * @notifier: notifier struct to unregister ++ * ++ * This is *not* safe to call from within a preemption notifier. ++ */ ++void preempt_notifier_unregister(struct preempt_notifier *notifier) ++{ ++ hlist_del(¬ifier->link); ++} ++EXPORT_SYMBOL_GPL(preempt_notifier_unregister); ++ ++static void __fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_in(notifier, raw_smp_processor_id()); ++} ++ ++static __always_inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_in_preempt_notifiers(curr); ++} ++ ++static void ++__fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ struct preempt_notifier *notifier; ++ ++ hlist_for_each_entry(notifier, &curr->preempt_notifiers, link) ++ notifier->ops->sched_out(notifier, next); ++} ++ ++static __always_inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++ if (static_branch_unlikely(&preempt_notifier_key)) ++ __fire_sched_out_preempt_notifiers(curr, next); ++} ++ ++#else /* !CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void fire_sched_in_preempt_notifiers(struct task_struct *curr) ++{ ++} ++ ++static inline void ++fire_sched_out_preempt_notifiers(struct task_struct *curr, ++ struct task_struct *next) ++{ ++} ++ ++#endif /* CONFIG_PREEMPT_NOTIFIERS */ ++ ++static inline void prepare_task(struct task_struct *next) ++{ ++ /* ++ * Claim the task as running, we do this before switching to it ++ * such that any running task will have this set. ++ */ ++ next->on_cpu = 1; ++} ++ ++static inline void finish_task(struct task_struct *prev) ++{ ++#ifdef CONFIG_SMP ++ /* ++ * After ->on_cpu is cleared, the task can be moved to a different CPU. ++ * We must ensure this doesn't happen until the switch is completely ++ * finished. ++ * ++ * In particular, the load of prev->state in finish_task_switch() must ++ * happen before this. ++ * ++ * Pairs with the smp_cond_load_acquire() in try_to_wake_up(). ++ */ ++ smp_store_release(&prev->on_cpu, 0); ++#endif ++} ++ ++static inline void ++prepare_lock_switch(struct rq *rq, struct task_struct *next) ++{ ++ /* ++ * Since the runqueue lock will be released by the next ++ * task (which is an invalid locking op but in the case ++ * of the scheduler it's an obvious special-case), so we ++ * do an early lockdep release here: ++ */ ++ spin_release(&rq->lock.dep_map, 1, _THIS_IP_); ++#ifdef CONFIG_DEBUG_SPINLOCK ++ /* this is a valid case when another task releases the spinlock */ ++ rq->lock.owner = next; ++#endif ++} ++ ++static inline void finish_lock_switch(struct rq *rq, struct task_struct *prev) ++{ ++ /* ++ * If we are tracking spinlock dependencies then we have to ++ * fix up the runqueue lock - which gets 'carried over' from ++ * prev into current: ++ */ ++ spin_acquire(&rq->lock.dep_map, 0, 0, _THIS_IP_); ++ ++#ifdef CONFIG_SMP ++ /* ++ * If prev was marked as migrating to another CPU in return_task, drop ++ * the local runqueue lock but leave interrupts disabled and grab the ++ * remote lock we're migrating it to before enabling them. ++ */ ++ if (unlikely(task_on_rq_migrating(prev))) { ++ sched_info_dequeued(rq, prev); ++ /* ++ * We move the ownership of prev to the new cpu now. ttwu can't ++ * activate prev to the wrong cpu since it has to grab this ++ * runqueue in ttwu_remote. ++ */ ++#ifdef CONFIG_THREAD_INFO_IN_TASK ++ prev->cpu = prev->wake_cpu; ++#else ++ task_thread_info(prev)->cpu = prev->wake_cpu; ++#endif ++ raw_spin_unlock(rq->lock); ++ ++ raw_spin_lock(&prev->pi_lock); ++ rq = __task_rq_lock(prev); ++ /* Check that someone else hasn't already queued prev */ ++ if (likely(!task_queued(prev))) { ++ enqueue_task(rq, prev, 0); ++ prev->on_rq = TASK_ON_RQ_QUEUED; ++ /* Wake up the CPU if it's not already running */ ++ resched_if_idle(rq); ++ } ++ raw_spin_unlock(&prev->pi_lock); ++ } ++#endif ++ rq_unlock(rq); ++ ++ do_pending_softirq(rq, current); ++ ++ local_irq_enable(); ++} ++ ++#ifndef prepare_arch_switch ++# define prepare_arch_switch(next) do { } while (0) ++#endif ++#ifndef finish_arch_switch ++# define finish_arch_switch(prev) do { } while (0) ++#endif ++#ifndef finish_arch_post_lock_switch ++# define finish_arch_post_lock_switch() do { } while (0) ++#endif ++ ++/** ++ * prepare_task_switch - prepare to switch tasks ++ * @rq: the runqueue preparing to switch ++ * @next: the task we are going to switch to. ++ * ++ * This is called with the rq lock held and interrupts off. It must ++ * be paired with a subsequent finish_task_switch after the context ++ * switch. ++ * ++ * prepare_task_switch sets up locking and calls architecture specific ++ * hooks. ++ */ ++static inline void ++prepare_task_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ kcov_prepare_switch(prev); ++ sched_info_switch(rq, prev, next); ++ perf_event_task_sched_out(prev, next); ++ rseq_preempt(prev); ++ fire_sched_out_preempt_notifiers(prev, next); ++ prepare_task(next); ++ prepare_arch_switch(next); ++} ++ ++/** ++ * finish_task_switch - clean up after a task-switch ++ * @rq: runqueue associated with task-switch ++ * @prev: the thread we just switched away from. ++ * ++ * finish_task_switch must be called after the context switch, paired ++ * with a prepare_task_switch call before the context switch. ++ * finish_task_switch will reconcile locking set up by prepare_task_switch, ++ * and do any other architecture-specific cleanup actions. ++ * ++ * Note that we may have delayed dropping an mm in context_switch(). If ++ * so, we finish that here outside of the runqueue lock. (Doing it ++ * with the lock held can cause deadlocks; see schedule() for ++ * details.) ++ * ++ * The context switch have flipped the stack from under us and restored the ++ * local variables which were saved when this task called schedule() in the ++ * past. prev == current is still correct but we need to recalculate this_rq ++ * because prev may have moved to another CPU. ++ */ ++static void finish_task_switch(struct task_struct *prev) ++ __releases(rq->lock) ++{ ++ struct rq *rq = this_rq(); ++ struct mm_struct *mm = rq->prev_mm; ++ long prev_state; ++ ++ /* ++ * The previous task will have left us with a preempt_count of 2 ++ * because it left us after: ++ * ++ * schedule() ++ * preempt_disable(); // 1 ++ * __schedule() ++ * raw_spin_lock_irq(rq->lock) // 2 ++ * ++ * Also, see FORK_PREEMPT_COUNT. ++ */ ++ if (WARN_ONCE(preempt_count() != 2*PREEMPT_DISABLE_OFFSET, ++ "corrupted preempt_count: %s/%d/0x%x\n", ++ current->comm, current->pid, preempt_count())) ++ preempt_count_set(FORK_PREEMPT_COUNT); ++ ++ rq->prev_mm = NULL; ++ ++ /* ++ * A task struct has one reference for the use as "current". ++ * If a task dies, then it sets TASK_DEAD in tsk->state and calls ++ * schedule one last time. The schedule call will never return, and ++ * the scheduled task must drop that reference. ++ * ++ * We must observe prev->state before clearing prev->on_cpu (in ++ * finish_task), otherwise a concurrent wakeup can get prev ++ * running on another CPU and we could rave with its RUNNING -> DEAD ++ * transition, resulting in a double drop. ++ */ ++ prev_state = prev->state; ++ vtime_task_switch(prev); ++ perf_event_task_sched_in(prev, current); ++ finish_task(prev); ++ finish_lock_switch(rq, prev); ++ finish_arch_post_lock_switch(); ++ kcov_finish_switch(current); ++ ++ fire_sched_in_preempt_notifiers(current); ++ /* ++ * When switching through a kernel thread, the loop in ++ * membarrier_{private,global}_expedited() may have observed that ++ * kernel thread and not issued an IPI. It is therefore possible to ++ * schedule between user->kernel->user threads without passing though ++ * switch_mm(). Membarrier requires a barrier after storing to ++ * rq->curr, before returning to userspace, so provide them here: ++ * ++ * - a full memory barrier for {PRIVATE,GLOBAL}_EXPEDITED, implicitly ++ * provided by mmdrop(), ++ * - a sync_core for SYNC_CORE. ++ */ ++ if (mm) { ++ membarrier_mm_sync_core_before_usermode(mm); ++ mmdrop(mm); ++ } ++ if (unlikely(prev_state == TASK_DEAD)) { ++ /* ++ * Remove function-return probe instances associated with this ++ * task and put them back on the free list. ++ */ ++ kprobe_flush_task(prev); ++ ++ /* Task is done with its stack. */ ++ put_task_stack(prev); ++ ++ put_task_struct(prev); ++ } ++} ++ ++/** ++ * schedule_tail - first thing a freshly forked thread must call. ++ * @prev: the thread we just switched away from. ++ */ ++asmlinkage __visible void schedule_tail(struct task_struct *prev) ++{ ++ /* ++ * New tasks start with FORK_PREEMPT_COUNT, see there and ++ * finish_task_switch() for details. ++ * ++ * finish_task_switch() will drop rq->lock() and lower preempt_count ++ * and the preempt_enable() will end up enabling preemption (on ++ * PREEMPT_COUNT kernels). ++ */ ++ ++ finish_task_switch(prev); ++ preempt_enable(); ++ ++ if (current->set_child_tid) ++ put_user(task_pid_vnr(current), current->set_child_tid); ++ ++ calculate_sigpending(); ++} ++ ++/* ++ * context_switch - switch to the new MM and the new thread's register state. ++ */ ++static __always_inline void ++context_switch(struct rq *rq, struct task_struct *prev, ++ struct task_struct *next) ++{ ++ struct mm_struct *mm, *oldmm; ++ ++ prepare_task_switch(rq, prev, next); ++ ++ mm = next->mm; ++ oldmm = prev->active_mm; ++ /* ++ * For paravirt, this is coupled with an exit in switch_to to ++ * combine the page table reload and the switch backend into ++ * one hypercall. ++ */ ++ arch_start_context_switch(prev); ++ ++ /* ++ * If mm is non-NULL, we pass through switch_mm(). If mm is ++ * NULL, we will pass through mmdrop() in finish_task_switch(). ++ * Both of these contain the full memory barrier required by ++ * membarrier after storing to rq->curr, before returning to ++ * user-space. ++ */ ++ if (!mm) { ++ next->active_mm = oldmm; ++ mmgrab(oldmm); ++ enter_lazy_tlb(oldmm, next); ++ } else ++ switch_mm_irqs_off(oldmm, mm, next); ++ ++ if (!prev->mm) { ++ prev->active_mm = NULL; ++ rq->prev_mm = oldmm; ++ } ++ prepare_lock_switch(rq, next); ++ ++ /* Here we just switch the register state and the stack. */ ++ switch_to(prev, next, prev); ++ barrier(); ++ ++ finish_task_switch(prev); ++} ++ ++/* ++ * nr_running, nr_uninterruptible and nr_context_switches: ++ * ++ * externally visible scheduler statistics: current number of runnable ++ * threads, total number of context switches performed since bootup. ++ */ ++unsigned long nr_running(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_running; ++ ++ return sum; ++} ++ ++static unsigned long nr_uninterruptible(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_online_cpu(i) ++ sum += cpu_rq(i)->nr_uninterruptible; ++ ++ return sum; ++} ++ ++/* ++ * Check if only the current task is running on the CPU. ++ * ++ * Caution: this function does not check that the caller has disabled ++ * preemption, thus the result might have a time-of-check-to-time-of-use ++ * race. The caller is responsible to use it correctly, for example: ++ * ++ * - from a non-preemptable section (of course) ++ * ++ * - from a thread that is bound to a single CPU ++ * ++ * - in a loop with very short iterations (e.g. a polling loop) ++ */ ++bool single_task_running(void) ++{ ++ struct rq *rq = cpu_rq(smp_processor_id()); ++ ++ if (rq_load(rq) == 1) ++ return true; ++ else ++ return false; ++} ++EXPORT_SYMBOL(single_task_running); ++ ++unsigned long long nr_context_switches(void) ++{ ++ int i; ++ unsigned long long sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += cpu_rq(i)->nr_switches; ++ ++ return sum; ++} ++ ++/* ++ * IO-wait accounting, and how its mostly bollocks (on SMP). ++ * ++ * The idea behind IO-wait account is to account the idle time that we could ++ * have spend running if it were not for IO. That is, if we were to improve the ++ * storage performance, we'd have a proportional reduction in IO-wait time. ++ * ++ * This all works nicely on UP, where, when a task blocks on IO, we account ++ * idle time as IO-wait, because if the storage were faster, it could've been ++ * running and we'd not be idle. ++ * ++ * This has been extended to SMP, by doing the same for each CPU. This however ++ * is broken. ++ * ++ * Imagine for instance the case where two tasks block on one CPU, only the one ++ * CPU will have IO-wait accounted, while the other has regular idle. Even ++ * though, if the storage were faster, both could've ran at the same time, ++ * utilising both CPUs. ++ * ++ * This means, that when looking globally, the current IO-wait accounting on ++ * SMP is a lower bound, by reason of under accounting. ++ * ++ * Worse, since the numbers are provided per CPU, they are sometimes ++ * interpreted per CPU, and that is nonsensical. A blocked task isn't strictly ++ * associated with any one particular CPU, it can wake to another CPU than it ++ * blocked on. This means the per CPU IO-wait number is meaningless. ++ * ++ * Task CPU affinities can make all that even more 'interesting'. ++ */ ++ ++unsigned long nr_iowait(void) ++{ ++ unsigned long i, sum = 0; ++ ++ for_each_possible_cpu(i) ++ sum += atomic_read(&cpu_rq(i)->nr_iowait); ++ ++ return sum; ++} ++ ++/* ++ * Consumers of these two interfaces, like for example the cpufreq menu ++ * governor are using nonsensical data. Boosting frequency for a CPU that has ++ * IO-wait which might not even end up running the task when it does become ++ * runnable. ++ */ ++ ++unsigned long nr_iowait_cpu(int cpu) ++{ ++ struct rq *this = cpu_rq(cpu); ++ return atomic_read(&this->nr_iowait); ++} ++ ++unsigned long nr_active(void) ++{ ++ return nr_running() + nr_uninterruptible(); ++} ++ ++/* ++ * I/O wait is the number of running or queued tasks with their ->rq pointer ++ * set to this cpu as being the CPU they're more likely to run on. ++ */ ++void get_iowait_load(unsigned long *nr_waiters, unsigned long *load) ++{ ++ struct rq *rq = this_rq(); ++ ++ *nr_waiters = atomic_read(&rq->nr_iowait); ++ *load = rq_load(rq); ++} ++ ++/* Variables and functions for calc_load */ ++static unsigned long calc_load_update; ++unsigned long avenrun[3]; ++EXPORT_SYMBOL(avenrun); ++ ++/** ++ * get_avenrun - get the load average array ++ * @loads: pointer to dest load array ++ * @offset: offset to add ++ * @shift: shift count to shift the result left ++ * ++ * These values are estimates at best, so no need for locking. ++ */ ++void get_avenrun(unsigned long *loads, unsigned long offset, int shift) ++{ ++ loads[0] = (avenrun[0] + offset) << shift; ++ loads[1] = (avenrun[1] + offset) << shift; ++ loads[2] = (avenrun[2] + offset) << shift; ++} ++ ++static unsigned long ++calc_load(unsigned long load, unsigned long exp, unsigned long active) ++{ ++ unsigned long newload; ++ ++ newload = load * exp + active * (FIXED_1 - exp); ++ if (active >= load) ++ newload += FIXED_1-1; ++ ++ return newload / FIXED_1; ++} ++ ++/* ++ * calc_load - update the avenrun load estimates every LOAD_FREQ seconds. ++ */ ++void calc_global_load(unsigned long ticks) ++{ ++ long active; ++ ++ if (time_before(jiffies, READ_ONCE(calc_load_update))) ++ return; ++ active = nr_active() * FIXED_1; ++ ++ avenrun[0] = calc_load(avenrun[0], EXP_1, active); ++ avenrun[1] = calc_load(avenrun[1], EXP_5, active); ++ avenrun[2] = calc_load(avenrun[2], EXP_15, active); ++ ++ calc_load_update = jiffies + LOAD_FREQ; ++} ++ ++DEFINE_PER_CPU(struct kernel_stat, kstat); ++DEFINE_PER_CPU(struct kernel_cpustat, kernel_cpustat); ++ ++EXPORT_PER_CPU_SYMBOL(kstat); ++EXPORT_PER_CPU_SYMBOL(kernel_cpustat); ++ ++#ifdef CONFIG_PARAVIRT ++static inline u64 steal_ticks(u64 steal) ++{ ++ if (unlikely(steal > NSEC_PER_SEC)) ++ return div_u64(steal, TICK_NSEC); ++ ++ return __iter_div_u64_rem(steal, TICK_NSEC, &steal); ++} ++#endif ++ ++#ifndef nsecs_to_cputime ++# define nsecs_to_cputime(__nsecs) nsecs_to_jiffies(__nsecs) ++#endif ++ ++/* ++ * On each tick, add the number of nanoseconds to the unbanked variables and ++ * once one tick's worth has accumulated, account it allowing for accurate ++ * sub-tick accounting and totals. Use the TICK_APPROX_NS to match the way we ++ * deduct nanoseconds. ++ */ ++static void pc_idle_time(struct rq *rq, struct task_struct *idle, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ if (atomic_read(&rq->nr_iowait) > 0) { ++ rq->iowait_ns += ns; ++ if (rq->iowait_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->iowait_ns); ++ cpustat[CPUTIME_IOWAIT] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->iowait_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->idle_ns += ns; ++ if (rq->idle_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->idle_ns); ++ cpustat[CPUTIME_IDLE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->idle_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(idle); ++} ++ ++static void pc_system_time(struct rq *rq, struct task_struct *p, ++ int hardirq_offset, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->stime_ns += ns; ++ if (p->stime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->stime_ns); ++ p->stime_ns %= JIFFY_NS; ++ p->stime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_system_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (hardirq_count() - hardirq_offset) { ++ rq->irq_ns += ns; ++ if (rq->irq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->irq_ns); ++ cpustat[CPUTIME_IRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->irq_ns %= JIFFY_NS; ++ } ++ } else if (in_serving_softirq()) { ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->system_ns += ns; ++ if (rq->system_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->system_ns); ++ cpustat[CPUTIME_SYSTEM] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->system_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++static void pc_user_time(struct rq *rq, struct task_struct *p, unsigned long ns) ++{ ++ u64 *cpustat = kcpustat_this_cpu->cpustat; ++ unsigned long ticks; ++ ++ p->utime_ns += ns; ++ if (p->utime_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(p->utime_ns); ++ p->utime_ns %= JIFFY_NS; ++ p->utime += (__force u64)TICK_APPROX_NS * ticks; ++ account_group_user_time(p, TICK_APPROX_NS * ticks); ++ } ++ p->sched_time += ns; ++ account_group_exec_runtime(p, ns); ++ ++ if (this_cpu_ksoftirqd() == p) { ++ /* ++ * ksoftirqd time do not get accounted in cpu_softirq_time. ++ * So, we have to handle it separately here. ++ */ ++ rq->softirq_ns += ns; ++ if (rq->softirq_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->softirq_ns); ++ cpustat[CPUTIME_SOFTIRQ] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->softirq_ns %= JIFFY_NS; ++ } ++ } ++ ++ if (task_nice(p) > 0 || idleprio_task(p)) { ++ rq->nice_ns += ns; ++ if (rq->nice_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->nice_ns); ++ cpustat[CPUTIME_NICE] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->nice_ns %= JIFFY_NS; ++ } ++ } else { ++ rq->user_ns += ns; ++ if (rq->user_ns >= JIFFY_NS) { ++ ticks = NS_TO_JIFFIES(rq->user_ns); ++ cpustat[CPUTIME_USER] += (__force u64)TICK_APPROX_NS * ticks; ++ rq->user_ns %= JIFFY_NS; ++ } ++ } ++ acct_update_integrals(p); ++} ++ ++/* ++ * This is called on clock ticks. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_tick(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate tick timekeeping */ ++ if (user_mode(get_irq_regs())) ++ pc_user_time(rq, p, account_ns); ++ else if (p != idle || (irq_count() != HARDIRQ_OFFSET)) { ++ pc_system_time(rq, p, HARDIRQ_OFFSET, account_ns); ++ } else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++ ++ p->last_ran = rq->niffies; ++} ++ ++/* ++ * This is called on context switches. ++ * Bank in p->sched_time the ns elapsed since the last tick or switch. ++ * CPU scheduler quota accounting is also performed here in microseconds. ++ */ ++static void update_cpu_clock_switch(struct rq *rq, struct task_struct *p) ++{ ++ s64 account_ns = rq->niffies - p->last_ran; ++ struct task_struct *idle = rq->idle; ++ ++ /* Accurate subtick timekeeping */ ++ if (p != idle) ++ pc_user_time(rq, p, account_ns); ++ else ++ pc_idle_time(rq, idle, account_ns); ++ ++ /* time_slice accounting is done in usecs to avoid overflow on 32bit */ ++ if (p->policy != SCHED_FIFO && p != idle) ++ p->time_slice -= NS_TO_US(account_ns); ++} ++ ++/* ++ * Return any ns on the sched_clock that have not yet been accounted in ++ * @p in case that task is currently running. ++ * ++ * Called with task_rq_lock(p) held. ++ */ ++static inline u64 do_task_delta_exec(struct task_struct *p, struct rq *rq) ++{ ++ u64 ns = 0; ++ ++ /* ++ * Must be ->curr _and_ ->on_rq. If dequeued, we would ++ * project cycles that may never be accounted to this ++ * thread, breaking clock_gettime(). ++ */ ++ if (p == rq->curr && task_on_rq_queued(p)) { ++ update_clocks(rq); ++ ns = rq->niffies - p->last_ran; ++ } ++ ++ return ns; ++} ++ ++/* ++ * Return accounted runtime for the task. ++ * Return separately the current's pending runtime that have not been ++ * accounted yet. ++ * ++ */ ++unsigned long long task_sched_runtime(struct task_struct *p) ++{ ++ unsigned long flags; ++ struct rq *rq; ++ u64 ns; ++ ++#if defined(CONFIG_64BIT) && defined(CONFIG_SMP) ++ /* ++ * 64-bit doesn't need locks to atomically read a 64-bit value. ++ * So we have a optimisation chance when the task's delta_exec is 0. ++ * Reading ->on_cpu is racy, but this is ok. ++ * ++ * If we race with it leaving CPU, we'll take a lock. So we're correct. ++ * If we race with it entering CPU, unaccounted time is 0. This is ++ * indistinguishable from the read occurring a few cycles earlier. ++ * If we see ->on_cpu without ->on_rq, the task is leaving, and has ++ * been accounted, so we're correct here as well. ++ */ ++ if (!p->on_cpu || !task_on_rq_queued(p)) ++ return tsk_seruntime(p); ++#endif ++ ++ rq = task_rq_lock(p, &flags); ++ ns = p->sched_time + do_task_delta_exec(p, rq); ++ task_rq_unlock(rq, p, &flags); ++ ++ return ns; ++} ++ ++/* ++ * Functions to test for when SCHED_ISO tasks have used their allocated ++ * quota as real time scheduling and convert them back to SCHED_NORMAL. All ++ * data is modified only by the local runqueue during scheduler_tick with ++ * interrupts disabled. ++ */ ++ ++/* ++ * Test if SCHED_ISO tasks have run longer than their alloted period as RT ++ * tasks and set the refractory flag if necessary. There is 10% hysteresis ++ * for unsetting the flag. 115/128 is ~90/100 as a fast shift instead of a ++ * slow division. ++ */ ++static inline void iso_tick(struct rq *rq) ++{ ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - 1) / ISO_PERIOD; ++ rq->iso_ticks += 100; ++ if (rq->iso_ticks > ISO_PERIOD * sched_iso_cpu) { ++ rq->iso_refractory = true; ++ if (unlikely(rq->iso_ticks > ISO_PERIOD * 100)) ++ rq->iso_ticks = ISO_PERIOD * 100; ++ } ++} ++ ++/* No SCHED_ISO task was running so decrease rq->iso_ticks */ ++static inline void no_iso_tick(struct rq *rq, int ticks) ++{ ++ if (rq->iso_ticks > 0 || rq->iso_refractory) { ++ rq->iso_ticks = rq->iso_ticks * (ISO_PERIOD - ticks) / ISO_PERIOD; ++ if (rq->iso_ticks < ISO_PERIOD * (sched_iso_cpu * 115 / 128)) { ++ rq->iso_refractory = false; ++ if (unlikely(rq->iso_ticks < 0)) ++ rq->iso_ticks = 0; ++ } ++ } ++} ++ ++/* This manages tasks that have run out of timeslice during a scheduler_tick */ ++static void task_running_tick(struct rq *rq) ++{ ++ struct task_struct *p = rq->curr; ++ ++ /* ++ * If a SCHED_ISO task is running we increment the iso_ticks. In ++ * order to prevent SCHED_ISO tasks from causing starvation in the ++ * presence of true RT tasks we account those as iso_ticks as well. ++ */ ++ if (rt_task(p) || task_running_iso(p)) ++ iso_tick(rq); ++ else ++ no_iso_tick(rq, 1); ++ ++ /* SCHED_FIFO tasks never run out of timeslice. */ ++ if (p->policy == SCHED_FIFO) ++ return; ++ ++ if (iso_task(p)) { ++ if (task_running_iso(p)) { ++ if (rq->iso_refractory) { ++ /* ++ * SCHED_ISO task is running as RT and limit ++ * has been hit. Force it to reschedule as ++ * SCHED_NORMAL by zeroing its time_slice ++ */ ++ p->time_slice = 0; ++ } ++ } else if (!rq->iso_refractory) { ++ /* Can now run again ISO. Reschedule to pick up prio */ ++ goto out_resched; ++ } ++ } ++ ++ /* ++ * Tasks that were scheduled in the first half of a tick are not ++ * allowed to run into the 2nd half of the next tick if they will ++ * run out of time slice in the interim. Otherwise, if they have ++ * less than RESCHED_US μs of time slice left they will be rescheduled. ++ * Dither is used as a backup for when hrexpiry is disabled or high res ++ * timers not configured in. ++ */ ++ if (p->time_slice - rq->dither >= RESCHED_US) ++ return; ++out_resched: ++ rq_lock(rq); ++ __set_tsk_resched(p); ++ rq_unlock(rq); ++} ++ ++static inline void task_tick(struct rq *rq) ++{ ++ if (!rq_idle(rq)) ++ task_running_tick(rq); ++ else if (rq->last_jiffy > rq->last_scheduler_tick) ++ no_iso_tick(rq, rq->last_jiffy - rq->last_scheduler_tick); ++} ++ ++#ifdef CONFIG_NO_HZ_FULL ++/* ++ * We can stop the timer tick any time highres timers are active since ++ * we rely entirely on highres timeouts for task expiry rescheduling. ++ */ ++static void sched_stop_tick(struct rq *rq, int cpu) ++{ ++ if (!hrexpiry_enabled(rq)) ++ return; ++ if (!tick_nohz_full_enabled()) ++ return; ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++static inline void sched_start_tick(struct rq *rq, int cpu) ++{ ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++ ++struct tick_work { ++ int cpu; ++ struct delayed_work work; ++}; ++ ++static struct tick_work __percpu *tick_work_cpu; ++ ++static void sched_tick_remote(struct work_struct *work) ++{ ++ struct delayed_work *dwork = to_delayed_work(work); ++ struct tick_work *twork = container_of(dwork, struct tick_work, work); ++ int cpu = twork->cpu; ++ struct rq *rq = cpu_rq(cpu); ++ struct task_struct *curr; ++ u64 delta; ++ ++ /* ++ * Handle the tick only if it appears the remote CPU is running in full ++ * dynticks mode. The check is racy by nature, but missing a tick or ++ * having one too much is no big deal because the scheduler tick updates ++ * statistics and checks timeslices in a time-independent way, regardless ++ * of when exactly it is running. ++ */ ++ if (idle_cpu(cpu) || !tick_nohz_tick_stopped_cpu(cpu)) ++ goto out_requeue; ++ ++ rq_lock_irq(rq); ++ curr = rq->curr; ++ if (is_idle_task(curr)) ++ goto out_unlock; ++ ++ update_rq_clock(rq); ++ delta = rq_clock_task(rq) - curr->last_ran; ++ ++ /* ++ * Make sure the next tick runs within a reasonable ++ * amount of time. ++ */ ++ WARN_ON_ONCE(delta > (u64)NSEC_PER_SEC * 3); ++ task_tick(rq); ++ ++out_unlock: ++ rq_unlock_irq(rq); ++ ++out_requeue: ++ /* ++ * Run the remote tick once per second (1Hz). This arbitrary ++ * frequency is large enough to avoid overload but short enough ++ * to keep scheduler internal stats reasonably up to date. ++ */ ++ queue_delayed_work(system_unbound_wq, dwork, HZ); ++} ++ ++static void sched_tick_start(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ twork->cpu = cpu; ++ INIT_DELAYED_WORK(&twork->work, sched_tick_remote); ++ queue_delayed_work(system_unbound_wq, &twork->work, HZ); ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++static void sched_tick_stop(int cpu) ++{ ++ struct tick_work *twork; ++ ++ if (housekeeping_cpu(cpu, HK_FLAG_TICK)) ++ return; ++ ++ WARN_ON_ONCE(!tick_work_cpu); ++ ++ twork = per_cpu_ptr(tick_work_cpu, cpu); ++ cancel_delayed_work_sync(&twork->work); ++} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++int __init sched_tick_offload_init(void) ++{ ++ tick_work_cpu = alloc_percpu(struct tick_work); ++ BUG_ON(!tick_work_cpu); ++ ++ return 0; ++} ++ ++#else /* !CONFIG_NO_HZ_FULL */ ++static inline void sched_stop_tick(struct rq *rq, int cpu) {} ++static inline void sched_start_tick(struct rq *rq, int cpu) {} ++static inline void sched_tick_start(int cpu) { } ++static inline void sched_tick_stop(int cpu) { } ++#endif ++ ++/* ++ * This function gets called by the timer code, with HZ frequency. ++ * We call it with interrupts disabled. ++ */ ++void scheduler_tick(void) ++{ ++ int cpu __maybe_unused = smp_processor_id(); ++ struct rq *rq = cpu_rq(cpu); ++ ++ sched_clock_tick(); ++ update_clocks(rq); ++ update_load_avg(rq, 0); ++ update_cpu_clock_tick(rq, rq->curr); ++ task_tick(rq); ++ rq->last_scheduler_tick = rq->last_jiffy; ++ rq->last_tick = rq->clock; ++ perf_event_task_tick(); ++ sched_stop_tick(rq, cpu); ++} ++ ++#if defined(CONFIG_PREEMPT) && (defined(CONFIG_DEBUG_PREEMPT) || \ ++ defined(CONFIG_TRACE_PREEMPT_TOGGLE)) ++/* ++ * If the value passed in is equal to the current preempt count ++ * then we just disabled preemption. Start timing the latency. ++ */ ++static inline void preempt_latency_start(int val) ++{ ++ if (preempt_count() == val) { ++ unsigned long ip = get_lock_parent_ip(); ++#ifdef CONFIG_DEBUG_PREEMPT ++ current->preempt_disable_ip = ip; ++#endif ++ trace_preempt_off(CALLER_ADDR0, ip); ++ } ++} ++ ++void preempt_count_add(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((preempt_count() < 0))) ++ return; ++#endif ++ __preempt_count_add(val); ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Spinlock count overflowing soon? ++ */ ++ DEBUG_LOCKS_WARN_ON((preempt_count() & PREEMPT_MASK) >= ++ PREEMPT_MASK - 10); ++#endif ++ preempt_latency_start(val); ++} ++EXPORT_SYMBOL(preempt_count_add); ++NOKPROBE_SYMBOL(preempt_count_add); ++ ++/* ++ * If the value passed in equals to the current preempt count ++ * then we just enabled preemption. Stop timing the latency. ++ */ ++static inline void preempt_latency_stop(int val) ++{ ++ if (preempt_count() == val) ++ trace_preempt_on(CALLER_ADDR0, get_lock_parent_ip()); ++} ++ ++void preempt_count_sub(int val) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Underflow? ++ */ ++ if (DEBUG_LOCKS_WARN_ON(val > preempt_count())) ++ return; ++ /* ++ * Is the spinlock portion underflowing? ++ */ ++ if (DEBUG_LOCKS_WARN_ON((val < PREEMPT_MASK) && ++ !(preempt_count() & PREEMPT_MASK))) ++ return; ++#endif ++ ++ preempt_latency_stop(val); ++ __preempt_count_sub(val); ++} ++EXPORT_SYMBOL(preempt_count_sub); ++NOKPROBE_SYMBOL(preempt_count_sub); ++ ++#else ++static inline void preempt_latency_start(int val) { } ++static inline void preempt_latency_stop(int val) { } ++#endif ++ ++static inline unsigned long get_preempt_disable_ip(struct task_struct *p) ++{ ++#ifdef CONFIG_DEBUG_PREEMPT ++ return p->preempt_disable_ip; ++#else ++ return 0; ++#endif ++} ++ ++/* ++ * The time_slice is only refilled when it is empty and that is when we set a ++ * new deadline. Make sure update_clocks has been called recently to update ++ * rq->niffies. ++ */ ++static void time_slice_expired(struct task_struct *p, struct rq *rq) ++{ ++ p->time_slice = timeslice(); ++ p->deadline = rq->niffies + task_deadline_diff(p); ++#ifdef CONFIG_SMT_NICE ++ if (!p->mm) ++ p->smt_bias = 0; ++ else if (rt_task(p)) ++ p->smt_bias = 1 << 30; ++ else if (task_running_iso(p)) ++ p->smt_bias = 1 << 29; ++ else if (idleprio_task(p)) { ++ if (task_running_idle(p)) ++ p->smt_bias = 0; ++ else ++ p->smt_bias = 1; ++ } else if (--p->smt_bias < 1) ++ p->smt_bias = MAX_PRIO - p->static_prio; ++#endif ++} ++ ++/* ++ * Timeslices below RESCHED_US are considered as good as expired as there's no ++ * point rescheduling when there's so little time left. SCHED_BATCH tasks ++ * have been flagged be not latency sensitive and likely to be fully CPU ++ * bound so every time they're rescheduled they have their time_slice ++ * refilled, but get a new later deadline to have little effect on ++ * SCHED_NORMAL tasks. ++ ++ */ ++static inline void check_deadline(struct task_struct *p, struct rq *rq) ++{ ++ if (p->time_slice < RESCHED_US || batch_task(p)) ++ time_slice_expired(p, rq); ++} ++ ++/* ++ * Task selection with skiplists is a simple matter of picking off the first ++ * task in the sorted list, an O(1) operation. The lookup is amortised O(1) ++ * being bound to the number of processors. ++ * ++ * Runqueues are selectively locked based on their unlocked data and then ++ * unlocked if not needed. At most 3 locks will be held at any time and are ++ * released as soon as they're no longer needed. All balancing between CPUs ++ * is thus done here in an extremely simple first come best fit manner. ++ * ++ * This iterates over runqueues in cache locality order. In interactive mode ++ * it iterates over all CPUs and finds the task with the best key/deadline. ++ * In non-interactive mode it will only take a task if it's from the current ++ * runqueue or a runqueue with more tasks than the current one with a better ++ * key/deadline. ++ */ ++#ifdef CONFIG_SMP ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct rq *locked = NULL, *chosen = NULL; ++ struct task_struct *edt = idle; ++ int i, best_entries = 0; ++ u64 best_key = ~0ULL; ++ ++ for (i = 0; i < total_runqueues; i++) { ++ struct rq *other_rq = rq_order(rq, i); ++ skiplist_node *next; ++ int entries; ++ ++ entries = other_rq->sl->entries; ++ /* ++ * Check for queued entres lockless first. The local runqueue ++ * is locked so entries will always be accurate. ++ */ ++ if (!sched_interactive) { ++ /* ++ * Don't reschedule balance across nodes unless the CPU ++ * is idle. ++ */ ++ if (edt != idle && rq->cpu_locality[other_rq->cpu] > 3) ++ break; ++ if (entries <= best_entries) ++ continue; ++ } else if (!entries) ++ continue; ++ ++ /* if (i) implies other_rq != rq */ ++ if (i) { ++ /* Check for best id queued lockless first */ ++ if (other_rq->best_key >= best_key) ++ continue; ++ ++ if (unlikely(!trylock_rq(rq, other_rq))) ++ continue; ++ ++ /* Need to reevaluate entries after locking */ ++ entries = other_rq->sl->entries; ++ if (unlikely(!entries)) { ++ unlock_rq(other_rq); ++ continue; ++ } ++ } ++ ++ next = other_rq->node; ++ /* ++ * In interactive mode we check beyond the best entry on other ++ * runqueues if we can't get the best for smt or affinity ++ * reasons. ++ */ ++ while ((next = next->next[0]) != other_rq->node) { ++ struct task_struct *p; ++ u64 key = next->key; ++ ++ /* Reevaluate key after locking */ ++ if (key >= best_key) ++ break; ++ ++ p = next->value; ++ if (!smt_schedule(p, rq)) { ++ if (i && !sched_interactive) ++ break; ++ continue; ++ } ++ ++ if (sched_other_cpu(p, cpu)) { ++ if (sched_interactive || !i) ++ continue; ++ break; ++ } ++ /* Make sure affinity is ok */ ++ if (i) { ++ /* From this point on p is the best so far */ ++ if (locked) ++ unlock_rq(locked); ++ chosen = locked = other_rq; ++ } ++ best_entries = entries; ++ best_key = key; ++ edt = p; ++ break; ++ } ++ /* rq->preempting is a hint only as the state may have changed ++ * since it was set with the resched call but if we have met ++ * the condition we can break out here. */ ++ if (edt == rq->preempting) ++ break; ++ if (i && other_rq != chosen) ++ unlock_rq(other_rq); ++ } ++ ++ if (likely(edt != idle)) ++ take_task(rq, cpu, edt); ++ ++ if (locked) ++ unlock_rq(locked); ++ ++ rq->preempting = NULL; ++ ++ return edt; ++} ++#else /* CONFIG_SMP */ ++static inline struct task_struct ++*earliest_deadline_task(struct rq *rq, int cpu, struct task_struct *idle) ++{ ++ struct task_struct *edt; ++ ++ if (unlikely(!rq->sl->entries)) ++ return idle; ++ edt = rq->node->next[0]->value; ++ take_task(rq, cpu, edt); ++ return edt; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * Print scheduling while atomic bug: ++ */ ++static noinline void __schedule_bug(struct task_struct *prev) ++{ ++ /* Save this before calling printk(), since that will clobber it */ ++ unsigned long preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ if (oops_in_progress) ++ return; ++ ++ printk(KERN_ERR "BUG: scheduling while atomic: %s/%d/0x%08x\n", ++ prev->comm, prev->pid, preempt_count()); ++ ++ debug_show_held_locks(prev); ++ print_modules(); ++ if (irqs_disabled()) ++ print_irqtrace_events(prev); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && in_atomic_preempt_off()) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++ ++/* ++ * Various schedule()-time debugging checks and statistics: ++ */ ++static inline void schedule_debug(struct task_struct *prev) ++{ ++#ifdef CONFIG_SCHED_STACK_END_CHECK ++ if (task_stack_end_corrupted(prev)) ++ panic("corrupted stack end detected inside scheduler\n"); ++#endif ++ ++ if (unlikely(in_atomic_preempt_off())) { ++ __schedule_bug(prev); ++ preempt_count_set(PREEMPT_DISABLED); ++ } ++ rcu_sleep_check(); ++ ++ profile_hit(SCHED_PROFILING, __builtin_return_address(0)); ++ ++ schedstat_inc(this_rq()->sched_count); ++} ++ ++/* ++ * The currently running task's information is all stored in rq local data ++ * which is only modified by the local CPU. ++ */ ++static inline void set_rq_task(struct rq *rq, struct task_struct *p) ++{ ++ if (p == rq->idle || p->policy == SCHED_FIFO) ++ hrexpiry_clear(rq); ++ else ++ hrexpiry_start(rq, US_TO_NS(p->time_slice)); ++ if (rq->clock - rq->last_tick > HALF_JIFFY_NS) ++ rq->dither = 0; ++ else ++ rq->dither = rq_dither(rq); ++ ++ rq->rq_deadline = p->deadline; ++ rq->rq_prio = p->prio; ++#ifdef CONFIG_SMT_NICE ++ rq->rq_mm = p->mm; ++ rq->rq_smt_bias = p->smt_bias; ++#endif ++} ++ ++#ifdef CONFIG_SMT_NICE ++static void check_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_no_siblings(struct rq __maybe_unused *this_rq) {} ++static void (*check_siblings)(struct rq *this_rq) = &check_no_siblings; ++static void (*wake_siblings)(struct rq *this_rq) = &wake_no_siblings; ++ ++/* Iterate over smt siblings when we've scheduled a process on cpu and decide ++ * whether they should continue running or be descheduled. */ ++static void check_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct task_struct *p; ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ continue; ++ p = rq->curr; ++ if (!smt_schedule(p, this_rq)) ++ resched_curr(rq); ++ } ++} ++ ++static void wake_smt_siblings(struct rq *this_rq) ++{ ++ int other_cpu; ++ ++ for_each_cpu(other_cpu, &this_rq->thread_mask) { ++ struct rq *rq; ++ ++ rq = cpu_rq(other_cpu); ++ if (rq_idle(rq)) ++ resched_idle(rq); ++ } ++} ++#else ++static void check_siblings(struct rq __maybe_unused *this_rq) {} ++static void wake_siblings(struct rq __maybe_unused *this_rq) {} ++#endif ++ ++/* ++ * schedule() is the main scheduler function. ++ * ++ * The main means of driving the scheduler and thus entering this function are: ++ * ++ * 1. Explicit blocking: mutex, semaphore, waitqueue, etc. ++ * ++ * 2. TIF_NEED_RESCHED flag is checked on interrupt and userspace return ++ * paths. For example, see arch/x86/entry_64.S. ++ * ++ * To drive preemption between tasks, the scheduler sets the flag in timer ++ * interrupt handler scheduler_tick(). ++ * ++ * 3. Wakeups don't really cause entry into schedule(). They add a ++ * task to the run-queue and that's it. ++ * ++ * Now, if the new task added to the run-queue preempts the current ++ * task, then the wakeup sets TIF_NEED_RESCHED and schedule() gets ++ * called on the nearest possible occasion: ++ * ++ * - If the kernel is preemptible (CONFIG_PREEMPT=y): ++ * ++ * - in syscall or exception context, at the next outmost ++ * preempt_enable(). (this might be as soon as the wake_up()'s ++ * spin_unlock()!) ++ * ++ * - in IRQ context, return from interrupt-handler to ++ * preemptible context ++ * ++ * - If the kernel is not preemptible (CONFIG_PREEMPT is not set) ++ * then at the next: ++ * ++ * - cond_resched() call ++ * - explicit schedule() call ++ * - return from syscall or exception to user-space ++ * - return from interrupt-handler to user-space ++ * ++ * WARNING: must be called with preemption disabled! ++ */ ++static void __sched notrace __schedule(bool preempt) ++{ ++ struct task_struct *prev, *next, *idle; ++ unsigned long *switch_count; ++ bool deactivate = false; ++ struct rq *rq; ++ u64 niffies; ++ int cpu; ++ ++ cpu = smp_processor_id(); ++ rq = cpu_rq(cpu); ++ prev = rq->curr; ++ idle = rq->idle; ++ ++ schedule_debug(prev); ++ ++ local_irq_disable(); ++ rcu_note_context_switch(preempt); ++ ++ /* ++ * Make sure that signal_pending_state()->signal_pending() below ++ * can't be reordered with __set_current_state(TASK_INTERRUPTIBLE) ++ * done by the caller to avoid the race with signal_wake_up(). ++ * ++ * The membarrier system call requires a full memory barrier ++ * after coming from user-space, before storing to rq->curr. ++ */ ++ rq_lock(rq); ++ smp_mb__after_spinlock(); ++#ifdef CONFIG_SMP ++ if (rq->preempt) { ++ /* ++ * Make sure resched_curr hasn't triggered a preemption ++ * locklessly on a task that has since scheduled away. Spurious ++ * wakeup of idle is okay though. ++ */ ++ if (unlikely(preempt && prev != idle && !test_tsk_need_resched(prev))) { ++ rq->preempt = NULL; ++ clear_preempt_need_resched(); ++ rq_unlock_irq(rq); ++ return; ++ } ++ rq->preempt = NULL; ++ } ++#endif ++ ++ switch_count = &prev->nivcsw; ++ if (!preempt && prev->state) { ++ if (unlikely(signal_pending_state(prev->state, prev))) { ++ prev->state = TASK_RUNNING; ++ } else { ++ deactivate = true; ++ prev->on_rq = 0; ++ ++ if (prev->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++ ++ /* ++ * If a worker is going to sleep, notify and ++ * ask workqueue whether it wants to wake up a ++ * task to maintain concurrency. If so, wake ++ * up the task. ++ */ ++ if (prev->flags & PF_WQ_WORKER) { ++ struct task_struct *to_wakeup; ++ ++ to_wakeup = wq_worker_sleeping(prev); ++ if (to_wakeup) ++ try_to_wake_up_local(to_wakeup); ++ } ++ } ++ switch_count = &prev->nvcsw; ++ } ++ ++ /* ++ * Store the niffy value here for use by the next task's last_ran ++ * below to avoid losing niffies due to update_clocks being called ++ * again after this point. ++ */ ++ update_clocks(rq); ++ niffies = rq->niffies; ++ update_cpu_clock_switch(rq, prev); ++ ++ clear_tsk_need_resched(prev); ++ clear_preempt_need_resched(); ++ ++ if (idle != prev) { ++ check_deadline(prev, rq); ++ return_task(prev, rq, cpu, deactivate); ++ } ++ ++ next = earliest_deadline_task(rq, cpu, idle); ++ if (likely(next->prio != PRIO_LIMIT)) ++ clear_cpuidle_map(cpu); ++ else { ++ set_cpuidle_map(cpu); ++ update_load_avg(rq, 0); ++ } ++ ++ set_rq_task(rq, next); ++ next->last_ran = niffies; ++ ++ if (likely(prev != next)) { ++ /* ++ * Don't reschedule an idle task or deactivated tasks ++ */ ++ if (prev == idle) { ++ rq->nr_running++; ++ if (rt_task(next)) ++ rq->rt_nr_running++; ++ } else if (!deactivate) ++ resched_suitable_idle(prev); ++ if (unlikely(next == idle)) { ++ rq->nr_running--; ++ if (rt_task(prev)) ++ rq->rt_nr_running--; ++ wake_siblings(rq); ++ } else ++ check_siblings(rq); ++ rq->nr_switches++; ++ rq->curr = next; ++ /* ++ * The membarrier system call requires each architecture ++ * to have a full memory barrier after updating ++ * rq->curr, before returning to user-space. ++ * ++ * Here are the schemes providing that barrier on the ++ * various architectures: ++ * - mm ? switch_mm() : mmdrop() for x86, s390, sparc, PowerPC. ++ * switch_mm() rely on membarrier_arch_switch_mm() on PowerPC. ++ * - finish_lock_switch() for weakly-ordered ++ * architectures where spin_unlock is a full barrier, ++ * - switch_to() for arm64 (weakly-ordered, spin_unlock ++ * is a RELEASE barrier), ++ */ ++ ++*switch_count; ++ ++ trace_sched_switch(preempt, prev, next); ++ context_switch(rq, prev, next); /* unlocks the rq */ ++ } else { ++ check_siblings(rq); ++ rq_unlock(rq); ++ do_pending_softirq(rq, next); ++ local_irq_enable(); ++ } ++} ++ ++void __noreturn do_task_dead(void) ++{ ++ /* Causes final put_task_struct in finish_task_switch(). */ ++ set_special_state(TASK_DEAD); ++ ++ /* Tell freezer to ignore us: */ ++ current->flags |= PF_NOFREEZE; ++ __schedule(false); ++ BUG(); ++ ++ /* Avoid "noreturn function does return" - but don't continue if BUG() is a NOP: */ ++ for (;;) ++ cpu_relax(); ++} ++ ++static inline void sched_submit_work(struct task_struct *tsk) ++{ ++ if (!tsk->state || tsk_is_pi_blocked(tsk) || ++ preempt_count() || ++ signal_pending_state(tsk->state, tsk)) ++ return; ++ ++ /* ++ * If we are going to sleep and we have plugged IO queued, ++ * make sure to submit it to avoid deadlocks. ++ */ ++ if (blk_needs_flush_plug(tsk)) ++ blk_schedule_flush_plug(tsk); ++} ++ ++asmlinkage __visible void __sched schedule(void) ++{ ++ struct task_struct *tsk = current; ++ ++ sched_submit_work(tsk); ++ do { ++ preempt_disable(); ++ __schedule(false); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++} ++ ++EXPORT_SYMBOL(schedule); ++ ++/* ++ * synchronize_rcu_tasks() makes sure that no task is stuck in preempted ++ * state (have scheduled out non-voluntarily) by making sure that all ++ * tasks have either left the run queue or have gone into user space. ++ * As idle tasks do not do either, they must not ever be preempted ++ * (schedule out non-voluntarily). ++ * ++ * schedule_idle() is similar to schedule_preempt_disable() except that it ++ * never enables preemption because it does not call sched_submit_work(). ++ */ ++void __sched schedule_idle(void) ++{ ++ /* ++ * As this skips calling sched_submit_work(), which the idle task does ++ * regardless because that function is a nop when the task is in a ++ * TASK_RUNNING state, make sure this isn't used someplace that the ++ * current task can be in any other state. Note, idle is always in the ++ * TASK_RUNNING state. ++ */ ++ WARN_ON_ONCE(current->state); ++ do { ++ __schedule(false); ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_CONTEXT_TRACKING ++asmlinkage __visible void __sched schedule_user(void) ++{ ++ /* ++ * If we come here after a random call to set_need_resched(), ++ * or we have been woken up remotely but the IPI has not yet arrived, ++ * we haven't yet exited the RCU idle mode. Do it here manually until ++ * we find a better solution. ++ * ++ * NB: There are buggy callers of this function. Ideally we ++ * should warn if prev_state != IN_USER, but that will trigger ++ * too frequently to make sense yet. ++ */ ++ enum ctx_state prev_state = exception_enter(); ++ schedule(); ++ exception_exit(prev_state); ++} ++#endif ++ ++/** ++ * schedule_preempt_disabled - called with preemption disabled ++ * ++ * Returns with preemption disabled. Note: preempt_count must be 1 ++ */ ++void __sched schedule_preempt_disabled(void) ++{ ++ sched_preempt_enable_no_resched(); ++ schedule(); ++ preempt_disable(); ++} ++ ++static void __sched notrace preempt_schedule_common(void) ++{ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ __schedule(true); ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ ++ /* ++ * Check again in case we missed a preemption opportunity ++ * between schedule and now. ++ */ ++ } while (need_resched()); ++} ++ ++#ifdef CONFIG_PREEMPT ++/* ++ * this is the entry point to schedule() from in-kernel preemption ++ * off of preempt_enable. Kernel preemptions off return from interrupt ++ * occur there and call schedule directly. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule(void) ++{ ++ /* ++ * If there is a non-zero preempt_count or interrupts are disabled, ++ * we do not want to preempt the current task. Just return.. ++ */ ++ if (likely(!preemptible())) ++ return; ++ ++ preempt_schedule_common(); ++} ++NOKPROBE_SYMBOL(preempt_schedule); ++EXPORT_SYMBOL(preempt_schedule); ++ ++/** ++ * preempt_schedule_notrace - preempt_schedule called by tracing ++ * ++ * The tracing infrastructure uses preempt_enable_notrace to prevent ++ * recursion and tracing preempt enabling caused by the tracing ++ * infrastructure itself. But as tracing can happen in areas coming ++ * from userspace or just about to enter userspace, a preempt enable ++ * can occur before user_exit() is called. This will cause the scheduler ++ * to be called when the system is still in usermode. ++ * ++ * To prevent this, the preempt_enable_notrace will use this function ++ * instead of preempt_schedule() to exit user context if needed before ++ * calling the scheduler. ++ */ ++asmlinkage __visible void __sched notrace preempt_schedule_notrace(void) ++{ ++ enum ctx_state prev_ctx; ++ ++ if (likely(!preemptible())) ++ return; ++ ++ do { ++ /* ++ * Because the function tracer can trace preempt_count_sub() ++ * and it also uses preempt_enable/disable_notrace(), if ++ * NEED_RESCHED is set, the preempt_enable_notrace() called ++ * by the function tracer will call this function again and ++ * cause infinite recursion. ++ * ++ * Preemption must be disabled here before the function ++ * tracer can trace. Break up preempt_disable() into two ++ * calls. One to disable preemption without fear of being ++ * traced. The other to still record the preemption latency, ++ * which can also be traced by the function tracer. ++ */ ++ preempt_disable_notrace(); ++ preempt_latency_start(1); ++ /* ++ * Needs preempt disabled in case user_exit() is traced ++ * and the tracer calls preempt_enable_notrace() causing ++ * an infinite recursion. ++ */ ++ prev_ctx = exception_enter(); ++ __schedule(true); ++ exception_exit(prev_ctx); ++ ++ preempt_latency_stop(1); ++ preempt_enable_no_resched_notrace(); ++ } while (need_resched()); ++} ++EXPORT_SYMBOL_GPL(preempt_schedule_notrace); ++ ++#endif /* CONFIG_PREEMPT */ ++ ++/* ++ * this is the entry point to schedule() from kernel preemption ++ * off of irq context. ++ * Note, that this is called and return with irqs disabled. This will ++ * protect us against recursive calling from irq. ++ */ ++asmlinkage __visible void __sched preempt_schedule_irq(void) ++{ ++ enum ctx_state prev_state; ++ ++ /* Catch callers which need to be fixed */ ++ BUG_ON(preempt_count() || !irqs_disabled()); ++ ++ prev_state = exception_enter(); ++ ++ do { ++ preempt_disable(); ++ local_irq_enable(); ++ __schedule(true); ++ local_irq_disable(); ++ sched_preempt_enable_no_resched(); ++ } while (need_resched()); ++ ++ exception_exit(prev_state); ++} ++ ++int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flags, ++ void *key) ++{ ++ return try_to_wake_up(curr->private, mode, wake_flags); ++} ++EXPORT_SYMBOL(default_wake_function); ++ ++#ifdef CONFIG_RT_MUTEXES ++ ++static inline int __rt_effective_prio(struct task_struct *pi_task, int prio) ++{ ++ if (pi_task) ++ prio = min(prio, pi_task->prio); ++ ++ return prio; ++} ++ ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ struct task_struct *pi_task = rt_mutex_get_top_task(p); ++ ++ return __rt_effective_prio(pi_task, prio); ++} ++ ++/* ++ * rt_mutex_setprio - set the current priority of a task ++ * @p: task to boost ++ * @pi_task: donor task ++ * ++ * This function changes the 'effective' priority of a task. It does ++ * not touch ->normal_prio like __setscheduler(). ++ * ++ * Used by the rt_mutex code to implement priority inheritance ++ * logic. Call site only calls if the priority of the task changed. ++ */ ++void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) ++{ ++ int prio, oldprio; ++ struct rq *rq; ++ ++ /* XXX used to be waiter->prio, not waiter->task->prio */ ++ prio = __rt_effective_prio(pi_task, p->normal_prio); ++ ++ /* ++ * If nothing changed; bail early. ++ */ ++ if (p->pi_top_task == pi_task && prio == p->prio) ++ return; ++ ++ rq = __task_rq_lock(p); ++ update_rq_clock(rq); ++ /* ++ * Set under pi_lock && rq->lock, such that the value can be used under ++ * either lock. ++ * ++ * Note that there is loads of tricky to make this pointer cache work ++ * right. rt_mutex_slowunlock()+rt_mutex_postunlock() work together to ++ * ensure a task is de-boosted (pi_task is set to NULL) before the ++ * task is allowed to run again (and can exit). This ensures the pointer ++ * points to a blocked task -- which guaratees the task is present. ++ */ ++ p->pi_top_task = pi_task; ++ ++ /* ++ * For FIFO/RR we only need to set prio, if that matches we're done. ++ */ ++ if (prio == p->prio) ++ goto out_unlock; ++ ++ /* ++ * Idle task boosting is a nono in general. There is one ++ * exception, when PREEMPT_RT and NOHZ is active: ++ * ++ * The idle task calls get_next_timer_interrupt() and holds ++ * the timer wheel base->lock on the CPU and another CPU wants ++ * to access the timer (probably to cancel it). We can safely ++ * ignore the boosting request, as the idle CPU runs this code ++ * with interrupts disabled and will complete the lock ++ * protected section without being interrupted. So there is no ++ * real need to boost. ++ */ ++ if (unlikely(p == rq->idle)) { ++ WARN_ON(p != rq->curr); ++ WARN_ON(p->pi_blocked_on); ++ goto out_unlock; ++ } ++ ++ trace_sched_pi_setprio(p, pi_task); ++ oldprio = p->prio; ++ p->prio = prio; ++ if (task_running(rq, p)){ ++ if (prio > oldprio) ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (prio < oldprio) ++ try_preempt(p, rq); ++ } ++out_unlock: ++ __task_rq_unlock(rq); ++} ++#else ++static inline int rt_effective_prio(struct task_struct *p, int prio) ++{ ++ return prio; ++} ++#endif ++ ++/* ++ * Adjust the deadline for when the priority is to change, before it's ++ * changed. ++ */ ++static inline void adjust_deadline(struct task_struct *p, int new_prio) ++{ ++ p->deadline += static_deadline_diff(new_prio) - task_deadline_diff(p); ++} ++ ++void set_user_nice(struct task_struct *p, long nice) ++{ ++ int new_static, old_static; ++ unsigned long flags; ++ struct rq *rq; ++ ++ if (task_nice(p) == nice || nice < MIN_NICE || nice > MAX_NICE) ++ return; ++ new_static = NICE_TO_PRIO(nice); ++ /* ++ * We have to be careful, if called from sys_setpriority(), ++ * the task might be in the middle of scheduling on another CPU. ++ */ ++ rq = task_rq_lock(p, &flags); ++ update_rq_clock(rq); ++ ++ /* ++ * The RT priorities are set via sched_setscheduler(), but we still ++ * allow the 'normal' nice value to be set - but as expected ++ * it wont have any effect on scheduling until the task is ++ * not SCHED_NORMAL/SCHED_BATCH: ++ */ ++ if (has_rt_policy(p)) { ++ p->static_prio = new_static; ++ goto out_unlock; ++ } ++ ++ adjust_deadline(p, new_static); ++ old_static = p->static_prio; ++ p->static_prio = new_static; ++ p->prio = effective_prio(p); ++ ++ if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (new_static < old_static) ++ try_preempt(p, rq); ++ } else if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ if (old_static < new_static) ++ resched_task(p); ++ } ++out_unlock: ++ task_rq_unlock(rq, p, &flags); ++} ++EXPORT_SYMBOL(set_user_nice); ++ ++/* ++ * can_nice - check if a task can reduce its nice value ++ * @p: task ++ * @nice: nice value ++ */ ++int can_nice(const struct task_struct *p, const int nice) ++{ ++ /* Convert nice value [19,-20] to rlimit style value [1,40] */ ++ int nice_rlim = nice_to_rlimit(nice); ++ ++ return (nice_rlim <= task_rlimit(p, RLIMIT_NICE) || ++ capable(CAP_SYS_NICE)); ++} ++ ++#ifdef __ARCH_WANT_SYS_NICE ++ ++/* ++ * sys_nice - change the priority of the current process. ++ * @increment: priority increment ++ * ++ * sys_setpriority is a more generic, but much slower function that ++ * does similar things. ++ */ ++SYSCALL_DEFINE1(nice, int, increment) ++{ ++ long nice, retval; ++ ++ /* ++ * Setpriority might change our priority at the same moment. ++ * We don't have to worry. Conceptually one call occurs first ++ * and we have a single winner. ++ */ ++ ++ increment = clamp(increment, -NICE_WIDTH, NICE_WIDTH); ++ nice = task_nice(current) + increment; ++ ++ nice = clamp_val(nice, MIN_NICE, MAX_NICE); ++ if (increment < 0 && !can_nice(current, nice)) ++ return -EPERM; ++ ++ retval = security_task_setnice(current, nice); ++ if (retval) ++ return retval; ++ ++ set_user_nice(current, nice); ++ return 0; ++} ++ ++#endif ++ ++/** ++ * task_prio - return the priority value of a given task. ++ * @p: the task in question. ++ * ++ * Return: The priority value as seen by users in /proc. ++ * RT tasks are offset by -100. Normal tasks are centered around 1, value goes ++ * from 0 (SCHED_ISO) up to 82 (nice +19 SCHED_IDLEPRIO). ++ */ ++int task_prio(const struct task_struct *p) ++{ ++ int delta, prio = p->prio - MAX_RT_PRIO; ++ ++ /* rt tasks and iso tasks */ ++ if (prio <= 0) ++ goto out; ++ ++ /* Convert to ms to avoid overflows */ ++ delta = NS_TO_MS(p->deadline - task_rq(p)->niffies); ++ if (unlikely(delta < 0)) ++ delta = 0; ++ delta = delta * 40 / ms_longest_deadline_diff(); ++ if (delta <= 80) ++ prio += delta; ++ if (idleprio_task(p)) ++ prio += 40; ++out: ++ return prio; ++} ++ ++/** ++ * idle_cpu - is a given CPU idle currently? ++ * @cpu: the processor in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int idle_cpu(int cpu) ++{ ++ return cpu_curr(cpu) == cpu_rq(cpu)->idle; ++} ++ ++/** ++ * available_idle_cpu - is a given CPU idle for enqueuing work. ++ * @cpu: the CPU in question. ++ * ++ * Return: 1 if the CPU is currently idle. 0 otherwise. ++ */ ++int available_idle_cpu(int cpu) ++{ ++ if (!idle_cpu(cpu)) ++ return 0; ++ ++ if (vcpu_is_preempted(cpu)) ++ return 0; ++ ++ return 1; ++} ++ ++/** ++ * idle_task - return the idle task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * Return: The idle task for the CPU @cpu. ++ */ ++struct task_struct *idle_task(int cpu) ++{ ++ return cpu_rq(cpu)->idle; ++} ++ ++/** ++ * find_process_by_pid - find a process with a matching PID value. ++ * @pid: the pid in question. ++ * ++ * The task of @pid, if found. %NULL otherwise. ++ */ ++static inline struct task_struct *find_process_by_pid(pid_t pid) ++{ ++ return pid ? find_task_by_vpid(pid) : current; ++} ++ ++/* Actually do priority change: must hold rq lock. */ ++static void __setscheduler(struct task_struct *p, struct rq *rq, int policy, ++ int prio, bool keep_boost) ++{ ++ int oldrtprio, oldprio; ++ ++ p->policy = policy; ++ oldrtprio = p->rt_priority; ++ p->rt_priority = prio; ++ p->normal_prio = normal_prio(p); ++ oldprio = p->prio; ++ /* ++ * Keep a potential priority boosting if called from ++ * sched_setscheduler(). ++ */ ++ p->prio = normal_prio(p); ++ if (keep_boost) ++ p->prio = rt_effective_prio(p, p->prio); ++ ++ if (task_running(rq, p)) { ++ set_rq_task(rq, p); ++ resched_task(p); ++ } else if (task_queued(p)) { ++ dequeue_task(rq, p, DEQUEUE_SAVE); ++ enqueue_task(rq, p, ENQUEUE_RESTORE); ++ if (p->prio < oldprio || p->rt_priority > oldrtprio) ++ try_preempt(p, rq); ++ } ++} ++ ++/* ++ * Check the target process has a UID that matches the current process's ++ */ ++static bool check_same_owner(struct task_struct *p) ++{ ++ const struct cred *cred = current_cred(), *pcred; ++ bool match; ++ ++ rcu_read_lock(); ++ pcred = __task_cred(p); ++ match = (uid_eq(cred->euid, pcred->euid) || ++ uid_eq(cred->euid, pcred->uid)); ++ rcu_read_unlock(); ++ return match; ++} ++ ++static int __sched_setscheduler(struct task_struct *p, ++ const struct sched_attr *attr, ++ bool user, bool pi) ++{ ++ int retval, policy = attr->sched_policy, oldpolicy = -1, priority = attr->sched_priority; ++ unsigned long flags, rlim_rtprio = 0; ++ int reset_on_fork; ++ struct rq *rq; ++ ++ /* The pi code expects interrupts enabled */ ++ BUG_ON(pi && in_interrupt()); ++ ++ if (is_rt_policy(policy) && !capable(CAP_SYS_NICE)) { ++ unsigned long lflags; ++ ++ if (!lock_task_sighand(p, &lflags)) ++ return -ESRCH; ++ rlim_rtprio = task_rlimit(p, RLIMIT_RTPRIO); ++ unlock_task_sighand(p, &lflags); ++ if (rlim_rtprio) ++ goto recheck; ++ /* ++ * If the caller requested an RT policy without having the ++ * necessary rights, we downgrade the policy to SCHED_ISO. ++ * We also set the parameter to zero to pass the checks. ++ */ ++ policy = SCHED_ISO; ++ priority = 0; ++ } ++recheck: ++ /* Double check policy once rq lock held */ ++ if (policy < 0) { ++ reset_on_fork = p->sched_reset_on_fork; ++ policy = oldpolicy = p->policy; ++ } else { ++ reset_on_fork = !!(policy & SCHED_RESET_ON_FORK); ++ policy &= ~SCHED_RESET_ON_FORK; ++ ++ if (!SCHED_RANGE(policy)) ++ return -EINVAL; ++ } ++ ++ if (attr->sched_flags & ~(SCHED_FLAG_ALL | SCHED_FLAG_SUGOV)) ++ return -EINVAL; ++ ++ /* ++ * Valid priorities for SCHED_FIFO and SCHED_RR are ++ * 1..MAX_USER_RT_PRIO-1, valid priority for SCHED_NORMAL and ++ * SCHED_BATCH is 0. ++ */ ++ if (priority < 0 || ++ (p->mm && priority > MAX_USER_RT_PRIO - 1) || ++ (!p->mm && priority > MAX_RT_PRIO - 1)) ++ return -EINVAL; ++ if (is_rt_policy(policy) != (priority != 0)) ++ return -EINVAL; ++ ++ /* ++ * Allow unprivileged RT tasks to decrease priority: ++ */ ++ if (user && !capable(CAP_SYS_NICE)) { ++ if (is_rt_policy(policy)) { ++ unsigned long rlim_rtprio = ++ task_rlimit(p, RLIMIT_RTPRIO); ++ ++ /* Can't set/change the rt policy */ ++ if (policy != p->policy && !rlim_rtprio) ++ return -EPERM; ++ ++ /* Can't increase priority */ ++ if (priority > p->rt_priority && ++ priority > rlim_rtprio) ++ return -EPERM; ++ } else { ++ switch (p->policy) { ++ /* ++ * Can only downgrade policies but not back to ++ * SCHED_NORMAL ++ */ ++ case SCHED_ISO: ++ if (policy == SCHED_ISO) ++ goto out; ++ if (policy != SCHED_NORMAL) ++ return -EPERM; ++ break; ++ case SCHED_BATCH: ++ if (policy == SCHED_BATCH) ++ goto out; ++ if (policy != SCHED_IDLEPRIO) ++ return -EPERM; ++ break; ++ case SCHED_IDLEPRIO: ++ if (policy == SCHED_IDLEPRIO) ++ goto out; ++ return -EPERM; ++ default: ++ break; ++ } ++ } ++ ++ /* Can't change other user's priorities */ ++ if (!check_same_owner(p)) ++ return -EPERM; ++ ++ /* Normal users shall not reset the sched_reset_on_fork flag: */ ++ if (p->sched_reset_on_fork && !reset_on_fork) ++ return -EPERM; ++ } ++ ++ if (user) { ++ retval = security_task_setscheduler(p); ++ if (retval) ++ return retval; ++ } ++ ++ /* ++ * Make sure no PI-waiters arrive (or leave) while we are ++ * changing the priority of the task: ++ * ++ * To be able to change p->policy safely, the runqueue lock must be ++ * held. ++ */ ++ rq = task_rq_lock(p, &flags); ++ update_rq_clock(rq); ++ ++ /* ++ * Changing the policy of the stop threads its a very bad idea: ++ */ ++ if (p == rq->stop) { ++ task_rq_unlock(rq, p, &flags); ++ return -EINVAL; ++ } ++ ++ /* ++ * If not changing anything there's no need to proceed further: ++ */ ++ if (unlikely(policy == p->policy && (!is_rt_policy(policy) || ++ priority == p->rt_priority))) { ++ task_rq_unlock(rq, p, &flags); ++ return 0; ++ } ++ ++ /* Re-check policy now with rq lock held */ ++ if (unlikely(oldpolicy != -1 && oldpolicy != p->policy)) { ++ policy = oldpolicy = -1; ++ task_rq_unlock(rq, p, &flags); ++ goto recheck; ++ } ++ p->sched_reset_on_fork = reset_on_fork; ++ ++ __setscheduler(p, rq, policy, priority, pi); ++ task_rq_unlock(rq, p, &flags); ++ ++ if (pi) ++ rt_mutex_adjust_pi(p); ++out: ++ return 0; ++} ++ ++static int _sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param, bool check) ++{ ++ struct sched_attr attr = { ++ .sched_policy = policy, ++ .sched_priority = param->sched_priority, ++ .sched_nice = PRIO_TO_NICE(p->static_prio), ++ }; ++ ++ return __sched_setscheduler(p, &attr, check, true); ++} ++/** ++ * sched_setscheduler - change the scheduling policy and/or RT priority of a thread. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ * ++ * NOTE that the task may be already dead. ++ */ ++int sched_setscheduler(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, true); ++} ++ ++EXPORT_SYMBOL_GPL(sched_setscheduler); ++ ++int sched_setattr(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, true, true); ++} ++EXPORT_SYMBOL_GPL(sched_setattr); ++ ++int sched_setattr_nocheck(struct task_struct *p, const struct sched_attr *attr) ++{ ++ return __sched_setscheduler(p, attr, false, true); ++} ++ ++/** ++ * sched_setscheduler_nocheck - change the scheduling policy and/or RT priority of a thread from kernelspace. ++ * @p: the task in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Just like sched_setscheduler, only don't bother checking if the ++ * current context has permission. For example, this is needed in ++ * stop_machine(): we create temporary high priority worker threads, ++ * but our caller might not have that capability. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++int sched_setscheduler_nocheck(struct task_struct *p, int policy, ++ const struct sched_param *param) ++{ ++ return _sched_setscheduler(p, policy, param, false); ++} ++EXPORT_SYMBOL_GPL(sched_setscheduler_nocheck); ++ ++static int ++do_sched_setscheduler(pid_t pid, int policy, struct sched_param __user *param) ++{ ++ struct sched_param lparam; ++ struct task_struct *p; ++ int retval; ++ ++ if (!param || pid < 0) ++ return -EINVAL; ++ if (copy_from_user(&lparam, param, sizeof(struct sched_param))) ++ return -EFAULT; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setscheduler(p, policy, &lparam); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/* ++ * Mimics kernel/events/core.c perf_copy_attr(). ++ */ ++static int sched_copy_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr) ++{ ++ u32 size; ++ int ret; ++ ++ if (!access_ok(VERIFY_WRITE, uattr, SCHED_ATTR_SIZE_VER0)) ++ return -EFAULT; ++ ++ /* Zero the full structure, so that a short copy will be nice: */ ++ memset(attr, 0, sizeof(*attr)); ++ ++ ret = get_user(size, &uattr->size); ++ if (ret) ++ return ret; ++ ++ /* Bail out on silly large: */ ++ if (size > PAGE_SIZE) ++ goto err_size; ++ ++ /* ABI compatibility quirk: */ ++ if (!size) ++ size = SCHED_ATTR_SIZE_VER0; ++ ++ if (size < SCHED_ATTR_SIZE_VER0) ++ goto err_size; ++ ++ /* ++ * If we're handed a bigger struct than we know of, ++ * ensure all the unknown bits are 0 - i.e. new ++ * user-space does not rely on any kernel feature ++ * extensions we dont know about yet. ++ */ ++ if (size > sizeof(*attr)) { ++ unsigned char __user *addr; ++ unsigned char __user *end; ++ unsigned char val; ++ ++ addr = (void __user *)uattr + sizeof(*attr); ++ end = (void __user *)uattr + size; ++ ++ for (; addr < end; addr++) { ++ ret = get_user(val, addr); ++ if (ret) ++ return ret; ++ if (val) ++ goto err_size; ++ } ++ size = sizeof(*attr); ++ } ++ ++ ret = copy_from_user(attr, uattr, size); ++ if (ret) ++ return -EFAULT; ++ ++ /* ++ * XXX: Do we want to be lenient like existing syscalls; or do we want ++ * to be strict and return an error on out-of-bounds values? ++ */ ++ attr->sched_nice = clamp(attr->sched_nice, -20, 19); ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return 0; ++ ++err_size: ++ put_user(sizeof(*attr), &uattr->size); ++ return -E2BIG; ++} ++ ++/* ++ * sched_setparam() passes in -1 for its policy, to let the functions ++ * it calls know not to change it. ++ */ ++#define SETPARAM_POLICY -1 ++ ++/** ++ * sys_sched_setscheduler - set/change the scheduler policy and RT priority ++ * @pid: the pid in question. ++ * @policy: new policy. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setscheduler, pid_t, pid, int, policy, struct sched_param __user *, param) ++{ ++ if (policy < 0) ++ return -EINVAL; ++ ++ return do_sched_setscheduler(pid, policy, param); ++} ++ ++/** ++ * sys_sched_setparam - set/change the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the new RT priority. ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE2(sched_setparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ return do_sched_setscheduler(pid, SETPARAM_POLICY, param); ++} ++ ++/** ++ * sys_sched_setattr - same as above, but with extended sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ */ ++SYSCALL_DEFINE3(sched_setattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, flags) ++{ ++ struct sched_attr attr; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || flags) ++ return -EINVAL; ++ ++ retval = sched_copy_attr(uattr, &attr); ++ if (retval) ++ return retval; ++ ++ if ((int)attr.sched_policy < 0) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (p != NULL) ++ retval = sched_setattr(p, &attr); ++ rcu_read_unlock(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the policy (scheduling class) of a thread ++ * @pid: the pid in question. ++ * ++ * Return: On success, the policy of the thread. Otherwise, a negative error ++ * code. ++ */ ++SYSCALL_DEFINE1(sched_getscheduler, pid_t, pid) ++{ ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (pid < 0) ++ goto out_nounlock; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (p) { ++ retval = security_task_getscheduler(p); ++ if (!retval) ++ retval = p->policy; ++ } ++ rcu_read_unlock(); ++ ++out_nounlock: ++ return retval; ++} ++ ++/** ++ * sys_sched_getscheduler - get the RT priority of a thread ++ * @pid: the pid in question. ++ * @param: structure containing the RT priority. ++ * ++ * Return: On success, 0 and the RT priority is in @param. Otherwise, an error ++ * code. ++ */ ++SYSCALL_DEFINE2(sched_getparam, pid_t, pid, struct sched_param __user *, param) ++{ ++ struct sched_param lp = { .sched_priority = 0 }; ++ struct task_struct *p; ++ int retval = -EINVAL; ++ ++ if (!param || pid < 0) ++ goto out_nounlock; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ if (has_rt_policy(p)) ++ lp.sched_priority = p->rt_priority; ++ rcu_read_unlock(); ++ ++ /* ++ * This one might sleep, we cannot do it with a spinlock held ... ++ */ ++ retval = copy_to_user(param, &lp, sizeof(*param)) ? -EFAULT : 0; ++ ++out_nounlock: ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++static int sched_read_attr(struct sched_attr __user *uattr, ++ struct sched_attr *attr, ++ unsigned int usize) ++{ ++ int ret; ++ ++ if (!access_ok(VERIFY_WRITE, uattr, usize)) ++ return -EFAULT; ++ ++ /* ++ * If we're handed a smaller struct than we know of, ++ * ensure all the unknown bits are 0 - i.e. old ++ * user-space does not get uncomplete information. ++ */ ++ if (usize < sizeof(*attr)) { ++ unsigned char *addr; ++ unsigned char *end; ++ ++ addr = (void *)attr + usize; ++ end = (void *)attr + sizeof(*attr); ++ ++ for (; addr < end; addr++) { ++ if (*addr) ++ return -EFBIG; ++ } ++ ++ attr->size = usize; ++ } ++ ++ ret = copy_to_user(uattr, attr, attr->size); ++ if (ret) ++ return -EFAULT; ++ ++ /* sched/core.c uses zero here but we already know ret is zero */ ++ return ret; ++} ++ ++/** ++ * sys_sched_getattr - similar to sched_getparam, but with sched_attr ++ * @pid: the pid in question. ++ * @uattr: structure containing the extended parameters. ++ * @size: sizeof(attr) for fwd/bwd comp. ++ * @flags: for future extension. ++ */ ++SYSCALL_DEFINE4(sched_getattr, pid_t, pid, struct sched_attr __user *, uattr, ++ unsigned int, size, unsigned int, flags) ++{ ++ struct sched_attr attr = { ++ .size = sizeof(struct sched_attr), ++ }; ++ struct task_struct *p; ++ int retval; ++ ++ if (!uattr || pid < 0 || size > PAGE_SIZE || ++ size < SCHED_ATTR_SIZE_VER0 || flags) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ retval = -ESRCH; ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ attr.sched_policy = p->policy; ++ if (rt_task(p)) ++ attr.sched_priority = p->rt_priority; ++ else ++ attr.sched_nice = task_nice(p); ++ ++ rcu_read_unlock(); ++ ++ retval = sched_read_attr(uattr, &attr, size); ++ return retval; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++long sched_setaffinity(pid_t pid, const struct cpumask *in_mask) ++{ ++ cpumask_var_t cpus_allowed, new_mask; ++ struct task_struct *p; ++ int retval; ++ ++ rcu_read_lock(); ++ ++ p = find_process_by_pid(pid); ++ if (!p) { ++ rcu_read_unlock(); ++ return -ESRCH; ++ } ++ ++ /* Prevent p going away */ ++ get_task_struct(p); ++ rcu_read_unlock(); ++ ++ if (p->flags & PF_NO_SETAFFINITY) { ++ retval = -EINVAL; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&cpus_allowed, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_put_task; ++ } ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) { ++ retval = -ENOMEM; ++ goto out_free_cpus_allowed; ++ } ++ retval = -EPERM; ++ if (!check_same_owner(p)) { ++ rcu_read_lock(); ++ if (!ns_capable(__task_cred(p)->user_ns, CAP_SYS_NICE)) { ++ rcu_read_unlock(); ++ goto out_unlock; ++ } ++ rcu_read_unlock(); ++ } ++ ++ retval = security_task_setscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ cpuset_cpus_allowed(p, cpus_allowed); ++ cpumask_and(new_mask, in_mask, cpus_allowed); ++again: ++ retval = __set_cpus_allowed_ptr(p, new_mask, true); ++ ++ if (!retval) { ++ cpuset_cpus_allowed(p, cpus_allowed); ++ if (!cpumask_subset(new_mask, cpus_allowed)) { ++ /* ++ * We must have raced with a concurrent cpuset ++ * update. Just reset the cpus_allowed to the ++ * cpuset's cpus_allowed ++ */ ++ cpumask_copy(new_mask, cpus_allowed); ++ goto again; ++ } ++ } ++out_unlock: ++ free_cpumask_var(new_mask); ++out_free_cpus_allowed: ++ free_cpumask_var(cpus_allowed); ++out_put_task: ++ put_task_struct(p); ++ return retval; ++} ++ ++static int get_user_cpu_mask(unsigned long __user *user_mask_ptr, unsigned len, ++ cpumask_t *new_mask) ++{ ++ if (len < cpumask_size()) ++ cpumask_clear(new_mask); ++ else if (len > cpumask_size()) ++ len = cpumask_size(); ++ ++ return copy_from_user(new_mask, user_mask_ptr, len) ? -EFAULT : 0; ++} ++ ++ ++/** ++ * sys_sched_setaffinity - set the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to the new CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_setaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ cpumask_var_t new_mask; ++ int retval; ++ ++ if (!alloc_cpumask_var(&new_mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ retval = get_user_cpu_mask(user_mask_ptr, len, new_mask); ++ if (retval == 0) ++ retval = sched_setaffinity(pid, new_mask); ++ free_cpumask_var(new_mask); ++ return retval; ++} ++ ++long sched_getaffinity(pid_t pid, cpumask_t *mask) ++{ ++ struct task_struct *p; ++ unsigned long flags; ++ int retval; ++ ++ get_online_cpus(); ++ rcu_read_lock(); ++ ++ retval = -ESRCH; ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ raw_spin_lock_irqsave(&p->pi_lock, flags); ++ cpumask_and(mask, &p->cpus_allowed, cpu_active_mask); ++ raw_spin_unlock_irqrestore(&p->pi_lock, flags); ++ ++out_unlock: ++ rcu_read_unlock(); ++ put_online_cpus(); ++ ++ return retval; ++} ++ ++/** ++ * sys_sched_getaffinity - get the CPU affinity of a process ++ * @pid: pid of the process ++ * @len: length in bytes of the bitmask pointed to by user_mask_ptr ++ * @user_mask_ptr: user-space pointer to hold the current CPU mask ++ * ++ * Return: 0 on success. An error code otherwise. ++ */ ++SYSCALL_DEFINE3(sched_getaffinity, pid_t, pid, unsigned int, len, ++ unsigned long __user *, user_mask_ptr) ++{ ++ int ret; ++ cpumask_var_t mask; ++ ++ if ((len * BITS_PER_BYTE) < nr_cpu_ids) ++ return -EINVAL; ++ if (len & (sizeof(unsigned long)-1)) ++ return -EINVAL; ++ ++ if (!alloc_cpumask_var(&mask, GFP_KERNEL)) ++ return -ENOMEM; ++ ++ ret = sched_getaffinity(pid, mask); ++ if (ret == 0) { ++ unsigned int retlen = min(len, cpumask_size()); ++ ++ if (copy_to_user(user_mask_ptr, mask, retlen)) ++ ret = -EFAULT; ++ else ++ ret = retlen; ++ } ++ free_cpumask_var(mask); ++ ++ return ret; ++} ++ ++/** ++ * sys_sched_yield - yield the current processor to other threads. ++ * ++ * This function yields the current CPU to other tasks. It does this by ++ * scheduling away the current task. If it still has the earliest deadline ++ * it will be scheduled again as the next task. ++ * ++ * Return: 0. ++ */ ++static void do_sched_yield(void) ++{ ++ struct rq *rq; ++ ++ if (!sched_yield_type) ++ return; ++ ++ local_irq_disable(); ++ rq = this_rq(); ++ rq_lock(rq); ++ ++ if (sched_yield_type > 1) ++ time_slice_expired(current, rq); ++ schedstat_inc(rq->yld_count); ++ ++ /* ++ * Since we are going to call schedule() anyway, there's ++ * no need to preempt or enable interrupts: ++ */ ++ preempt_disable(); ++ rq_unlock(rq); ++ sched_preempt_enable_no_resched(); ++ ++ schedule(); ++} ++ ++SYSCALL_DEFINE0(sched_yield) ++{ ++ do_sched_yield(); ++ return 0; ++} ++ ++#ifndef CONFIG_PREEMPT ++int __sched _cond_resched(void) ++{ ++ if (should_resched(0)) { ++ preempt_schedule_common(); ++ return 1; ++ } ++ rcu_all_qs(); ++ return 0; ++} ++EXPORT_SYMBOL(_cond_resched); ++#endif ++ ++/* ++ * __cond_resched_lock() - if a reschedule is pending, drop the given lock, ++ * call schedule, and on return reacquire the lock. ++ * ++ * This works OK both with and without CONFIG_PREEMPT. We do strange low-level ++ * operations here to prevent schedule() from being called twice (once via ++ * spin_unlock(), once by hand). ++ */ ++int __cond_resched_lock(spinlock_t *lock) ++{ ++ int resched = should_resched(PREEMPT_LOCK_OFFSET); ++ int ret = 0; ++ ++ lockdep_assert_held(lock); ++ ++ if (spin_needbreak(lock) || resched) { ++ spin_unlock(lock); ++ if (resched) ++ preempt_schedule_common(); ++ else ++ cpu_relax(); ++ ret = 1; ++ spin_lock(lock); ++ } ++ return ret; ++} ++EXPORT_SYMBOL(__cond_resched_lock); ++ ++/** ++ * yield - yield the current processor to other threads. ++ * ++ * Do not ever use this function, there's a 99% chance you're doing it wrong. ++ * ++ * The scheduler is at all times free to pick the calling task as the most ++ * eligible task to run, if removing the yield() call from your code breaks ++ * it, its already broken. ++ * ++ * Typical broken usage is: ++ * ++ * while (!event) ++ * yield(); ++ * ++ * where one assumes that yield() will let 'the other' process run that will ++ * make event true. If the current task is a SCHED_FIFO task that will never ++ * happen. Never use yield() as a progress guarantee!! ++ * ++ * If you want to use yield() to wait for something, use wait_event(). ++ * If you want to use yield() to be 'nice' for others, use cond_resched(). ++ * If you still want to use yield(), do not! ++ */ ++void __sched yield(void) ++{ ++ set_current_state(TASK_RUNNING); ++ do_sched_yield(); ++} ++EXPORT_SYMBOL(yield); ++ ++/** ++ * yield_to - yield the current processor to another thread in ++ * your thread group, or accelerate that thread toward the ++ * processor it's on. ++ * @p: target task ++ * @preempt: whether task preemption is allowed or not ++ * ++ * It's the caller's job to ensure that the target task struct ++ * can't go away on us before we can do any checks. ++ * ++ * Return: ++ * true (>0) if we indeed boosted the target task. ++ * false (0) if we failed to boost the target. ++ * -ESRCH if there's no task to yield to. ++ */ ++int __sched yield_to(struct task_struct *p, bool preempt) ++{ ++ struct task_struct *rq_p; ++ struct rq *rq, *p_rq; ++ unsigned long flags; ++ int yielded = 0; ++ ++ local_irq_save(flags); ++ rq = this_rq(); ++ ++again: ++ p_rq = task_rq(p); ++ /* ++ * If we're the only runnable task on the rq and target rq also ++ * has only one task, there's absolutely no point in yielding. ++ */ ++ if (task_running(p_rq, p) || p->state) { ++ yielded = -ESRCH; ++ goto out_irq; ++ } ++ ++ double_rq_lock(rq, p_rq); ++ if (unlikely(task_rq(p) != p_rq)) { ++ double_rq_unlock(rq, p_rq); ++ goto again; ++ } ++ ++ yielded = 1; ++ schedstat_inc(rq->yld_count); ++ rq_p = rq->curr; ++ if (p->deadline > rq_p->deadline) ++ p->deadline = rq_p->deadline; ++ p->time_slice += rq_p->time_slice; ++ if (p->time_slice > timeslice()) ++ p->time_slice = timeslice(); ++ time_slice_expired(rq_p, rq); ++ if (preempt && rq != p_rq) ++ resched_task(p_rq->curr); ++ double_rq_unlock(rq, p_rq); ++out_irq: ++ local_irq_restore(flags); ++ ++ if (yielded > 0) ++ schedule(); ++ return yielded; ++} ++EXPORT_SYMBOL_GPL(yield_to); ++ ++int io_schedule_prepare(void) ++{ ++ int old_iowait = current->in_iowait; ++ ++ current->in_iowait = 1; ++ blk_schedule_flush_plug(current); ++ ++ return old_iowait; ++} ++ ++void io_schedule_finish(int token) ++{ ++ current->in_iowait = token; ++} ++ ++/* ++ * This task is about to go to sleep on IO. Increment rq->nr_iowait so ++ * that process accounting knows that this is a task in IO wait state. ++ * ++ * But don't do that if it is a deliberate, throttling IO wait (this task ++ * has set its backing_dev_info: the queue against which it should throttle) ++ */ ++ ++long __sched io_schedule_timeout(long timeout) ++{ ++ int token; ++ long ret; ++ ++ token = io_schedule_prepare(); ++ ret = schedule_timeout(timeout); ++ io_schedule_finish(token); ++ ++ return ret; ++} ++EXPORT_SYMBOL(io_schedule_timeout); ++ ++void io_schedule(void) ++{ ++ int token; ++ ++ token = io_schedule_prepare(); ++ schedule(); ++ io_schedule_finish(token); ++} ++EXPORT_SYMBOL(io_schedule); ++ ++/** ++ * sys_sched_get_priority_max - return maximum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the maximum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_max, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = MAX_USER_RT_PRIO-1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++/** ++ * sys_sched_get_priority_min - return minimum RT priority. ++ * @policy: scheduling class. ++ * ++ * Return: On success, this syscall returns the minimum ++ * rt_priority that can be used by a given scheduling class. ++ * On failure, a negative error code is returned. ++ */ ++SYSCALL_DEFINE1(sched_get_priority_min, int, policy) ++{ ++ int ret = -EINVAL; ++ ++ switch (policy) { ++ case SCHED_FIFO: ++ case SCHED_RR: ++ ret = 1; ++ break; ++ case SCHED_NORMAL: ++ case SCHED_BATCH: ++ case SCHED_ISO: ++ case SCHED_IDLEPRIO: ++ ret = 0; ++ break; ++ } ++ return ret; ++} ++ ++static int sched_rr_get_interval(pid_t pid, struct timespec64 *t) ++{ ++ struct task_struct *p; ++ unsigned int time_slice; ++ unsigned long flags; ++ struct rq *rq; ++ int retval; ++ ++ if (pid < 0) ++ return -EINVAL; ++ ++ retval = -ESRCH; ++ rcu_read_lock(); ++ p = find_process_by_pid(pid); ++ if (!p) ++ goto out_unlock; ++ ++ retval = security_task_getscheduler(p); ++ if (retval) ++ goto out_unlock; ++ ++ rq = task_rq_lock(p, &flags); ++ time_slice = p->policy == SCHED_FIFO ? 0 : MS_TO_NS(task_timeslice(p)); ++ task_rq_unlock(rq, p, &flags); ++ ++ rcu_read_unlock(); ++ *t = ns_to_timespec64(time_slice); ++ return 0; ++ ++out_unlock: ++ rcu_read_unlock(); ++ return retval; ++} ++ ++/** ++ * sys_sched_rr_get_interval - return the default timeslice of a process. ++ * @pid: pid of the process. ++ * @interval: userspace pointer to the timeslice value. ++ * ++ * this syscall writes the default timeslice value of a given process ++ * into the user-space timespec buffer. A value of '0' means infinity. ++ * ++ * Return: On success, 0 and the timeslice is in @interval. Otherwise, ++ * an error code. ++ */ ++SYSCALL_DEFINE2(sched_rr_get_interval, pid_t, pid, ++ struct timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = put_timespec64(&t, interval); ++ ++ return retval; ++} ++ ++#ifdef CONFIG_COMPAT ++COMPAT_SYSCALL_DEFINE2(sched_rr_get_interval, ++ compat_pid_t, pid, ++ struct compat_timespec __user *, interval) ++{ ++ struct timespec64 t; ++ int retval = sched_rr_get_interval(pid, &t); ++ ++ if (retval == 0) ++ retval = compat_put_timespec64(&t, interval); ++ return retval; ++} ++#endif ++ ++void sched_show_task(struct task_struct *p) ++{ ++ unsigned long free = 0; ++ int ppid; ++ ++ if (!try_get_task_stack(p)) ++ return; ++ ++ printk(KERN_INFO "%-15.15s %c", p->comm, task_state_to_char(p)); ++ ++ if (p->state == TASK_RUNNING) ++ printk(KERN_CONT " running task "); ++#ifdef CONFIG_DEBUG_STACK_USAGE ++ free = stack_not_used(p); ++#endif ++ ppid = 0; ++ rcu_read_lock(); ++ if (pid_alive(p)) ++ ppid = task_pid_nr(rcu_dereference(p->real_parent)); ++ rcu_read_unlock(); ++ printk(KERN_CONT "%5lu %5d %6d 0x%08lx\n", free, ++ task_pid_nr(p), ppid, ++ (unsigned long)task_thread_info(p)->flags); ++ ++ print_worker_info(KERN_INFO, p); ++ show_stack(p, NULL); ++ put_task_stack(p); ++} ++EXPORT_SYMBOL_GPL(sched_show_task); ++ ++static inline bool ++state_filter_match(unsigned long state_filter, struct task_struct *p) ++{ ++ /* no filter, everything matches */ ++ if (!state_filter) ++ return true; ++ ++ /* filter, but doesn't match */ ++ if (!(p->state & state_filter)) ++ return false; ++ ++ /* ++ * When looking for TASK_UNINTERRUPTIBLE skip TASK_IDLE (allows ++ * TASK_KILLABLE). ++ */ ++ if (state_filter == TASK_UNINTERRUPTIBLE && p->state == TASK_IDLE) ++ return false; ++ ++ return true; ++} ++ ++void show_state_filter(unsigned long state_filter) ++{ ++ struct task_struct *g, *p; ++ ++#if BITS_PER_LONG == 32 ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#else ++ printk(KERN_INFO ++ " task PC stack pid father\n"); ++#endif ++ rcu_read_lock(); ++ for_each_process_thread(g, p) { ++ /* ++ * reset the NMI-timeout, listing all files on a slow ++ * console might take a lot of time: ++ * Also, reset softlockup watchdogs on all CPUs, because ++ * another CPU might be blocked waiting for us to process ++ * an IPI. ++ */ ++ touch_nmi_watchdog(); ++ touch_all_softlockup_watchdogs(); ++ if (state_filter_match(state_filter, p)) ++ sched_show_task(p); ++ } ++ ++ rcu_read_unlock(); ++ /* ++ * Only show locks if all tasks are dumped: ++ */ ++ if (!state_filter) ++ debug_show_all_locks(); ++} ++ ++void dump_cpu_task(int cpu) ++{ ++ pr_info("Task dump for CPU %d:\n", cpu); ++ sched_show_task(cpu_curr(cpu)); ++} ++ ++#ifdef CONFIG_SMP ++void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ cpumask_copy(&p->cpus_allowed, new_mask); ++ p->nr_cpus_allowed = cpumask_weight(new_mask); ++} ++ ++void __do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ struct rq *rq = task_rq(p); ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ cpumask_copy(&p->cpus_allowed, new_mask); ++ ++ if (task_queued(p)) { ++ /* ++ * Because __kthread_bind() calls this on blocked tasks without ++ * holding rq->lock. ++ */ ++ lockdep_assert_held(rq->lock); ++ } ++} ++ ++/* ++ * Calling do_set_cpus_allowed from outside the scheduler code should not be ++ * called on a running or queued task. We should be holding pi_lock. ++ */ ++void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ __do_set_cpus_allowed(p, new_mask); ++ if (needs_other_cpu(p, task_cpu(p))) { ++ struct rq *rq; ++ ++ rq = __task_rq_lock(p); ++ set_task_cpu(p, valid_task_cpu(p)); ++ resched_task(p); ++ __task_rq_unlock(rq); ++ } ++} ++#endif ++ ++/** ++ * init_idle - set up an idle thread for a given CPU ++ * @idle: task in question ++ * @cpu: cpu the idle task belongs to ++ * ++ * NOTE: this function does not set the idle thread's NEED_RESCHED ++ * flag, to make booting more robust. ++ */ ++void init_idle(struct task_struct *idle, int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ raw_spin_lock_irqsave(&idle->pi_lock, flags); ++ raw_spin_lock(rq->lock); ++ idle->last_ran = rq->niffies; ++ time_slice_expired(idle, rq); ++ idle->state = TASK_RUNNING; ++ /* Setting prio to illegal value shouldn't matter when never queued */ ++ idle->prio = PRIO_LIMIT; ++ ++ kasan_unpoison_task_stack(idle); ++ ++#ifdef CONFIG_SMP ++ /* ++ * It's possible that init_idle() gets called multiple times on a task, ++ * in that case do_set_cpus_allowed() will not do the right thing. ++ * ++ * And since this is boot we can forgo the serialisation. ++ */ ++ set_cpus_allowed_common(idle, cpumask_of(cpu)); ++#ifdef CONFIG_SMT_NICE ++ idle->smt_bias = 0; ++#endif ++#endif ++ set_rq_task(rq, idle); ++ ++ /* Silence PROVE_RCU */ ++ rcu_read_lock(); ++ set_task_cpu(idle, cpu); ++ rcu_read_unlock(); ++ ++ rq->curr = rq->idle = idle; ++ idle->on_rq = TASK_ON_RQ_QUEUED; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&idle->pi_lock, flags); ++ ++ /* Set the preempt count _outside_ the spinlocks! */ ++ init_idle_preempt_count(idle, cpu); ++ ++ ftrace_graph_init_idle_task(idle, cpu); ++ vtime_init_idle(idle, cpu); ++#ifdef CONFIG_SMP ++ sprintf(idle->comm, "%s/%d", INIT_TASK_COMM, cpu); ++#endif ++} ++ ++int cpuset_cpumask_can_shrink(const struct cpumask __maybe_unused *cur, ++ const struct cpumask __maybe_unused *trial) ++{ ++ return 1; ++} ++ ++int task_can_attach(struct task_struct *p, ++ const struct cpumask *cs_cpus_allowed) ++{ ++ int ret = 0; ++ ++ /* ++ * Kthreads which disallow setaffinity shouldn't be moved ++ * to a new cpuset; we don't want to change their CPU ++ * affinity and isolating such threads by their set of ++ * allowed nodes is unnecessary. Thus, cpusets are not ++ * applicable for such threads. This prevents checking for ++ * success of set_cpus_allowed_ptr() on all attached tasks ++ * before cpus_allowed may be changed. ++ */ ++ if (p->flags & PF_NO_SETAFFINITY) ++ ret = -EINVAL; ++ ++ return ret; ++} ++ ++void resched_cpu(int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ rq_lock_irqsave(rq, &flags); ++ if (cpu_online(cpu) || cpu == smp_processor_id()) ++ resched_curr(rq); ++ rq_unlock_irqrestore(rq, &flags); ++} ++ ++#ifdef CONFIG_SMP ++#ifdef CONFIG_NO_HZ_COMMON ++void nohz_balance_enter_idle(int cpu) ++{ ++} ++ ++void select_nohz_load_balancer(int stop_tick) ++{ ++} ++ ++void set_cpu_sd_state_idle(void) {} ++ ++/* ++ * In the semi idle case, use the nearest busy CPU for migrating timers ++ * from an idle CPU. This is good for power-savings. ++ * ++ * We don't do similar optimization for completely idle system, as ++ * selecting an idle CPU will add more delays to the timers than intended ++ * (as that CPU's timer base may not be uptodate wrt jiffies etc). ++ */ ++int get_nohz_timer_target(void) ++{ ++ int i, cpu = smp_processor_id(); ++ struct sched_domain *sd; ++ ++ if (!idle_cpu(cpu) && housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ return cpu; ++ ++ rcu_read_lock(); ++ for_each_domain(cpu, sd) { ++ for_each_cpu(i, sched_domain_span(sd)) { ++ if (cpu == i) ++ continue; ++ ++ if (!idle_cpu(i) && housekeeping_cpu(i, HK_FLAG_TIMER)) { ++ cpu = i; ++ cpu = i; ++ goto unlock; ++ } ++ } ++ } ++ ++ if (!housekeeping_cpu(cpu, HK_FLAG_TIMER)) ++ cpu = housekeeping_any_cpu(HK_FLAG_TIMER); ++unlock: ++ rcu_read_unlock(); ++ return cpu; ++} ++ ++/* ++ * When add_timer_on() enqueues a timer into the timer wheel of an ++ * idle CPU then this timer might expire before the next timer event ++ * which is scheduled to wake up that CPU. In case of a completely ++ * idle system the next event might even be infinite time into the ++ * future. wake_up_idle_cpu() ensures that the CPU is woken up and ++ * leaves the inner idle loop so the newly added timer is taken into ++ * account when the CPU goes back to idle and evaluates the timer ++ * wheel for the next timer event. ++ */ ++void wake_up_idle_cpu(int cpu) ++{ ++ if (cpu == smp_processor_id()) ++ return; ++ ++ if (set_nr_and_not_polling(cpu_rq(cpu)->idle)) ++ smp_sched_reschedule(cpu); ++ else ++ trace_sched_wake_idle_without_ipi(cpu); ++} ++ ++static bool wake_up_full_nohz_cpu(int cpu) ++{ ++ /* ++ * We just need the target to call irq_exit() and re-evaluate ++ * the next tick. The nohz full kick at least implies that. ++ * If needed we can still optimize that later with an ++ * empty IRQ. ++ */ ++ if (cpu_is_offline(cpu)) ++ return true; /* Don't try to wake offline CPUs. */ ++ if (tick_nohz_full_cpu(cpu)) { ++ if (cpu != smp_processor_id() || ++ tick_nohz_tick_stopped()) ++ tick_nohz_full_kick_cpu(cpu); ++ return true; ++ } ++ ++ return false; ++} ++ ++/* ++ * Wake up the specified CPU. If the CPU is going offline, it is the ++ * caller's responsibility to deal with the lost wakeup, for example, ++ * by hooking into the CPU_DEAD notifier like timers and hrtimers do. ++ */ ++void wake_up_nohz_cpu(int cpu) ++{ ++ if (!wake_up_full_nohz_cpu(cpu)) ++ wake_up_idle_cpu(cpu); ++} ++#endif /* CONFIG_NO_HZ_COMMON */ ++ ++/* ++ * Change a given task's CPU affinity. Migrate the thread to a ++ * proper CPU and schedule it away if the CPU it's executing on ++ * is removed from the allowed bitmask. ++ * ++ * NOTE: the caller must have a valid reference to the task, the ++ * task must not exit() & deallocate itself prematurely. The ++ * call is not atomic; no spinlocks may be held. ++ */ ++static int __set_cpus_allowed_ptr(struct task_struct *p, ++ const struct cpumask *new_mask, bool check) ++{ ++ const struct cpumask *cpu_valid_mask = cpu_active_mask; ++ bool queued = false, running_wrong = false, kthread; ++ struct cpumask old_mask; ++ unsigned long flags; ++ int cpu, ret = 0; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &flags); ++ update_rq_clock(rq); ++ ++ kthread = !!(p->flags & PF_KTHREAD); ++ if (kthread) { ++ /* ++ * Kernel threads are allowed on online && !active CPUs ++ */ ++ cpu_valid_mask = cpu_online_mask; ++ } ++ ++ /* ++ * Must re-check here, to close a race against __kthread_bind(), ++ * sched_setaffinity() is not guaranteed to observe the flag. ++ */ ++ if (check && (p->flags & PF_NO_SETAFFINITY)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ cpumask_copy(&old_mask, &p->cpus_allowed); ++ if (cpumask_equal(&old_mask, new_mask)) ++ goto out; ++ ++ if (!cpumask_intersects(new_mask, cpu_valid_mask)) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ queued = task_queued(p); ++ __do_set_cpus_allowed(p, new_mask); ++ ++ if (kthread) { ++ /* ++ * For kernel threads that do indeed end up on online && ++ * !active we want to ensure they are strict per-CPU threads. ++ */ ++ WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) && ++ !cpumask_intersects(new_mask, cpu_active_mask) && ++ p->nr_cpus_allowed != 1); ++ } ++ ++ /* Can the task run on the task's current CPU? If so, we're done */ ++ if (cpumask_test_cpu(task_cpu(p), new_mask)) ++ goto out; ++ ++ if (task_running(rq, p)) { ++ /* Task is running on the wrong cpu now, reschedule it. */ ++ if (rq == this_rq()) { ++ cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ set_task_cpu(p, cpu); ++ set_tsk_need_resched(p); ++ running_wrong = true; ++ } else ++ resched_task(p); ++ } else { ++ cpu = cpumask_any_and(cpu_valid_mask, new_mask); ++ if (queued) { ++ /* ++ * Switch runqueue locks after dequeueing the task ++ * here while still holding the pi_lock to be holding ++ * the correct lock for enqueueing. ++ */ ++ dequeue_task(rq, p, 0); ++ rq_unlock(rq); ++ ++ rq = cpu_rq(cpu); ++ rq_lock(rq); ++ } ++ set_task_cpu(p, cpu); ++ if (queued) ++ enqueue_task(rq, p, 0); ++ } ++ if (queued) ++ try_preempt(p, rq); ++ if (running_wrong) ++ preempt_disable(); ++out: ++ task_rq_unlock(rq, p, &flags); ++ ++ if (running_wrong) { ++ __schedule(true); ++ preempt_enable(); ++ } ++ ++ return ret; ++} ++ ++int set_cpus_allowed_ptr(struct task_struct *p, const struct cpumask *new_mask) ++{ ++ return __set_cpus_allowed_ptr(p, new_mask, false); ++} ++EXPORT_SYMBOL_GPL(set_cpus_allowed_ptr); ++ ++#ifdef CONFIG_HOTPLUG_CPU ++/* ++ * Run through task list and find tasks affined to the dead cpu, then remove ++ * that cpu from the list, enable cpu0 and set the zerobound flag. Must hold ++ * cpu 0 and src_cpu's runqueue locks. ++ */ ++static void bind_zero(int src_cpu) ++{ ++ struct task_struct *p, *t; ++ struct rq *rq0; ++ int bound = 0; ++ ++ if (src_cpu == 0) ++ return; ++ ++ rq0 = cpu_rq(0); ++ ++ do_each_thread(t, p) { ++ if (cpumask_test_cpu(src_cpu, &p->cpus_allowed)) { ++ bool local = (task_cpu(p) == src_cpu); ++ struct rq *rq = task_rq(p); ++ ++ /* task_running is the cpu stopper thread */ ++ if (local && task_running(rq, p)) ++ continue; ++ atomic_clear_cpu(src_cpu, &p->cpus_allowed); ++ atomic_set_cpu(0, &p->cpus_allowed); ++ p->zerobound = true; ++ bound++; ++ if (local) { ++ bool queued = task_queued(p); ++ ++ if (queued) ++ dequeue_task(rq, p, 0); ++ set_task_cpu(p, 0); ++ if (queued) ++ enqueue_task(rq0, p, 0); ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (bound) { ++ printk(KERN_INFO "Removed affinity for %d processes to cpu %d\n", ++ bound, src_cpu); ++ } ++} ++ ++/* Find processes with the zerobound flag and reenable their affinity for the ++ * CPU coming alive. */ ++static void unbind_zero(int src_cpu) ++{ ++ int unbound = 0, zerobound = 0; ++ struct task_struct *p, *t; ++ ++ if (src_cpu == 0) ++ return; ++ ++ do_each_thread(t, p) { ++ if (!p->mm) ++ p->zerobound = false; ++ if (p->zerobound) { ++ unbound++; ++ cpumask_set_cpu(src_cpu, &p->cpus_allowed); ++ /* Once every CPU affinity has been re-enabled, remove ++ * the zerobound flag */ ++ if (cpumask_subset(cpu_possible_mask, &p->cpus_allowed)) { ++ p->zerobound = false; ++ zerobound++; ++ } ++ } ++ } while_each_thread(t, p); ++ ++ if (unbound) { ++ printk(KERN_INFO "Added affinity for %d processes to cpu %d\n", ++ unbound, src_cpu); ++ } ++ if (zerobound) { ++ printk(KERN_INFO "Released forced binding to cpu0 for %d processes\n", ++ zerobound); ++ } ++} ++ ++/* ++ * Ensure that the idle task is using init_mm right before its cpu goes ++ * offline. ++ */ ++void idle_task_exit(void) ++{ ++ struct mm_struct *mm = current->active_mm; ++ ++ BUG_ON(cpu_online(smp_processor_id())); ++ ++ if (mm != &init_mm) { ++ switch_mm(mm, &init_mm, current); ++ current->active_mm = &init_mm; ++ finish_arch_post_lock_switch(); ++ } ++ mmdrop(mm); ++} ++#else /* CONFIG_HOTPLUG_CPU */ ++static void unbind_zero(int src_cpu) {} ++#endif /* CONFIG_HOTPLUG_CPU */ ++ ++void sched_set_stop_task(int cpu, struct task_struct *stop) ++{ ++ struct sched_param stop_param = { .sched_priority = STOP_PRIO }; ++ struct sched_param start_param = { .sched_priority = 0 }; ++ struct task_struct *old_stop = cpu_rq(cpu)->stop; ++ ++ if (stop) { ++ /* ++ * Make it appear like a SCHED_FIFO task, its something ++ * userspace knows about and won't get confused about. ++ * ++ * Also, it will make PI more or less work without too ++ * much confusion -- but then, stop work should not ++ * rely on PI working anyway. ++ */ ++ sched_setscheduler_nocheck(stop, SCHED_FIFO, &stop_param); ++ } ++ ++ cpu_rq(cpu)->stop = stop; ++ ++ if (old_stop) { ++ /* ++ * Reset it back to a normal scheduling policy so that ++ * it can die in pieces. ++ */ ++ sched_setscheduler_nocheck(old_stop, SCHED_NORMAL, &start_param); ++ } ++} ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++ ++static struct ctl_table sd_ctl_dir[] = { ++ { ++ .procname = "sched_domain", ++ .mode = 0555, ++ }, ++ {} ++}; ++ ++static struct ctl_table sd_ctl_root[] = { ++ { ++ .procname = "kernel", ++ .mode = 0555, ++ .child = sd_ctl_dir, ++ }, ++ {} ++}; ++ ++static struct ctl_table *sd_alloc_ctl_entry(int n) ++{ ++ struct ctl_table *entry = ++ kcalloc(n, sizeof(struct ctl_table), GFP_KERNEL); ++ ++ return entry; ++} ++ ++static void sd_free_ctl_entry(struct ctl_table **tablep) ++{ ++ struct ctl_table *entry; ++ ++ /* ++ * In the intermediate directories, both the child directory and ++ * procname are dynamically allocated and could fail but the mode ++ * will always be set. In the lowest directory the names are ++ * static strings and all have proc handlers. ++ */ ++ for (entry = *tablep; entry->mode; entry++) { ++ if (entry->child) ++ sd_free_ctl_entry(&entry->child); ++ if (entry->proc_handler == NULL) ++ kfree(entry->procname); ++ } ++ ++ kfree(*tablep); ++ *tablep = NULL; ++} ++ ++#define CPU_LOAD_IDX_MAX 5 ++static int min_load_idx = 0; ++static int max_load_idx = CPU_LOAD_IDX_MAX-1; ++ ++static void ++set_table_entry(struct ctl_table *entry, ++ const char *procname, void *data, int maxlen, ++ umode_t mode, proc_handler *proc_handler, ++ bool load_idx) ++{ ++ entry->procname = procname; ++ entry->data = data; ++ entry->maxlen = maxlen; ++ entry->mode = mode; ++ entry->proc_handler = proc_handler; ++ ++ if (load_idx) { ++ entry->extra1 = &min_load_idx; ++ entry->extra2 = &max_load_idx; ++ } ++} ++ ++static struct ctl_table * ++sd_alloc_ctl_domain_table(struct sched_domain *sd) ++{ ++ struct ctl_table *table = sd_alloc_ctl_entry(14); ++ ++ if (table == NULL) ++ return NULL; ++ ++ set_table_entry(&table[0], "min_interval", &sd->min_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[1], "max_interval", &sd->max_interval, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[2], "busy_idx", &sd->busy_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[3], "idle_idx", &sd->idle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[4], "newidle_idx", &sd->newidle_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[5], "wake_idx", &sd->wake_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[6], "forkexec_idx", &sd->forkexec_idx, ++ sizeof(int), 0644, proc_dointvec_minmax, true); ++ set_table_entry(&table[7], "busy_factor", &sd->busy_factor, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[8], "imbalance_pct", &sd->imbalance_pct, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[9], "cache_nice_tries", ++ &sd->cache_nice_tries, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[10], "flags", &sd->flags, ++ sizeof(int), 0644, proc_dointvec_minmax, false); ++ set_table_entry(&table[11], "max_newidle_lb_cost", ++ &sd->max_newidle_lb_cost, ++ sizeof(long), 0644, proc_doulongvec_minmax, false); ++ set_table_entry(&table[12], "name", sd->name, ++ CORENAME_MAX_SIZE, 0444, proc_dostring, false); ++ /* &table[13] is terminator */ ++ ++ return table; ++} ++ ++static struct ctl_table *sd_alloc_ctl_cpu_table(int cpu) ++{ ++ struct ctl_table *entry, *table; ++ struct sched_domain *sd; ++ int domain_num = 0, i; ++ char buf[32]; ++ ++ for_each_domain(cpu, sd) ++ domain_num++; ++ entry = table = sd_alloc_ctl_entry(domain_num + 1); ++ if (table == NULL) ++ return NULL; ++ ++ i = 0; ++ for_each_domain(cpu, sd) { ++ snprintf(buf, 32, "domain%d", i); ++ entry->procname = kstrdup(buf, GFP_KERNEL); ++ entry->mode = 0555; ++ entry->child = sd_alloc_ctl_domain_table(sd); ++ entry++; ++ i++; ++ } ++ return table; ++} ++ ++static cpumask_var_t sd_sysctl_cpus; ++static struct ctl_table_header *sd_sysctl_header; ++ ++void register_sched_domain_sysctl(void) ++{ ++ static struct ctl_table *cpu_entries; ++ static struct ctl_table **cpu_idx; ++ char buf[32]; ++ int i; ++ ++ if (!cpu_entries) { ++ cpu_entries = sd_alloc_ctl_entry(num_possible_cpus() + 1); ++ if (!cpu_entries) ++ return; ++ ++ WARN_ON(sd_ctl_dir[0].child); ++ sd_ctl_dir[0].child = cpu_entries; ++ } ++ ++ if (!cpu_idx) { ++ struct ctl_table *e = cpu_entries; ++ ++ cpu_idx = kcalloc(nr_cpu_ids, sizeof(struct ctl_table*), GFP_KERNEL); ++ if (!cpu_idx) ++ return; ++ ++ /* deal with sparse possible map */ ++ for_each_possible_cpu(i) { ++ cpu_idx[i] = e; ++ e++; ++ } ++ } ++ ++ if (!cpumask_available(sd_sysctl_cpus)) { ++ if (!alloc_cpumask_var(&sd_sysctl_cpus, GFP_KERNEL)) ++ return; ++ ++ /* init to possible to not have holes in @cpu_entries */ ++ cpumask_copy(sd_sysctl_cpus, cpu_possible_mask); ++ } ++ ++ for_each_cpu(i, sd_sysctl_cpus) { ++ struct ctl_table *e = cpu_idx[i]; ++ ++ if (e->child) ++ sd_free_ctl_entry(&e->child); ++ ++ if (!e->procname) { ++ snprintf(buf, 32, "cpu%d", i); ++ e->procname = kstrdup(buf, GFP_KERNEL); ++ } ++ e->mode = 0555; ++ e->child = sd_alloc_ctl_cpu_table(i); ++ ++ __cpumask_clear_cpu(i, sd_sysctl_cpus); ++ } ++ ++ WARN_ON(sd_sysctl_header); ++ sd_sysctl_header = register_sysctl_table(sd_ctl_root); ++} ++ ++void dirty_sched_domain_sysctl(int cpu) ++{ ++ if (cpumask_available(sd_sysctl_cpus)) ++ __cpumask_set_cpu(cpu, sd_sysctl_cpus); ++} ++ ++/* may be called multiple times per register */ ++void unregister_sched_domain_sysctl(void) ++{ ++ unregister_sysctl_table(sd_sysctl_header); ++ sd_sysctl_header = NULL; ++} ++#endif /* CONFIG_SYSCTL */ ++ ++void set_rq_online(struct rq *rq) ++{ ++ if (!rq->online) { ++ cpumask_set_cpu(cpu_of(rq), rq->rd->online); ++ rq->online = true; ++ } ++} ++ ++void set_rq_offline(struct rq *rq) ++{ ++ if (rq->online) { ++ int cpu = cpu_of(rq); ++ ++ cpumask_clear_cpu(cpu, rq->rd->online); ++ rq->online = false; ++ clear_cpuidle_map(cpu); ++ } ++} ++ ++/* ++ * used to mark begin/end of suspend/resume: ++ */ ++static int num_cpus_frozen; ++ ++/* ++ * Update cpusets according to cpu_active mask. If cpusets are ++ * disabled, cpuset_update_active_cpus() becomes a simple wrapper ++ * around partition_sched_domains(). ++ * ++ * If we come here as part of a suspend/resume, don't touch cpusets because we ++ * want to restore it back to its original state upon resume anyway. ++ */ ++static void cpuset_cpu_active(void) ++{ ++ if (cpuhp_tasks_frozen) { ++ /* ++ * num_cpus_frozen tracks how many CPUs are involved in suspend ++ * resume sequence. As long as this is not the last online ++ * operation in the resume sequence, just build a single sched ++ * domain, ignoring cpusets. ++ */ ++ partition_sched_domains(1, NULL, NULL); ++ if (--num_cpus_frozen) ++ return; ++ /* ++ * This is the last CPU online operation. So fall through and ++ * restore the original sched domains by considering the ++ * cpuset configurations. ++ */ ++ cpuset_force_rebuild(); ++ } ++ ++ cpuset_update_active_cpus(); ++} ++ ++static int cpuset_cpu_inactive(unsigned int cpu) ++{ ++ if (!cpuhp_tasks_frozen) { ++ cpuset_update_active_cpus(); ++ } else { ++ num_cpus_frozen++; ++ partition_sched_domains(1, NULL, NULL); ++ } ++ return 0; ++} ++ ++int sched_cpu_activate(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ set_cpu_active(cpu, true); ++ ++ if (sched_smp_initialized) { ++ sched_domains_numa_masks_set(cpu); ++ cpuset_cpu_active(); ++ } ++ ++ /* ++ * Put the rq online, if not already. This happens: ++ * ++ * 1) In the early boot process, because we build the real domains ++ * after all CPUs have been brought up. ++ * ++ * 2) At runtime, if cpuset_cpu_active() fails to rebuild the ++ * domains. ++ */ ++ rq_lock_irqsave(rq, &flags); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_online(rq); ++ } ++ unbind_zero(cpu); ++ rq_unlock_irqrestore(rq, &flags); ++ ++ return 0; ++} ++ ++int sched_cpu_deactivate(unsigned int cpu) ++{ ++ int ret; ++ ++ set_cpu_active(cpu, false); ++ /* ++ * We've cleared cpu_active_mask, wait for all preempt-disabled and RCU ++ * users of this state to go away such that all new such users will ++ * observe it. ++ * ++ * Do sync before park smpboot threads to take care the rcu boost case. ++ */ ++ synchronize_rcu_mult(call_rcu, call_rcu_sched); ++ ++ if (!sched_smp_initialized) ++ return 0; ++ ++ ret = cpuset_cpu_inactive(cpu); ++ if (ret) { ++ set_cpu_active(cpu, true); ++ return ret; ++ } ++ sched_domains_numa_masks_clear(cpu); ++ return 0; ++} ++ ++int sched_cpu_starting(unsigned int cpu) ++{ ++ sched_tick_start(cpu); ++ return 0; ++} ++ ++#ifdef CONFIG_HOTPLUG_CPU ++int sched_cpu_dying(unsigned int cpu) ++{ ++ struct rq *rq = cpu_rq(cpu); ++ unsigned long flags; ++ ++ /* Handle pending wakeups and then migrate everything off */ ++ sched_ttwu_pending(); ++ sched_tick_stop(cpu); ++ ++ local_irq_save(flags); ++ double_rq_lock(rq, cpu_rq(0)); ++ if (rq->rd) { ++ BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); ++ set_rq_offline(rq); ++ } ++ bind_zero(cpu); ++ double_rq_unlock(rq, cpu_rq(0)); ++ sched_start_tick(rq, cpu); ++ hrexpiry_clear(rq); ++ local_irq_restore(flags); ++ ++ return 0; ++} ++#endif ++ ++#if defined(CONFIG_SCHED_SMT) || defined(CONFIG_SCHED_MC) ++/* ++ * Cheaper version of the below functions in case support for SMT and MC is ++ * compiled in but CPUs have no siblings. ++ */ ++static bool sole_cpu_idle(struct rq *rq) ++{ ++ return rq_idle(rq); ++} ++#endif ++#ifdef CONFIG_SCHED_SMT ++static const cpumask_t *thread_cpumask(int cpu) ++{ ++ return topology_sibling_cpumask(cpu); ++} ++/* All this CPU's SMT siblings are idle */ ++static bool siblings_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->thread_mask, &cpu_idle_map); ++} ++#endif ++#ifdef CONFIG_SCHED_MC ++static const cpumask_t *core_cpumask(int cpu) ++{ ++ return topology_core_cpumask(cpu); ++} ++/* All this CPU's shared cache siblings are idle */ ++static bool cache_cpu_idle(struct rq *rq) ++{ ++ return cpumask_subset(&rq->core_mask, &cpu_idle_map); ++} ++#endif ++ ++enum sched_domain_level { ++ SD_LV_NONE = 0, ++ SD_LV_SIBLING, ++ SD_LV_MC, ++ SD_LV_BOOK, ++ SD_LV_CPU, ++ SD_LV_NODE, ++ SD_LV_ALLNODES, ++ SD_LV_MAX ++}; ++ ++void __init sched_init_smp(void) ++{ ++ struct rq *rq, *other_rq, *leader; ++ struct sched_domain *sd; ++ int cpu, other_cpu, i; ++#ifdef CONFIG_SCHED_SMT ++ bool smt_threads = false; ++#endif ++ sched_init_numa(); ++ ++ /* ++ * There's no userspace yet to cause hotplug operations; hence all the ++ * cpu masks are stable and all blatant races in the below code cannot ++ * happen. ++ */ ++ mutex_lock(&sched_domains_mutex); ++ sched_init_domains(cpu_active_mask); ++ mutex_unlock(&sched_domains_mutex); ++ ++ /* Move init over to a non-isolated CPU */ ++ if (set_cpus_allowed_ptr(current, housekeeping_cpumask(HK_FLAG_DOMAIN)) < 0) ++ BUG(); ++ ++ mutex_lock(&sched_domains_mutex); ++ local_irq_disable(); ++ lock_all_rqs(); ++ /* ++ * Set up the relative cache distance of each online cpu from each ++ * other in a simple array for quick lookup. Locality is determined ++ * by the closest sched_domain that CPUs are separated by. CPUs with ++ * shared cache in SMT and MC are treated as local. Separate CPUs ++ * (within the same package or physically) within the same node are ++ * treated as not local. CPUs not even in the same domain (different ++ * nodes) are treated as very distant. ++ */ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ /* First check if this cpu is in the same node */ ++ for_each_domain(cpu, sd) { ++ if (sd->level > SD_LV_MC) ++ continue; ++ leader = NULL; ++ /* Set locality to local node if not already found lower */ ++ for_each_cpu(other_cpu, sched_domain_span(sd)) { ++ if (rqshare == RQSHARE_SMP) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smp_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smp_leader = leader; ++ } ++ ++ if (rq->cpu_locality[other_cpu] > 3) ++ rq->cpu_locality[other_cpu] = 3; ++ } ++ } ++ ++ /* ++ * Each runqueue has its own function in case it doesn't have ++ * siblings of its own allowing mixed topologies. ++ */ ++#ifdef CONFIG_SCHED_MC ++ leader = NULL; ++ if (cpumask_weight(core_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->core_mask, core_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->core_mask); ++ for_each_cpu(other_cpu, core_cpumask(cpu)) { ++ if (rqshare == RQSHARE_MC) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the mc_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->mc_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > 2) ++ rq->cpu_locality[other_cpu] = 2; ++ } ++ rq->cache_idle = cache_cpu_idle; ++ } ++#endif ++#ifdef CONFIG_SCHED_SMT ++ leader = NULL; ++ if (cpumask_weight(thread_cpumask(cpu)) > 1) { ++ cpumask_copy(&rq->thread_mask, thread_cpumask(cpu)); ++ cpumask_clear_cpu(cpu, &rq->thread_mask); ++ for_each_cpu(other_cpu, thread_cpumask(cpu)) { ++ if (rqshare == RQSHARE_SMT) { ++ other_rq = cpu_rq(other_cpu); ++ ++ /* Set the smt_leader to the first CPU */ ++ if (!leader) ++ leader = rq; ++ other_rq->smt_leader = leader; ++ } ++ if (rq->cpu_locality[other_cpu] > 1) ++ rq->cpu_locality[other_cpu] = 1; ++ } ++ rq->siblings_idle = siblings_cpu_idle; ++ smt_threads = true; ++ } ++#endif ++ } ++ ++#ifdef CONFIG_SMT_NICE ++ if (smt_threads) { ++ check_siblings = &check_smt_siblings; ++ wake_siblings = &wake_smt_siblings; ++ smt_schedule = &smt_should_schedule; ++ } ++#endif ++ unlock_all_rqs(); ++ local_irq_enable(); ++ mutex_unlock(&sched_domains_mutex); ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ for_each_online_cpu(other_cpu) { ++ if (other_cpu <= cpu) ++ continue; ++ printk(KERN_DEBUG "MuQSS locality CPU %d to %d: %d\n", cpu, other_cpu, rq->cpu_locality[other_cpu]); ++ } ++ } ++ ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->smp_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "Sharing SMP runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++ ++#ifdef CONFIG_SCHED_MC ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ leader = rq->mc_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "Sharing MC runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_MC */ ++ ++#ifdef CONFIG_SCHED_SMT ++ for_each_online_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ ++ leader = rq->smt_leader; ++ ++ rq_lock(rq); ++ if (leader && rq != leader) { ++ printk(KERN_INFO "Sharing SMT runqueue from CPU %d to CPU %d\n", ++ leader->cpu, rq->cpu); ++ kfree(rq->node); ++ kfree(rq->sl); ++ kfree(rq->lock); ++ rq->node = leader->node; ++ rq->sl = leader->sl; ++ rq->lock = leader->lock; ++ barrier(); ++ /* To make up for not unlocking the freed runlock */ ++ preempt_enable(); ++ } else ++ rq_unlock(rq); ++ } ++#endif /* CONFIG_SCHED_SMT */ ++ ++ total_runqueues = 0; ++ for_each_possible_cpu(cpu) { ++ int locality, total_rqs = 0, total_cpus = 0; ++ ++ rq = cpu_rq(cpu); ++ if ( ++#ifdef CONFIG_SCHED_MC ++ (rq->mc_leader == rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (rq->smt_leader == rq) && ++#endif ++ (rq->smp_leader == rq)) ++ total_runqueues++; ++ ++ for (locality = 0; locality <= 4; locality++) { ++ int test_cpu; ++ ++ for_each_possible_cpu(test_cpu) { ++ /* Work from each CPU up instead of every rq ++ * starting at CPU 0 */ ++ other_cpu = test_cpu + cpu; ++ other_cpu %= num_possible_cpus(); ++ other_rq = cpu_rq(other_cpu); ++ ++ if (rq->cpu_locality[other_cpu] == locality) { ++ rq->cpu_order[total_cpus++] = other_rq; ++ if ( ++ ++#ifdef CONFIG_SCHED_MC ++ (other_rq->mc_leader == other_rq) && ++#endif ++#ifdef CONFIG_SCHED_SMT ++ (other_rq->smt_leader == other_rq) && ++#endif ++ (other_rq->smp_leader == other_rq)) ++ rq->rq_order[total_rqs++] = other_rq; ++ } ++ } ++ } ++ } ++ ++ for_each_possible_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < total_runqueues; i++) { ++ printk(KERN_DEBUG "CPU %d RQ order %d RQ %d\n", cpu, i, ++ rq->rq_order[i]->cpu); ++ } ++ } ++ for_each_possible_cpu(cpu) { ++ rq = cpu_rq(cpu); ++ for (i = 0; i < num_possible_cpus(); i++) { ++ printk(KERN_DEBUG "CPU %d CPU order %d RQ %d\n", cpu, i, ++ rq->cpu_order[i]->cpu); ++ } ++ } ++ switch (rqshare) { ++ case RQSHARE_SMP: ++ printk(KERN_INFO "MuQSS runqueue share type SMP total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_MC: ++ printk(KERN_INFO "MuQSS runqueue share type MC total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_SMT: ++ printk(KERN_INFO "MuQSS runqueue share type SMT total runqueues: %d\n", ++ total_runqueues); ++ break; ++ case RQSHARE_NONE: ++ printk(KERN_INFO "MuQSS runqueue share type none total runqueues: %d\n", ++ total_runqueues); ++ break; ++ } ++ ++ sched_smp_initialized = true; ++} ++#else ++void __init sched_init_smp(void) ++{ ++ sched_smp_initialized = true; ++} ++#endif /* CONFIG_SMP */ ++ ++int in_sched_functions(unsigned long addr) ++{ ++ return in_lock_functions(addr) || ++ (addr >= (unsigned long)__sched_text_start ++ && addr < (unsigned long)__sched_text_end); ++} ++ ++#ifdef CONFIG_CGROUP_SCHED ++/* task group related information */ ++struct task_group { ++ struct cgroup_subsys_state css; ++ ++ struct rcu_head rcu; ++ struct list_head list; ++ ++ struct task_group *parent; ++ struct list_head siblings; ++ struct list_head children; ++}; ++ ++/* ++ * Default task group. ++ * Every task in system belongs to this group at bootup. ++ */ ++struct task_group root_task_group; ++LIST_HEAD(task_groups); ++ ++/* Cacheline aligned slab cache for task_group */ ++static struct kmem_cache *task_group_cache __read_mostly; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++void __init sched_init(void) ++{ ++#ifdef CONFIG_SMP ++ int cpu_ids; ++#endif ++ int i; ++ struct rq *rq; ++ ++ wait_bit_init(); ++ ++ prio_ratios[0] = 128; ++ for (i = 1 ; i < NICE_WIDTH ; i++) ++ prio_ratios[i] = prio_ratios[i - 1] * 11 / 10; ++ ++ skiplist_node_init(&init_task.node); ++ ++#ifdef CONFIG_SMP ++ init_defrootdomain(); ++ cpumask_clear(&cpu_idle_map); ++#else ++ uprq = &per_cpu(runqueues, 0); ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++ task_group_cache = KMEM_CACHE(task_group, 0); ++ ++ list_add(&root_task_group.list, &task_groups); ++ INIT_LIST_HEAD(&root_task_group.children); ++ INIT_LIST_HEAD(&root_task_group.siblings); ++#endif /* CONFIG_CGROUP_SCHED */ ++ for_each_possible_cpu(i) { ++ rq = cpu_rq(i); ++ rq->node = kmalloc(sizeof(skiplist_node), GFP_ATOMIC); ++ skiplist_init(rq->node); ++ rq->sl = new_skiplist(rq->node); ++ rq->lock = kmalloc(sizeof(raw_spinlock_t), GFP_ATOMIC); ++ raw_spin_lock_init(rq->lock); ++ rq->nr_running = 0; ++ rq->nr_uninterruptible = 0; ++ rq->nr_switches = 0; ++ rq->clock = rq->old_clock = rq->last_niffy = rq->niffies = 0; ++ rq->last_jiffy = jiffies; ++ rq->user_ns = rq->nice_ns = rq->softirq_ns = rq->system_ns = ++ rq->iowait_ns = rq->idle_ns = 0; ++ rq->dither = 0; ++ set_rq_task(rq, &init_task); ++ rq->iso_ticks = 0; ++ rq->iso_refractory = false; ++#ifdef CONFIG_SMP ++ rq->smp_leader = rq; ++#ifdef CONFIG_SCHED_MC ++ rq->mc_leader = rq; ++#endif ++#ifdef CONFIG_SCHED_SMT ++ rq->smt_leader = rq; ++#endif ++ rq->sd = NULL; ++ rq->rd = NULL; ++ rq->online = false; ++ rq->cpu = i; ++ rq_attach_root(rq, &def_root_domain); ++#endif ++ init_rq_hrexpiry(rq); ++ atomic_set(&rq->nr_iowait, 0); ++ } ++ ++#ifdef CONFIG_SMP ++ cpu_ids = i; ++ /* ++ * Set the base locality for cpu cache distance calculation to ++ * "distant" (3). Make sure the distance from a CPU to itself is 0. ++ */ ++ for_each_possible_cpu(i) { ++ int j; ++ ++ rq = cpu_rq(i); ++#ifdef CONFIG_SCHED_SMT ++ rq->siblings_idle = sole_cpu_idle; ++#endif ++#ifdef CONFIG_SCHED_MC ++ rq->cache_idle = sole_cpu_idle; ++#endif ++ rq->cpu_locality = kmalloc(cpu_ids * sizeof(int *), GFP_ATOMIC); ++ for_each_possible_cpu(j) { ++ if (i == j) ++ rq->cpu_locality[j] = 0; ++ else ++ rq->cpu_locality[j] = 4; ++ } ++ rq->rq_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->cpu_order = kmalloc(cpu_ids * sizeof(struct rq *), GFP_ATOMIC); ++ rq->rq_order[0] = rq->cpu_order[0] = rq; ++ for (j = 1; j < cpu_ids; j++) ++ rq->rq_order[j] = rq->cpu_order[j] = cpu_rq(j); ++ } ++#endif ++ ++ /* ++ * The boot idle thread does lazy MMU switching as well: ++ */ ++ mmgrab(&init_mm); ++ enter_lazy_tlb(&init_mm, current); ++ ++ /* ++ * Make us the idle thread. Technically, schedule() should not be ++ * called from this thread, however somewhere below it might be, ++ * but because we are the idle thread, we just pick up running again ++ * when this runqueue becomes "idle". ++ */ ++ init_idle(current, smp_processor_id()); ++ ++#ifdef CONFIG_SMP ++ idle_thread_set_boot_cpu(); ++#endif /* SMP */ ++ ++ init_schedstats(); ++} ++ ++#ifdef CONFIG_DEBUG_ATOMIC_SLEEP ++static inline int preempt_count_equals(int preempt_offset) ++{ ++ int nested = preempt_count() + rcu_preempt_depth(); ++ ++ return (nested == preempt_offset); ++} ++ ++void __might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* ++ * Blocking primitives will set (and therefore destroy) current->state, ++ * since we will exit with TASK_RUNNING make sure we enter with it, ++ * otherwise we will destroy state. ++ */ ++ WARN_ONCE(current->state != TASK_RUNNING && current->task_state_change, ++ "do not call blocking ops when !TASK_RUNNING; " ++ "state=%lx set at [<%p>] %pS\n", ++ current->state, ++ (void *)current->task_state_change, ++ (void *)current->task_state_change); ++ ++ ___might_sleep(file, line, preempt_offset); ++} ++EXPORT_SYMBOL(__might_sleep); ++ ++void ___might_sleep(const char *file, int line, int preempt_offset) ++{ ++ /* Ratelimiting timestamp: */ ++ static unsigned long prev_jiffy; ++ ++ unsigned long preempt_disable_ip; ++ ++ /* WARN_ON_ONCE() by default, no rate limit required: */ ++ rcu_sleep_check(); ++ ++ if ((preempt_count_equals(preempt_offset) && !irqs_disabled() && ++ !is_idle_task(current)) || ++ system_state == SYSTEM_BOOTING || system_state > SYSTEM_RUNNING || ++ oops_in_progress) ++ return; ++ ++ if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) ++ return; ++ prev_jiffy = jiffies; ++ ++ /* Save this before calling printk(), since that will clobber it: */ ++ preempt_disable_ip = get_preempt_disable_ip(current); ++ ++ printk(KERN_ERR ++ "BUG: sleeping function called from invalid context at %s:%d\n", ++ file, line); ++ printk(KERN_ERR ++ "in_atomic(): %d, irqs_disabled(): %d, pid: %d, name: %s\n", ++ in_atomic(), irqs_disabled(), ++ current->pid, current->comm); ++ ++ if (task_stack_end_corrupted(current)) ++ printk(KERN_EMERG "Thread overran stack, or stack corrupted\n"); ++ ++ debug_show_held_locks(current); ++ if (irqs_disabled()) ++ print_irqtrace_events(current); ++ if (IS_ENABLED(CONFIG_DEBUG_PREEMPT) ++ && !preempt_count_equals(preempt_offset)) { ++ pr_err("Preemption disabled at:"); ++ print_ip_sym(preempt_disable_ip); ++ pr_cont("\n"); ++ } ++ dump_stack(); ++ add_taint(TAINT_WARN, LOCKDEP_STILL_OK); ++} ++EXPORT_SYMBOL(___might_sleep); ++#endif ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static inline void normalise_rt_tasks(void) ++{ ++ struct task_struct *g, *p; ++ unsigned long flags; ++ struct rq *rq; ++ ++ read_lock(&tasklist_lock); ++ for_each_process_thread(g, p) { ++ /* ++ * Only normalize user tasks: ++ */ ++ if (p->flags & PF_KTHREAD) ++ continue; ++ ++ if (!rt_task(p) && !iso_task(p)) ++ continue; ++ ++ rq = task_rq_lock(p, &flags); ++ __setscheduler(p, rq, SCHED_NORMAL, 0, false); ++ task_rq_unlock(rq, p, &flags); ++ } ++ read_unlock(&tasklist_lock); ++} ++ ++void normalize_rt_tasks(void) ++{ ++ normalise_rt_tasks(); ++} ++#endif /* CONFIG_MAGIC_SYSRQ */ ++ ++#if defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) ++/* ++ * These functions are only useful for the IA64 MCA handling, or kdb. ++ * ++ * They can only be called when the whole system has been ++ * stopped - every CPU needs to be quiescent, and no scheduling ++ * activity can take place. Using them for anything else would ++ * be a serious bug, and as a result, they aren't even visible ++ * under any other configuration. ++ */ ++ ++/** ++ * curr_task - return the current task for a given CPU. ++ * @cpu: the processor in question. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ * ++ * Return: The current task for @cpu. ++ */ ++struct task_struct *curr_task(int cpu) ++{ ++ return cpu_curr(cpu); ++} ++ ++#endif /* defined(CONFIG_IA64) || defined(CONFIG_KGDB_KDB) */ ++ ++#ifdef CONFIG_IA64 ++/** ++ * set_curr_task - set the current task for a given CPU. ++ * @cpu: the processor in question. ++ * @p: the task pointer to set. ++ * ++ * Description: This function must only be used when non-maskable interrupts ++ * are serviced on a separate stack. It allows the architecture to switch the ++ * notion of the current task on a CPU in a non-blocking manner. This function ++ * must be called with all CPU's synchronised, and interrupts disabled, the ++ * and caller must save the original value of the current task (see ++ * curr_task() above) and restore that value before reenabling interrupts and ++ * re-starting the system. ++ * ++ * ONLY VALID WHEN THE WHOLE SYSTEM IS STOPPED! ++ */ ++void ia64_set_curr_task(int cpu, struct task_struct *p) ++{ ++ cpu_curr(cpu) = p; ++} ++ ++#endif ++ ++void init_idle_bootup_task(struct task_struct *idle) ++{} ++ ++#ifdef CONFIG_SCHED_DEBUG ++__read_mostly bool sched_debug_enabled; ++ ++void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, ++ struct seq_file *m) ++{} ++ ++void proc_sched_set_task(struct task_struct *p) ++{} ++#endif ++ ++#ifdef CONFIG_SMP ++#define SCHED_LOAD_SHIFT (10) ++#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT) ++ ++unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu) ++{ ++ return SCHED_LOAD_SCALE; ++} ++ ++unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu) ++{ ++ unsigned long weight = cpumask_weight(sched_domain_span(sd)); ++ unsigned long smt_gain = sd->smt_gain; ++ ++ smt_gain /= weight; ++ ++ return smt_gain; ++} ++#endif ++ ++#ifdef CONFIG_CGROUP_SCHED ++static void sched_free_group(struct task_group *tg) ++{ ++ kmem_cache_free(task_group_cache, tg); ++} ++ ++/* allocate runqueue etc for a new task group */ ++struct task_group *sched_create_group(struct task_group *parent) ++{ ++ struct task_group *tg; ++ ++ tg = kmem_cache_alloc(task_group_cache, GFP_KERNEL | __GFP_ZERO); ++ if (!tg) ++ return ERR_PTR(-ENOMEM); ++ ++ return tg; ++} ++ ++void sched_online_group(struct task_group *tg, struct task_group *parent) ++{ ++} ++ ++/* rcu callback to free various structures associated with a task group */ ++static void sched_free_group_rcu(struct rcu_head *rhp) ++{ ++ /* Now it should be safe to free those cfs_rqs */ ++ sched_free_group(container_of(rhp, struct task_group, rcu)); ++} ++ ++void sched_destroy_group(struct task_group *tg) ++{ ++ /* Wait for possible concurrent references to cfs_rqs complete */ ++ call_rcu(&tg->rcu, sched_free_group_rcu); ++} ++ ++void sched_offline_group(struct task_group *tg) ++{ ++} ++ ++static inline struct task_group *css_tg(struct cgroup_subsys_state *css) ++{ ++ return css ? container_of(css, struct task_group, css) : NULL; ++} ++ ++static struct cgroup_subsys_state * ++cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) ++{ ++ struct task_group *parent = css_tg(parent_css); ++ struct task_group *tg; ++ ++ if (!parent) { ++ /* This is early initialization for the top cgroup */ ++ return &root_task_group.css; ++ } ++ ++ tg = sched_create_group(parent); ++ if (IS_ERR(tg)) ++ return ERR_PTR(-ENOMEM); ++ return &tg->css; ++} ++ ++/* Expose task group only after completing cgroup initialization */ ++static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ struct task_group *parent = css_tg(css->parent); ++ ++ if (parent) ++ sched_online_group(tg, parent); ++ return 0; ++} ++ ++static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ sched_offline_group(tg); ++} ++ ++static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) ++{ ++ struct task_group *tg = css_tg(css); ++ ++ /* ++ * Relies on the RCU grace period between css_released() and this. ++ */ ++ sched_free_group(tg); ++} ++ ++static void cpu_cgroup_fork(struct task_struct *task) ++{ ++} ++ ++static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) ++{ ++ return 0; ++} ++ ++static void cpu_cgroup_attach(struct cgroup_taskset *tset) ++{ ++} ++ ++static struct cftype cpu_legacy_files[] = { ++ { } /* Terminate */ ++}; ++ ++static struct cftype cpu_files[] = { ++ { } /* terminate */ ++}; ++ ++static int cpu_extra_stat_show(struct seq_file *sf, ++ struct cgroup_subsys_state *css) ++{ ++ return 0; ++} ++ ++struct cgroup_subsys cpu_cgrp_subsys = { ++ .css_alloc = cpu_cgroup_css_alloc, ++ .css_online = cpu_cgroup_css_online, ++ .css_released = cpu_cgroup_css_released, ++ .css_free = cpu_cgroup_css_free, ++ .css_extra_stat_show = cpu_extra_stat_show, ++ .fork = cpu_cgroup_fork, ++ .can_attach = cpu_cgroup_can_attach, ++ .attach = cpu_cgroup_attach, ++ .legacy_cftypes = cpu_files, ++ .legacy_cftypes = cpu_legacy_files, ++ .dfl_cftypes = cpu_files, ++ .early_init = true, ++ .threaded = true, ++}; ++#endif /* CONFIG_CGROUP_SCHED */ ++ ++#undef CREATE_TRACE_POINTS +diff -Nur a/kernel/sched/MuQSS.h b/kernel/sched/MuQSS.h +--- a/kernel/sched/MuQSS.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/sched/MuQSS.h 2019-02-09 17:46:12.001297867 +0000 +@@ -0,0 +1,881 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef MUQSS_SCHED_H ++#define MUQSS_SCHED_H ++ ++#include <linux/sched/clock.h> ++#include <linux/sched/wake_q.h> ++#include <linux/sched/signal.h> ++#include <linux/sched/mm.h> ++#include <linux/sched/cpufreq.h> ++#include <linux/sched/stat.h> ++#include <linux/sched/nohz.h> ++#include <linux/sched/debug.h> ++#include <linux/sched/hotplug.h> ++#include <linux/sched/task.h> ++#include <linux/sched/task_stack.h> ++#include <linux/sched/topology.h> ++#include <linux/sched/cputime.h> ++#include <linux/sched/init.h> ++#include <linux/sched/isolation.h> ++ ++#include <uapi/linux/sched/types.h> ++ ++#include <linux/cgroup.h> ++#include <linux/cpufreq.h> ++#include <linux/cpuidle.h> ++#include <linux/ctype.h> ++#include <linux/freezer.h> ++#include <linux/interrupt.h> ++#include <linux/kernel_stat.h> ++#include <linux/kthread.h> ++#include <linux/livepatch.h> ++#include <linux/proc_fs.h> ++#include <linux/sched.h> ++#include <linux/slab.h> ++#include <linux/skip_list.h> ++#include <linux/stackprotector.h> ++#include <linux/stop_machine.h> ++#include <linux/suspend.h> ++#include <linux/swait.h> ++#include <linux/tick.h> ++#include <linux/tsacct_kern.h> ++#include <linux/u64_stats_sync.h> ++ ++#ifdef CONFIG_PARAVIRT ++#include <asm/paravirt.h> ++#endif ++ ++#include "cpupri.h" ++ ++#ifdef CONFIG_SCHED_DEBUG ++# define SCHED_WARN_ON(x) WARN_ONCE(x, #x) ++#else ++# define SCHED_WARN_ON(x) ((void)(x)) ++#endif ++ ++/* task_struct::on_rq states: */ ++#define TASK_ON_RQ_QUEUED 1 ++#define TASK_ON_RQ_MIGRATING 2 ++ ++struct rq; ++ ++#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING) ++#define HAVE_SCHED_AVG_IRQ ++#endif ++ ++#ifdef CONFIG_SMP ++ ++static inline bool sched_asym_prefer(int a, int b) ++{ ++ return arch_asym_cpu_priority(a) > arch_asym_cpu_priority(b); ++} ++ ++/* ++ * We add the notion of a root-domain which will be used to define per-domain ++ * variables. Each exclusive cpuset essentially defines an island domain by ++ * fully partitioning the member cpus from any other cpuset. Whenever a new ++ * exclusive cpuset is created, we also create and attach a new root-domain ++ * object. ++ * ++ */ ++struct root_domain { ++ atomic_t refcount; ++ atomic_t rto_count; ++ struct rcu_head rcu; ++ cpumask_var_t span; ++ cpumask_var_t online; ++ ++ /* Indicate more than one runnable task for any CPU */ ++ bool overload; ++ ++ /* ++ * The bit corresponding to a CPU gets set here if such CPU has more ++ * than one runnable -deadline task (as it is below for RT tasks). ++ */ ++ cpumask_var_t dlo_mask; ++ atomic_t dlo_count; ++ /* Replace unused CFS structures with void */ ++ //struct dl_bw dl_bw; ++ //struct cpudl cpudl; ++ void *dl_bw; ++ void *cpudl; ++ ++ /* ++ * The "RT overload" flag: it gets set if a CPU has more than ++ * one runnable RT task. ++ */ ++ cpumask_var_t rto_mask; ++ //struct cpupri cpupri; ++ void *cpupri; ++ ++ unsigned long max_cpu_capacity; ++}; ++ ++extern struct root_domain def_root_domain; ++extern struct mutex sched_domains_mutex; ++ ++extern void init_defrootdomain(void); ++extern int sched_init_domains(const struct cpumask *cpu_map); ++extern void rq_attach_root(struct rq *rq, struct root_domain *rd); ++ ++static inline void cpupri_cleanup(void __maybe_unused *cpupri) ++{ ++} ++ ++static inline void cpudl_cleanup(void __maybe_unused *cpudl) ++{ ++} ++ ++static inline void init_dl_bw(void __maybe_unused *dl_bw) ++{ ++} ++ ++static inline int cpudl_init(void __maybe_unused *dl_bw) ++{ ++ return 0; ++} ++ ++static inline int cpupri_init(void __maybe_unused *cpupri) ++{ ++ return 0; ++} ++#endif /* CONFIG_SMP */ ++ ++/* ++ * This is the main, per-CPU runqueue data structure. ++ * This data should only be modified by the local cpu. ++ */ ++struct rq { ++ raw_spinlock_t *lock; ++ raw_spinlock_t *orig_lock; ++ ++ struct task_struct *curr, *idle, *stop; ++ struct mm_struct *prev_mm; ++ ++ unsigned int nr_running; ++ /* ++ * This is part of a global counter where only the total sum ++ * over all CPUs matters. A task can increase this counter on ++ * one CPU and if it got migrated afterwards it may decrease ++ * it on another CPU. Always updated under the runqueue lock: ++ */ ++ unsigned long nr_uninterruptible; ++ u64 nr_switches; ++ ++ /* Stored data about rq->curr to work outside rq lock */ ++ u64 rq_deadline; ++ int rq_prio; ++ ++ /* Best queued id for use outside lock */ ++ u64 best_key; ++ ++ unsigned long last_scheduler_tick; /* Last jiffy this RQ ticked */ ++ unsigned long last_jiffy; /* Last jiffy this RQ updated rq clock */ ++ u64 niffies; /* Last time this RQ updated rq clock */ ++ u64 last_niffy; /* Last niffies as updated by local clock */ ++ u64 last_jiffy_niffies; /* Niffies @ last_jiffy */ ++ ++ u64 load_update; /* When we last updated load */ ++ unsigned long load_avg; /* Rolling load average */ ++#ifdef HAVE_SCHED_AVG_IRQ ++ u64 irq_load_update; /* When we last updated IRQ load */ ++ unsigned long irq_load_avg; /* Rolling IRQ load average */ ++#endif ++#ifdef CONFIG_SMT_NICE ++ struct mm_struct *rq_mm; ++ int rq_smt_bias; /* Policy/nice level bias across smt siblings */ ++#endif ++ /* Accurate timekeeping data */ ++ unsigned long user_ns, nice_ns, irq_ns, softirq_ns, system_ns, ++ iowait_ns, idle_ns; ++ atomic_t nr_iowait; ++ ++ skiplist_node *node; ++ skiplist *sl; ++#ifdef CONFIG_SMP ++ struct task_struct *preempt; /* Preempt triggered on this task */ ++ struct task_struct *preempting; /* Hint only, what task is preempting */ ++ ++ int cpu; /* cpu of this runqueue */ ++ bool online; ++ ++ struct root_domain *rd; ++ struct sched_domain *sd; ++ ++ unsigned long cpu_capacity_orig; ++ ++ int *cpu_locality; /* CPU relative cache distance */ ++ struct rq **rq_order; /* Shared RQs ordered by relative cache distance */ ++ struct rq **cpu_order; /* RQs of discrete CPUs ordered by distance */ ++ ++ struct rq *smp_leader; /* First physical CPU per node */ ++#ifdef CONFIG_SCHED_SMT ++ struct rq *smt_leader; /* First logical CPU in SMT siblings */ ++ cpumask_t thread_mask; ++ bool (*siblings_idle)(struct rq *rq); ++ /* See if all smt siblings are idle */ ++#endif /* CONFIG_SCHED_SMT */ ++#ifdef CONFIG_SCHED_MC ++ struct rq *mc_leader; /* First logical CPU in MC siblings */ ++ cpumask_t core_mask; ++ bool (*cache_idle)(struct rq *rq); ++ /* See if all cache siblings are idle */ ++#endif /* CONFIG_SCHED_MC */ ++#endif /* CONFIG_SMP */ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++ u64 prev_irq_time; ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++#ifdef CONFIG_PARAVIRT ++ u64 prev_steal_time; ++#endif /* CONFIG_PARAVIRT */ ++#ifdef CONFIG_PARAVIRT_TIME_ACCOUNTING ++ u64 prev_steal_time_rq; ++#endif /* CONFIG_PARAVIRT_TIME_ACCOUNTING */ ++ ++ u64 clock, old_clock, last_tick; ++ u64 clock_task; ++ int dither; ++ ++ int iso_ticks; ++ bool iso_refractory; ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++ struct hrtimer hrexpiry_timer; ++#endif ++ ++ int rt_nr_running; /* Number real time tasks running */ ++#ifdef CONFIG_SCHEDSTATS ++ ++ /* latency stats */ ++ struct sched_info rq_sched_info; ++ unsigned long long rq_cpu_time; ++ /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? */ ++ ++ /* sys_sched_yield() stats */ ++ unsigned int yld_count; ++ ++ /* schedule() stats */ ++ unsigned int sched_switch; ++ unsigned int sched_count; ++ unsigned int sched_goidle; ++ ++ /* try_to_wake_up() stats */ ++ unsigned int ttwu_count; ++ unsigned int ttwu_local; ++#endif /* CONFIG_SCHEDSTATS */ ++ ++#ifdef CONFIG_SMP ++ struct llist_head wake_list; ++#endif ++ ++#ifdef CONFIG_CPU_IDLE ++ /* Must be inspected within a rcu lock section */ ++ struct cpuidle_state *idle_state; ++#endif ++}; ++ ++#ifdef CONFIG_SMP ++struct rq *cpu_rq(int cpu); ++#endif ++ ++#ifndef CONFIG_SMP ++extern struct rq *uprq; ++#define cpu_rq(cpu) (uprq) ++#define this_rq() (uprq) ++#define raw_rq() (uprq) ++#define task_rq(p) (uprq) ++#define cpu_curr(cpu) ((uprq)->curr) ++#else /* CONFIG_SMP */ ++DECLARE_PER_CPU_SHARED_ALIGNED(struct rq, runqueues); ++#define this_rq() this_cpu_ptr(&runqueues) ++#define raw_rq() raw_cpu_ptr(&runqueues) ++#define task_rq(p) cpu_rq(task_cpu(p)) ++#endif /* CONFIG_SMP */ ++ ++static inline int task_current(struct rq *rq, struct task_struct *p) ++{ ++ return rq->curr == p; ++} ++ ++static inline int task_running(struct rq *rq, struct task_struct *p) ++{ ++#ifdef CONFIG_SMP ++ return p->on_cpu; ++#else ++ return task_current(rq, p); ++#endif ++} ++ ++static inline void rq_lock(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock(rq->lock); ++} ++ ++static inline void rq_unlock(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock(rq->lock); ++} ++ ++static inline void rq_lock_irq(struct rq *rq) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irq(rq->lock); ++} ++ ++static inline void rq_unlock_irq(struct rq *rq) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irq(rq->lock); ++} ++ ++static inline void rq_lock_irqsave(struct rq *rq, unsigned long *flags) ++ __acquires(rq->lock) ++{ ++ raw_spin_lock_irqsave(rq->lock, *flags); ++} ++ ++static inline void rq_unlock_irqrestore(struct rq *rq, unsigned long *flags) ++ __releases(rq->lock) ++{ ++ raw_spin_unlock_irqrestore(rq->lock, *flags); ++} ++ ++static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags) ++ __acquires(p->pi_lock) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ while (42) { ++ raw_spin_lock_irqsave(&p->pi_lock, *flags); ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++ } ++ return rq; ++} ++ ++static inline void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags) ++ __releases(rq->lock) ++ __releases(p->pi_lock) ++{ ++ rq_unlock(rq); ++ raw_spin_unlock_irqrestore(&p->pi_lock, *flags); ++} ++ ++static inline struct rq *__task_rq_lock(struct task_struct *p) ++ __acquires(rq->lock) ++{ ++ struct rq *rq; ++ ++ lockdep_assert_held(&p->pi_lock); ++ ++ while (42) { ++ rq = task_rq(p); ++ raw_spin_lock(rq->lock); ++ if (likely(rq == task_rq(p))) ++ break; ++ raw_spin_unlock(rq->lock); ++ } ++ return rq; ++} ++ ++static inline void __task_rq_unlock(struct rq *rq) ++{ ++ rq_unlock(rq); ++} ++ ++/* ++ * {de,en}queue flags: Most not used on MuQSS. ++ * ++ * DEQUEUE_SLEEP - task is no longer runnable ++ * ENQUEUE_WAKEUP - task just became runnable ++ * ++ * SAVE/RESTORE - an otherwise spurious dequeue/enqueue, done to ensure tasks ++ * are in a known state which allows modification. Such pairs ++ * should preserve as much state as possible. ++ * ++ * MOVE - paired with SAVE/RESTORE, explicitly does not preserve the location ++ * in the runqueue. ++ * ++ * ENQUEUE_HEAD - place at front of runqueue (tail if not specified) ++ * ENQUEUE_REPLENISH - CBS (replenish runtime and postpone deadline) ++ * ENQUEUE_MIGRATED - the task was migrated during wakeup ++ * ++ */ ++ ++#define DEQUEUE_SAVE 0x02 /* matches ENQUEUE_RESTORE */ ++ ++#define ENQUEUE_RESTORE 0x02 ++ ++static inline u64 __rq_clock_broken(struct rq *rq) ++{ ++ return READ_ONCE(rq->clock); ++} ++ ++static inline u64 rq_clock(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock; ++} ++ ++static inline u64 rq_clock_task(struct rq *rq) ++{ ++ lockdep_assert_held(rq->lock); ++ ++ return rq->clock_task; ++} ++ ++#ifdef CONFIG_NUMA ++enum numa_topology_type { ++ NUMA_DIRECT, ++ NUMA_GLUELESS_MESH, ++ NUMA_BACKPLANE, ++}; ++extern enum numa_topology_type sched_numa_topology_type; ++extern int sched_max_numa_distance; ++extern bool find_numa_distance(int distance); ++ ++extern void sched_init_numa(void); ++extern void sched_domains_numa_masks_set(unsigned int cpu); ++extern void sched_domains_numa_masks_clear(unsigned int cpu); ++#else ++static inline void sched_init_numa(void) { } ++static inline void sched_domains_numa_masks_set(unsigned int cpu) { } ++static inline void sched_domains_numa_masks_clear(unsigned int cpu) { } ++#endif ++ ++extern struct mutex sched_domains_mutex; ++extern struct static_key_false sched_schedstats; ++ ++#define rcu_dereference_check_sched_domain(p) \ ++ rcu_dereference_check((p), \ ++ lockdep_is_held(&sched_domains_mutex)) ++ ++#ifdef CONFIG_SMP ++ ++/* ++ * The domain tree (rq->sd) is protected by RCU's quiescent state transition. ++ * See detach_destroy_domains: synchronize_sched for details. ++ * ++ * The domain tree of any CPU may only be accessed from within ++ * preempt-disabled sections. ++ */ ++#define for_each_domain(cpu, __sd) \ ++ for (__sd = rcu_dereference_check_sched_domain(cpu_rq(cpu)->sd); \ ++ __sd; __sd = __sd->parent) ++ ++#define for_each_lower_domain(sd) for (; sd; sd = sd->child) ++ ++/** ++ * highest_flag_domain - Return highest sched_domain containing flag. ++ * @cpu: The cpu whose highest level of sched domain is to ++ * be returned. ++ * @flag: The flag to check for the highest sched_domain ++ * for the given cpu. ++ * ++ * Returns the highest sched_domain of a cpu which contains the given flag. ++ */ ++static inline struct sched_domain *highest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd, *hsd = NULL; ++ ++ for_each_domain(cpu, sd) { ++ if (!(sd->flags & flag)) ++ break; ++ hsd = sd; ++ } ++ ++ return hsd; ++} ++ ++static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) ++{ ++ struct sched_domain *sd; ++ ++ for_each_domain(cpu, sd) { ++ if (sd->flags & flag) ++ break; ++ } ++ ++ return sd; ++} ++ ++DECLARE_PER_CPU(struct sched_domain *, sd_llc); ++DECLARE_PER_CPU(int, sd_llc_size); ++DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared); ++DECLARE_PER_CPU(struct sched_domain *, sd_numa); ++DECLARE_PER_CPU(struct sched_domain *, sd_asym); ++ ++struct sched_group_capacity { ++ atomic_t ref; ++ /* ++ * CPU capacity of this group, SCHED_CAPACITY_SCALE being max capacity ++ * for a single CPU. ++ */ ++ unsigned long capacity; ++ unsigned long min_capacity; /* Min per-CPU capacity in group */ ++ unsigned long next_update; ++ int imbalance; /* XXX unrelated to capacity but shared group state */ ++ ++#ifdef CONFIG_SCHED_DEBUG ++ int id; ++#endif ++ ++ unsigned long cpumask[0]; /* balance mask */ ++}; ++ ++struct sched_group { ++ struct sched_group *next; /* Must be a circular list */ ++ atomic_t ref; ++ ++ unsigned int group_weight; ++ struct sched_group_capacity *sgc; ++ int asym_prefer_cpu; /* cpu of highest priority in group */ ++ ++ /* ++ * The CPUs this group covers. ++ * ++ * NOTE: this field is variable length. (Allocated dynamically ++ * by attaching extra space to the end of the structure, ++ * depending on how many CPUs the kernel has booted up with) ++ */ ++ unsigned long cpumask[0]; ++}; ++ ++static inline struct cpumask *sched_group_span(struct sched_group *sg) ++{ ++ return to_cpumask(sg->cpumask); ++} ++ ++/* ++ * See build_balance_mask(). ++ */ ++static inline struct cpumask *group_balance_mask(struct sched_group *sg) ++{ ++ return to_cpumask(sg->sgc->cpumask); ++} ++ ++/** ++ * group_first_cpu - Returns the first cpu in the cpumask of a sched_group. ++ * @group: The group whose first cpu is to be returned. ++ */ ++static inline unsigned int group_first_cpu(struct sched_group *group) ++{ ++ return cpumask_first(sched_group_span(group)); ++} ++ ++ ++#if defined(CONFIG_SCHED_DEBUG) && defined(CONFIG_SYSCTL) ++void register_sched_domain_sysctl(void); ++void dirty_sched_domain_sysctl(int cpu); ++void unregister_sched_domain_sysctl(void); ++#else ++static inline void register_sched_domain_sysctl(void) ++{ ++} ++static inline void dirty_sched_domain_sysctl(int cpu) ++{ ++} ++static inline void unregister_sched_domain_sysctl(void) ++{ ++} ++#endif ++ ++extern void sched_ttwu_pending(void); ++extern void set_cpus_allowed_common(struct task_struct *p, const struct cpumask *new_mask); ++extern void set_rq_online (struct rq *rq); ++extern void set_rq_offline(struct rq *rq); ++extern bool sched_smp_initialized; ++ ++static inline void update_group_capacity(struct sched_domain *sd, int cpu) ++{ ++} ++ ++static inline void trigger_load_balance(struct rq *rq) ++{ ++} ++ ++#define sched_feat(x) 0 ++ ++#else /* CONFIG_SMP */ ++ ++static inline void sched_ttwu_pending(void) { } ++ ++#endif /* CONFIG_SMP */ ++ ++#ifdef CONFIG_CPU_IDLE ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++ rq->idle_state = idle_state; ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ SCHED_WARN_ON(!rcu_read_lock_held()); ++ return rq->idle_state; ++} ++#else ++static inline void idle_set_state(struct rq *rq, ++ struct cpuidle_state *idle_state) ++{ ++} ++ ++static inline struct cpuidle_state *idle_get_state(struct rq *rq) ++{ ++ return NULL; ++} ++#endif ++ ++#ifdef CONFIG_SCHED_DEBUG ++extern bool sched_debug_enabled; ++#endif ++ ++extern void schedule_idle(void); ++ ++#ifdef CONFIG_IRQ_TIME_ACCOUNTING ++struct irqtime { ++ u64 total; ++ u64 tick_delta; ++ u64 irq_start_time; ++ struct u64_stats_sync sync; ++}; ++ ++DECLARE_PER_CPU(struct irqtime, cpu_irqtime); ++ ++/* ++ * Returns the irqtime minus the softirq time computed by ksoftirqd. ++ * Otherwise ksoftirqd's sum_exec_runtime is substracted its own runtime ++ * and never move forward. ++ */ ++static inline u64 irq_time_read(int cpu) ++{ ++ struct irqtime *irqtime = &per_cpu(cpu_irqtime, cpu); ++ unsigned int seq; ++ u64 total; ++ ++ do { ++ seq = __u64_stats_fetch_begin(&irqtime->sync); ++ total = irqtime->total; ++ } while (__u64_stats_fetch_retry(&irqtime->sync, seq)); ++ ++ return total; ++} ++#endif /* CONFIG_IRQ_TIME_ACCOUNTING */ ++ ++#ifdef CONFIG_SMP ++static inline int cpu_of(struct rq *rq) ++{ ++ return rq->cpu; ++} ++#else /* CONFIG_SMP */ ++static inline int cpu_of(struct rq *rq) ++{ ++ return 0; ++} ++#endif ++ ++#ifdef CONFIG_CPU_FREQ ++DECLARE_PER_CPU(struct update_util_data *, cpufreq_update_util_data); ++ ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flags) ++{ ++ struct update_util_data *data; ++ ++ data = rcu_dereference_sched(*per_cpu_ptr(&cpufreq_update_util_data, ++ cpu_of(rq))); ++ ++ if (data) ++ data->func(data, rq->niffies, flags); ++} ++#else ++static inline void cpufreq_trigger(struct rq *rq, unsigned int flag) ++{ ++} ++#endif /* CONFIG_CPU_FREQ */ ++ ++#ifdef arch_scale_freq_capacity ++#ifndef arch_scale_freq_invariant ++#define arch_scale_freq_invariant() (true) ++#endif ++#else /* arch_scale_freq_capacity */ ++#define arch_scale_freq_invariant() (false) ++#endif ++ ++/* ++ * This should only be called when current == rq->idle. Dodgy workaround for ++ * when softirqs are pending and we are in the idle loop. Setting current to ++ * resched will kick us out of the idle loop and the softirqs will be serviced ++ * on our next pass through schedule(). ++ */ ++static inline bool softirq_pending(int cpu) ++{ ++ if (likely(!local_softirq_pending())) ++ return false; ++ set_tsk_need_resched(current); ++ return true; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return tsk_seruntime(t); ++} ++#else ++struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags); ++void task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags); ++ ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ unsigned long flags; ++ u64 ns; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &flags); ++ ns = tsk_seruntime(t); ++ task_rq_unlock(rq, t, &flags); ++ ++ return ns; ++} ++#endif ++ ++#ifndef arch_scale_freq_capacity ++static __always_inline ++unsigned long arch_scale_freq_capacity(int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++ ++#ifdef CONFIG_NO_HZ_FULL ++extern bool sched_can_stop_tick(struct rq *rq); ++extern int __init sched_tick_offload_init(void); ++ ++/* ++ * Tick may be needed by tasks in the runqueue depending on their policy and ++ * requirements. If tick is needed, lets send the target an IPI to kick it out of ++ * nohz mode if necessary. ++ */ ++static inline void sched_update_tick_dependency(struct rq *rq) ++{ ++ int cpu; ++ ++ if (!tick_nohz_full_enabled()) ++ return; ++ ++ cpu = cpu_of(rq); ++ ++ if (!tick_nohz_full_cpu(cpu)) ++ return; ++ ++ if (sched_can_stop_tick(rq)) ++ tick_nohz_dep_clear_cpu(cpu, TICK_DEP_BIT_SCHED); ++ else ++ tick_nohz_dep_set_cpu(cpu, TICK_DEP_BIT_SCHED); ++} ++#else ++static inline int sched_tick_offload_init(void) { return 0; } ++static inline void sched_update_tick_dependency(struct rq *rq) { } ++#endif ++ ++#ifdef CONFIG_SMP ++ ++#ifndef arch_scale_cpu_capacity ++static __always_inline ++unsigned long arch_scale_cpu_capacity(struct sched_domain *sd, int cpu) ++{ ++ if (sd && (sd->flags & SD_SHARE_CPUCAPACITY) && (sd->span_weight > 1)) ++ return sd->smt_gain / sd->span_weight; ++ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++#else ++#ifndef arch_scale_cpu_capacity ++static __always_inline ++unsigned long arch_scale_cpu_capacity(void __always_unused *sd, int cpu) ++{ ++ return SCHED_CAPACITY_SCALE; ++} ++#endif ++#endif ++ ++#define SCHED_FLAG_SUGOV 0x10000000 ++ ++static inline bool rt_rq_is_runnable(struct rq *rt_rq) ++{ ++ return rt_rq->rt_nr_running; ++} ++ ++#ifdef CONFIG_CPU_FREQ_GOV_SCHEDUTIL ++ ++static inline unsigned long cpu_bw_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_dl(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline unsigned long cpu_util_cfs(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline unsigned long cpu_util_rt(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->rt_nr_running); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++#ifdef HAVE_SCHED_AVG_IRQ ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ unsigned long ret = READ_ONCE(rq->irq_load_avg); ++ ++ if (ret > SCHED_CAPACITY_SCALE) ++ ret = SCHED_CAPACITY_SCALE; ++ return ret; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ util *= (max - irq); ++ util /= max; ++ ++ return util; ++ ++} ++#else ++static inline unsigned long cpu_util_irq(struct rq *rq) ++{ ++ return 0; ++} ++ ++static inline ++unsigned long scale_irq_capacity(unsigned long util, unsigned long irq, unsigned long max) ++{ ++ return util; ++} ++#endif ++#endif ++ ++#endif /* MUQSS_SCHED_H */ +diff -Nur a/kernel/sched/sched.h b/kernel/sched/sched.h +--- a/kernel/sched/sched.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/sched.h 2019-02-09 17:46:12.001297867 +0000 +@@ -2,6 +2,19 @@ + /* + * Scheduler internal types and methods: + */ ++#ifdef CONFIG_SCHED_MUQSS ++#include "MuQSS.h" ++ ++/* Begin compatibility wrappers for MuQSS/CFS differences */ ++#define rq_rt_nr_running(rq) ((rq)->rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->nr_running) ++ ++#else /* CONFIG_SCHED_MUQSS */ ++ ++#define rq_rt_nr_running(rq) ((rq)->rt.rt_nr_running) ++#define rq_h_nr_running(rq) ((rq)->cfs.h_nr_running) ++ ++ + #include <linux/sched.h> + + #include <linux/sched/autogroup.h> +@@ -2241,3 +2254,30 @@ + return util; + } + #endif ++ ++/* MuQSS compatibility functions */ ++static inline bool softirq_pending(int cpu) ++{ ++ return false; ++} ++ ++#ifdef CONFIG_64BIT ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ return t->se.sum_exec_runtime; ++} ++#else ++static inline u64 read_sum_exec_runtime(struct task_struct *t) ++{ ++ u64 ns; ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(t, &rf); ++ ns = t->se.sum_exec_runtime; ++ task_rq_unlock(rq, t, &rf); ++ ++ return ns; ++} ++#endif ++#endif /* CONFIG_SCHED_MUQSS */ +diff -Nur a/kernel/sched/topology.c b/kernel/sched/topology.c +--- a/kernel/sched/topology.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/sched/topology.c 2019-02-09 17:46:12.001297867 +0000 +@@ -219,7 +219,11 @@ + struct root_domain *old_rd = NULL; + unsigned long flags; + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_lock_irqsave(rq->lock, flags); ++#else + raw_spin_lock_irqsave(&rq->lock, flags); ++#endif + + if (rq->rd) { + old_rd = rq->rd; +@@ -245,7 +249,11 @@ + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++#ifdef CONFIG_SCHED_MUQSS ++ raw_spin_unlock_irqrestore(rq->lock, flags); ++#else + raw_spin_unlock_irqrestore(&rq->lock, flags); ++#endif + + if (old_rd) + call_rcu_sched(&old_rd->rcu, free_rootdomain); +diff -Nur a/kernel/skip_list.c b/kernel/skip_list.c +--- a/kernel/skip_list.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/kernel/skip_list.c 2019-02-09 17:46:12.001297867 +0000 +@@ -0,0 +1,148 @@ ++/* ++ Copyright (C) 2011,2016 Con Kolivas. ++ ++ Code based on example originally by William Pugh. ++ ++Skip Lists are a probabilistic alternative to balanced trees, as ++described in the June 1990 issue of CACM and were invented by ++William Pugh in 1987. ++ ++A couple of comments about this implementation: ++The routine randomLevel has been hard-coded to generate random ++levels using p=0.25. It can be easily changed. ++ ++The insertion routine has been implemented so as to use the ++dirty hack described in the CACM paper: if a random level is ++generated that is more than the current maximum level, the ++current maximum level plus one is used instead. ++ ++Levels start at zero and go up to MaxLevel (which is equal to ++MaxNumberOfLevels-1). ++ ++The routines defined in this file are: ++ ++init: defines slnode ++ ++new_skiplist: returns a new, empty list ++ ++randomLevel: Returns a random level based on a u64 random seed passed to it. ++In MuQSS, the "niffy" time is used for this purpose. ++ ++insert(l,key, value): inserts the binding (key, value) into l. This operation ++occurs in O(log n) time. ++ ++delnode(slnode, l, node): deletes any binding of key from the l based on the ++actual node value. This operation occurs in O(k) time where k is the ++number of levels of the node in question (max 8). The original delete ++function occurred in O(log n) time and involved a search. ++ ++MuQSS Notes: In this implementation of skiplists, there are bidirectional ++next/prev pointers and the insert function returns a pointer to the actual ++node the value is stored. The key here is chosen by the scheduler so as to ++sort tasks according to the priority list requirements and is no longer used ++by the scheduler after insertion. The scheduler lookup, however, occurs in ++O(1) time because it is always the first item in the level 0 linked list. ++Since the task struct stores a copy of the node pointer upon skiplist_insert, ++it can also remove it much faster than the original implementation with the ++aid of prev<->next pointer manipulation and no searching. ++ ++*/ ++ ++#include <linux/slab.h> ++#include <linux/skip_list.h> ++ ++#define MaxNumberOfLevels 8 ++#define MaxLevel (MaxNumberOfLevels - 1) ++ ++void skiplist_init(skiplist_node *slnode) ++{ ++ int i; ++ ++ slnode->key = 0xFFFFFFFFFFFFFFFF; ++ slnode->level = 0; ++ slnode->value = NULL; ++ for (i = 0; i < MaxNumberOfLevels; i++) ++ slnode->next[i] = slnode->prev[i] = slnode; ++} ++ ++skiplist *new_skiplist(skiplist_node *slnode) ++{ ++ skiplist *l = kzalloc(sizeof(skiplist), GFP_ATOMIC); ++ ++ BUG_ON(!l); ++ l->header = slnode; ++ return l; ++} ++ ++void free_skiplist(skiplist *l) ++{ ++ skiplist_node *p, *q; ++ ++ p = l->header; ++ do { ++ q = p->next[0]; ++ p->next[0]->prev[0] = q->prev[0]; ++ skiplist_node_init(p); ++ p = q; ++ } while (p != l->header); ++ kfree(l); ++} ++ ++void skiplist_node_init(skiplist_node *node) ++{ ++ memset(node, 0, sizeof(skiplist_node)); ++} ++ ++static inline unsigned int randomLevel(const long unsigned int randseed) ++{ ++ return find_first_bit(&randseed, MaxLevel) / 2; ++} ++ ++void skiplist_insert(skiplist *l, skiplist_node *node, keyType key, valueType value, unsigned int randseed) ++{ ++ skiplist_node *update[MaxNumberOfLevels]; ++ skiplist_node *p, *q; ++ int k = l->level; ++ ++ p = l->header; ++ do { ++ while (q = p->next[k], q->key <= key) ++ p = q; ++ update[k] = p; ++ } while (--k >= 0); ++ ++ ++l->entries; ++ k = randomLevel(randseed); ++ if (k > l->level) { ++ k = ++l->level; ++ update[k] = l->header; ++ } ++ ++ node->level = k; ++ node->key = key; ++ node->value = value; ++ do { ++ p = update[k]; ++ node->next[k] = p->next[k]; ++ p->next[k] = node; ++ node->prev[k] = p; ++ node->next[k]->prev[k] = node; ++ } while (--k >= 0); ++} ++ ++void skiplist_delete(skiplist *l, skiplist_node *node) ++{ ++ int k, m = node->level; ++ ++ for (k = 0; k <= m; k++) { ++ node->prev[k]->next[k] = node->next[k]; ++ node->next[k]->prev[k] = node->prev[k]; ++ } ++ skiplist_node_init(node); ++ if (m == l->level) { ++ while (l->header->next[m] == l->header && l->header->prev[m] == l->header && m > 0) ++ m--; ++ l->level = m; ++ } ++ l->entries--; ++} +diff -Nur a/kernel/sysctl.c b/kernel/sysctl.c +--- a/kernel/sysctl.c 2019-02-09 17:20:30.491821512 +0000 ++++ b/kernel/sysctl.c 2019-02-09 17:57:37.563468776 +0000 +@@ -136,6 +136,12 @@ + static unsigned long one_ul __read_only = 1; + static int one_hundred __read_only = 100; + static int one_thousand __read_only = 1000; ++#ifdef CONFIG_SCHED_MUQSS ++extern int rr_interval; ++extern int sched_interactive; ++extern int sched_iso_cpu; ++extern int sched_yield_type; ++#endif + #ifdef CONFIG_PRINTK + static int ten_thousand __read_only = 10000; + #endif +@@ -306,7 +312,7 @@ + { } + }; + +-#ifdef CONFIG_SCHED_DEBUG ++#if defined(CONFIG_SCHED_DEBUG) && !defined(CONFIG_SCHED_MUQSS) + static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ + static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ +@@ -323,6 +329,7 @@ + #endif + + static struct ctl_table kern_table[] = { ++#ifndef CONFIG_SCHED_MUQSS + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -477,6 +484,7 @@ + .extra1 = &one, + }, + #endif ++#endif /* !CONFIG_SCHED_MUQSS */ + #ifdef CONFIG_PROVE_LOCKING + { + .procname = "prove_locking", +@@ -1082,6 +1090,44 @@ + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_SCHED_MUQSS ++ { ++ .procname = "rr_interval", ++ .data = &rr_interval, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &one, ++ .extra2 = &one_thousand, ++ }, ++ { ++ .procname = "interactive", ++ .data = &sched_interactive, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++ { ++ .procname = "iso_cpu", ++ .data = &sched_iso_cpu, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &one_hundred, ++ }, ++ { ++ .procname = "yield_type", ++ .data = &sched_yield_type, ++ .maxlen = sizeof (int), ++ .mode = 0644, ++ .proc_handler = &proc_dointvec_minmax, ++ .extra1 = &zero, ++ .extra2 = &two, ++ }, ++#endif + #if defined(CONFIG_S390) && defined(CONFIG_SMP) + { + .procname = "spin_retry", +diff -Nur a/kernel/time/clockevents.c b/kernel/time/clockevents.c +--- a/kernel/time/clockevents.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/time/clockevents.c 2019-02-09 17:46:12.001297867 +0000 +@@ -198,8 +198,13 @@ + + #ifdef CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST + ++#ifdef CONFIG_SCHED_MUQSS ++/* Limit min_delta to 100us */ ++#define MIN_DELTA_LIMIT (NSEC_PER_SEC / 10000) ++#else + /* Limit min_delta to a jiffie */ + #define MIN_DELTA_LIMIT (NSEC_PER_SEC / HZ) ++#endif + + /** + * clockevents_increase_min_delta - raise minimum delta of a clock event device +diff -Nur a/kernel/time/posix-cpu-timers.c b/kernel/time/posix-cpu-timers.c +--- a/kernel/time/posix-cpu-timers.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/time/posix-cpu-timers.c 2019-02-09 17:46:12.001297867 +0000 +@@ -830,7 +830,7 @@ + tsk_expires->virt_exp = expires; + + tsk_expires->sched_exp = check_timers_list(++timers, firing, +- tsk->se.sum_exec_runtime); ++ tsk_seruntime(tsk)); + + /* + * Check for the special case thread timers. +@@ -840,7 +840,7 @@ + unsigned long hard = task_rlimit_max(tsk, RLIMIT_RTTIME); + + if (hard != RLIM_INFINITY && +- tsk->rt.timeout > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { ++ tsk_rttimeout(tsk) > DIV_ROUND_UP(hard, USEC_PER_SEC/HZ)) { + /* + * At the hard limit, we just die. + * No need to calculate anything else now. +@@ -852,7 +852,7 @@ + __group_send_sig_info(SIGKILL, SEND_SIG_PRIV, tsk); + return; + } +- if (tsk->rt.timeout > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { ++ if (tsk_rttimeout(tsk) > DIV_ROUND_UP(soft, USEC_PER_SEC/HZ)) { + /* + * At the soft limit, send a SIGXCPU every second. + */ +@@ -1095,7 +1095,7 @@ + struct task_cputime task_sample; + + task_cputime(tsk, &task_sample.utime, &task_sample.stime); +- task_sample.sum_exec_runtime = tsk->se.sum_exec_runtime; ++ task_sample.sum_exec_runtime = tsk_seruntime(tsk); + if (task_cputime_expired(&task_sample, &tsk->cputime_expires)) + return 1; + } +diff -Nur a/kernel/time/timer.c b/kernel/time/timer.c +--- a/kernel/time/timer.c 2019-02-09 17:20:30.491821512 +0000 ++++ b/kernel/time/timer.c 2019-02-09 17:46:12.001297867 +0000 +@@ -1479,7 +1479,7 @@ + * Check, if the next hrtimer event is before the next timer wheel + * event: + */ +-static u64 cmp_next_hrtimer_event(u64 basem, u64 expires) ++static u64 cmp_next_hrtimer_event(struct timer_base *base, u64 basem, u64 expires) + { + u64 nextevt = hrtimer_get_next_event(); + +@@ -1497,6 +1497,9 @@ + if (nextevt <= basem) + return basem; + ++ if (nextevt < expires && nextevt - basem <= TICK_NSEC) ++ base->is_idle = false; ++ + /* + * Round up to the next jiffie. High resolution timers are + * off, so the hrtimers are expired in the tick and we need to +@@ -1566,7 +1569,7 @@ + } + raw_spin_unlock(&base->lock); + +- return cmp_next_hrtimer_event(basem, expires); ++ return cmp_next_hrtimer_event(base, basem, expires); + } + + /** +diff -Nur a/kernel/trace/trace_selftest.c b/kernel/trace/trace_selftest.c +--- a/kernel/trace/trace_selftest.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/kernel/trace/trace_selftest.c 2019-02-09 17:46:12.001297867 +0000 +@@ -1041,10 +1041,15 @@ + { + /* Make this a -deadline thread */ + static const struct sched_attr attr = { ++#ifdef CONFIG_SCHED_MUQSS ++ /* No deadline on MuQSS, use RR */ ++ .sched_policy = SCHED_RR, ++#else + .sched_policy = SCHED_DEADLINE, + .sched_runtime = 100000ULL, + .sched_deadline = 10000000ULL, + .sched_period = 10000000ULL ++#endif + }; + struct wakeup_test_data *x = data; + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch new file mode 100644 index 00000000..35872156 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0002-Fix-Werror-build-failure-in-tools.patch @@ -0,0 +1,25 @@ +From ba77544e4687e62fe9d8ca870ceb47ea87d1cbfe Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Sun, 18 Feb 2018 12:36:22 +1100 +Subject: [PATCH 02/16] Fix Werror build failure in tools. + +--- + tools/objtool/Makefile | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/objtool/Makefile b/tools/objtool/Makefile +index c9d038f91af6..af41781c233a 100644 +--- a/tools/objtool/Makefile ++++ b/tools/objtool/Makefile +@@ -31,7 +31,7 @@ INCLUDES := -I$(srctree)/tools/include \ + -I$(srctree)/tools/arch/$(HOSTARCH)/include/uapi \ + -I$(srctree)/tools/objtool/arch/$(ARCH)/include + WARNINGS := $(EXTRA_WARNINGS) -Wno-switch-default -Wno-switch-enum -Wno-packed +-CFLAGS += -Werror $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) ++CFLAGS += $(WARNINGS) $(KBUILD_HOSTCFLAGS) -g $(INCLUDES) + LDFLAGS += -lelf $(LIBSUBCMD) $(KBUILD_HOSTLDFLAGS) + + # Allow old libelf to be used: +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch new file mode 100644 index 00000000..176fcb54 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0003-Make-preemptible-kernel-default.patch @@ -0,0 +1,4648 @@ +From 2432d1de7128e6ac986749bc52eb30c4c1c654d0 Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Sat, 29 Oct 2016 11:20:37 +1100 +Subject: [PATCH 03/16] Make preemptible kernel default. + +Make full preempt default on all arches. +--- + arch/arc/configs/tb10x_defconfig | 2 +- + arch/arm/configs/bcm2835_defconfig | 2 +- + arch/arm/configs/imx_v6_v7_defconfig | 2 +- + arch/arm/configs/mps2_defconfig | 2 +- + arch/arm/configs/mxs_defconfig | 2 +- + arch/blackfin/configs/BF518F-EZBRD_defconfig | 121 ++++ + arch/blackfin/configs/BF526-EZBRD_defconfig | 158 ++++++ + .../blackfin/configs/BF527-EZKIT-V2_defconfig | 188 +++++++ + arch/blackfin/configs/BF527-EZKIT_defconfig | 181 ++++++ + .../blackfin/configs/BF527-TLL6527M_defconfig | 178 ++++++ + arch/blackfin/configs/BF533-EZKIT_defconfig | 114 ++++ + arch/blackfin/configs/BF533-STAMP_defconfig | 124 +++++ + arch/blackfin/configs/BF537-STAMP_defconfig | 136 +++++ + arch/blackfin/configs/BF538-EZKIT_defconfig | 133 +++++ + arch/blackfin/configs/BF548-EZKIT_defconfig | 207 +++++++ + arch/blackfin/configs/BF561-ACVILON_defconfig | 149 +++++ + .../configs/BF561-EZKIT-SMP_defconfig | 112 ++++ + arch/blackfin/configs/BF561-EZKIT_defconfig | 114 ++++ + arch/blackfin/configs/BF609-EZKIT_defconfig | 154 +++++ + arch/blackfin/configs/BlackStamp_defconfig | 108 ++++ + arch/blackfin/configs/CM-BF527_defconfig | 129 +++++ + arch/blackfin/configs/PNAV-10_defconfig | 111 ++++ + arch/blackfin/configs/SRV1_defconfig | 88 +++ + arch/blackfin/configs/TCM-BF518_defconfig | 131 +++++ + arch/mips/configs/fuloong2e_defconfig | 3 +- + arch/mips/configs/gpr_defconfig | 3 +- + arch/mips/configs/ip22_defconfig | 3 +- + arch/mips/configs/ip28_defconfig | 3 +- + arch/mips/configs/jazz_defconfig | 3 +- + arch/mips/configs/mtx1_defconfig | 3 +- + arch/mips/configs/nlm_xlr_defconfig | 2 +- + arch/mips/configs/pic32mzda_defconfig | 2 +- + arch/mips/configs/pistachio_defconfig | 2 +- + arch/mips/configs/pnx8335_stb225_defconfig | 2 +- + arch/mips/configs/rm200_defconfig | 3 +- + arch/parisc/configs/712_defconfig | 2 +- + arch/parisc/configs/c3000_defconfig | 2 +- + arch/parisc/configs/default_defconfig | 2 +- + arch/powerpc/configs/c2k_defconfig | 389 +++++++++++++ + arch/powerpc/configs/ppc6xx_defconfig | 2 +- + arch/score/configs/spct6600_defconfig | 84 +++ + arch/sh/configs/se7712_defconfig | 2 +- + arch/sh/configs/se7721_defconfig | 2 +- + arch/sh/configs/titan_defconfig | 2 +- + arch/sparc/configs/sparc64_defconfig | 2 +- + arch/tile/configs/tilegx_defconfig | 411 ++++++++++++++ + arch/tile/configs/tilepro_defconfig | 524 ++++++++++++++++++ + arch/x86/configs/i386_defconfig | 2 +- + arch/x86/configs/x86_64_defconfig | 2 +- + kernel/Kconfig.preempt | 9 +- + 50 files changed, 4082 insertions(+), 30 deletions(-) + create mode 100644 arch/blackfin/configs/BF518F-EZBRD_defconfig + create mode 100644 arch/blackfin/configs/BF526-EZBRD_defconfig + create mode 100644 arch/blackfin/configs/BF527-EZKIT-V2_defconfig + create mode 100644 arch/blackfin/configs/BF527-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF527-TLL6527M_defconfig + create mode 100644 arch/blackfin/configs/BF533-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF533-STAMP_defconfig + create mode 100644 arch/blackfin/configs/BF537-STAMP_defconfig + create mode 100644 arch/blackfin/configs/BF538-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF548-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF561-ACVILON_defconfig + create mode 100644 arch/blackfin/configs/BF561-EZKIT-SMP_defconfig + create mode 100644 arch/blackfin/configs/BF561-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BF609-EZKIT_defconfig + create mode 100644 arch/blackfin/configs/BlackStamp_defconfig + create mode 100644 arch/blackfin/configs/CM-BF527_defconfig + create mode 100644 arch/blackfin/configs/PNAV-10_defconfig + create mode 100644 arch/blackfin/configs/SRV1_defconfig + create mode 100644 arch/blackfin/configs/TCM-BF518_defconfig + create mode 100644 arch/powerpc/configs/c2k_defconfig + create mode 100644 arch/score/configs/spct6600_defconfig + create mode 100644 arch/tile/configs/tilegx_defconfig + create mode 100644 arch/tile/configs/tilepro_defconfig + +diff --git a/arch/arc/configs/tb10x_defconfig b/arch/arc/configs/tb10x_defconfig +index a7f65313f84a..5233307bf903 100644 +--- a/arch/arc/configs/tb10x_defconfig ++++ b/arch/arc/configs/tb10x_defconfig +@@ -28,7 +28,7 @@ CONFIG_ARC_PLAT_TB10X=y + CONFIG_ARC_CACHE_LINE_SHIFT=5 + CONFIG_HZ=250 + CONFIG_ARC_BUILTIN_DTB_NAME="abilis_tb100_dvk" +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_COMPACTION is not set + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/arm/configs/bcm2835_defconfig b/arch/arm/configs/bcm2835_defconfig +index e9bc88937b1e..73cde48ad00f 100644 +--- a/arch/arm/configs/bcm2835_defconfig ++++ b/arch/arm/configs/bcm2835_defconfig +@@ -29,7 +29,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_ARCH_MULTI_V6=y + CONFIG_ARCH_BCM=y + CONFIG_ARCH_BCM2835=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_KSM=y + CONFIG_CLEANCACHE=y +diff --git a/arch/arm/configs/imx_v6_v7_defconfig b/arch/arm/configs/imx_v6_v7_defconfig +index 7eca43ff69bb..689095192133 100644 +--- a/arch/arm/configs/imx_v6_v7_defconfig ++++ b/arch/arm/configs/imx_v6_v7_defconfig +@@ -48,7 +48,7 @@ CONFIG_PCI_MSI=y + CONFIG_PCI_IMX6=y + CONFIG_SMP=y + CONFIG_ARM_PSCI=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_HIGHMEM=y + CONFIG_FORCE_MAX_ZONEORDER=14 + CONFIG_CMDLINE="noinitrd console=ttymxc0,115200" +diff --git a/arch/arm/configs/mps2_defconfig b/arch/arm/configs/mps2_defconfig +index 0bcdec7cc169..10ceaefa51e0 100644 +--- a/arch/arm/configs/mps2_defconfig ++++ b/arch/arm/configs/mps2_defconfig +@@ -18,7 +18,7 @@ CONFIG_ARCH_MPS2=y + CONFIG_SET_MEM_PARAM=y + CONFIG_DRAM_BASE=0x21000000 + CONFIG_DRAM_SIZE=0x1000000 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_ATAGS is not set + CONFIG_ZBOOT_ROM_TEXT=0x0 + CONFIG_ZBOOT_ROM_BSS=0x0 +diff --git a/arch/arm/configs/mxs_defconfig b/arch/arm/configs/mxs_defconfig +index 7b8212857535..6c1b8a1d9d59 100644 +--- a/arch/arm/configs/mxs_defconfig ++++ b/arch/arm/configs/mxs_defconfig +@@ -26,7 +26,7 @@ CONFIG_BLK_DEV_INTEGRITY=y + # CONFIG_ARCH_MULTI_V7 is not set + CONFIG_ARCH_MXS=y + # CONFIG_ARM_THUMB is not set +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_AEABI=y + CONFIG_NET=y + CONFIG_PACKET=y +diff --git a/arch/blackfin/configs/BF518F-EZBRD_defconfig b/arch/blackfin/configs/BF518F-EZBRD_defconfig +new file mode 100644 +index 000000000000..39b91dfa55b5 +--- /dev/null ++++ b/arch/blackfin/configs/BF518F-EZBRD_defconfig +@@ -0,0 +1,121 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF526-EZBRD_defconfig b/arch/blackfin/configs/BF526-EZBRD_defconfig +new file mode 100644 +index 000000000000..675cadb3a0c4 +--- /dev/null ++++ b/arch/blackfin/configs/BF526-EZBRD_defconfig +@@ -0,0 +1,158 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF526=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN526_EZBRD=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/BF527-EZKIT-V2_defconfig b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +new file mode 100644 +index 000000000000..4c517c443af5 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT-V2_defconfig +@@ -0,0 +1,188 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_EZKIT_V2=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++CONFIG_KEYBOARD_ADP5520=y ++# CONFIG_KEYBOARD_ATKBD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_I2C=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_PMIC_ADP5520=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_LQ035Q1=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_NEW_LEDS=y ++CONFIG_LEDS_CLASS=y ++CONFIG_LEDS_ADP5520=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-EZKIT_defconfig b/arch/blackfin/configs/BF527-EZKIT_defconfig +new file mode 100644 +index 000000000000..bf8df3e6cf02 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-EZKIT_defconfig +@@ -0,0 +1,181 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=m ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FB_BFIN_T350MCQB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_LTV350QV=m ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=m ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF527-TLL6527M_defconfig b/arch/blackfin/configs/BF527-TLL6527M_defconfig +new file mode 100644 +index 000000000000..0220b3b15c53 +--- /dev/null ++++ b/arch/blackfin/configs/BF527-TLL6527M_defconfig +@@ -0,0 +1,178 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_LOCALVERSION="DEV_0-1_pre2010" ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_2=y ++CONFIG_BFIN527_TLL6527M=y ++CONFIG_BF527_UART1_PORTG=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++CONFIG_BOOT_LOAD=0x400000 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0xFFC2 ++CONFIG_BANK_1=0xFFC2 ++CONFIG_BANK_2=0xFFC2 ++CONFIG_BANK_3=0xFFC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR0=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=m ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_AD714X=y ++CONFIG_INPUT_ADXL34X=y ++# CONFIG_SERIO is not set ++CONFIG_BFIN_PPI=m ++CONFIG_BFIN_SIMPLE_TIMER=m ++CONFIG_BFIN_SPORT=m ++# CONFIG_CONSOLE_TRANSLATIONS is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C_CHARDEV=y ++# CONFIG_I2C_HELPER_AUTO is not set ++CONFIG_I2C_SMBUS=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_MEDIA_SUPPORT=y ++CONFIG_VIDEO_DEV=y ++# CONFIG_MEDIA_TUNER_CUSTOMISE is not set ++CONFIG_VIDEO_HELPER_CHIPS_AUTO=y ++CONFIG_VIDEO_BLACKFIN_CAM=m ++CONFIG_OV9655=y ++CONFIG_FB=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SOC_SSM2602=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++# CONFIG_RPCSEC_GSS_KRB5 is not set ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC7=m +diff --git a/arch/blackfin/configs/BF533-EZKIT_defconfig b/arch/blackfin/configs/BF533-EZKIT_defconfig +new file mode 100644 +index 000000000000..6023e3fd2c48 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BFIN533_EZKIT=y ++CONFIG_TIMER0=11 ++CONFIG_CLKIN_HZ=27000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF533-STAMP_defconfig b/arch/blackfin/configs/BF533-STAMP_defconfig +new file mode 100644 +index 000000000000..f5cd0f18b711 +--- /dev/null ++++ b/arch/blackfin/configs/BF533-STAMP_defconfig +@@ -0,0 +1,124 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_TIMER0=11 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF537-STAMP_defconfig b/arch/blackfin/configs/BF537-STAMP_defconfig +new file mode 100644 +index 000000000000..48085fde7f9e +--- /dev/null ++++ b/arch/blackfin/configs/BF537-STAMP_defconfig +@@ -0,0 +1,136 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR1=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_BFIN=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++CONFIG_SND_SOC=m ++CONFIG_SND_BF5XX_I2S=m ++CONFIG_SND_BF5XX_SOC_AD73311=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF538-EZKIT_defconfig b/arch/blackfin/configs/BF538-EZKIT_defconfig +new file mode 100644 +index 000000000000..12deeaaef3cb +--- /dev/null ++++ b/arch/blackfin/configs/BF538-EZKIT_defconfig +@@ -0,0 +1,133 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF538=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_IRQ_TIMER1=12 ++CONFIG_IRQ_TIMER2=12 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_DEV=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=m ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_PHYLIB=y ++CONFIG_SMSC_PHY=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7879=y ++CONFIG_TOUCHSCREEN_AD7879_SPI=y ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++CONFIG_SERIAL_BFIN_UART2=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=m ++CONFIG_FB_BFIN_LQ035Q1=m ++# CONFIG_USB_SUPPORT is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF548-EZKIT_defconfig b/arch/blackfin/configs/BF548-EZKIT_defconfig +new file mode 100644 +index 000000000000..6a68ffc55b5a +--- /dev/null ++++ b/arch/blackfin/configs/BF548-EZKIT_defconfig +@@ -0,0 +1,207 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF548_std=y ++CONFIG_IRQ_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_CACHELINE_ALIGNED_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_EBIU_MBSCTLVAL=0x0 ++CONFIG_EBIU_MODEVAL=0x1 ++CONFIG_EBIU_FCTLVAL=0x6 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_CAN=m ++CONFIG_CAN_RAW=m ++CONFIG_CAN_BCM=m ++CONFIG_CAN_BFIN=m ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRTTY_SIR=m ++CONFIG_BFIN_SIR=m ++CONFIG_BFIN_SIR3=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_BF5XX=y ++# CONFIG_MTD_NAND_BF5XX_HWECC is not set ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_RAM=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++CONFIG_BLK_DEV_SR=m ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_ATA=y ++# CONFIG_SATA_PMP is not set ++CONFIG_PATA_BF54X=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMSC911X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT_FF_MEMLESS=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++CONFIG_INPUT_EVBUG=m ++# CONFIG_KEYBOARD_ATKBD is not set ++CONFIG_KEYBOARD_BFIN=y ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=m ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_FB_BF54X_LQ043=y ++CONFIG_FRAMEBUFFER_CONSOLE=y ++CONFIG_FONTS=y ++CONFIG_FONT_6x11=y ++CONFIG_LOGO=y ++# CONFIG_LOGO_LINUX_MONO is not set ++# CONFIG_LOGO_LINUX_VGA16 is not set ++# CONFIG_LOGO_LINUX_CLUT224 is not set ++# CONFIG_LOGO_BLACKFIN_VGA16 is not set ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_AC97=y ++CONFIG_SND_BF5XX_SOC_AD1980=y ++CONFIG_HID_A4TECH=y ++CONFIG_HID_APPLE=y ++CONFIG_HID_BELKIN=y ++CONFIG_HID_CHERRY=y ++CONFIG_HID_CHICONY=y ++CONFIG_HID_CYPRESS=y ++CONFIG_HID_EZKEY=y ++CONFIG_HID_GYRATION=y ++CONFIG_HID_LOGITECH=y ++CONFIG_HID_MICROSOFT=y ++CONFIG_HID_MONTEREY=y ++CONFIG_HID_PANTHERLORD=y ++CONFIG_HID_PETALYNX=y ++CONFIG_HID_SAMSUNG=y ++CONFIG_HID_SONY=y ++CONFIG_HID_SUNPLUS=y ++CONFIG_USB=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_MMC=y ++CONFIG_MMC_BLOCK=m ++CONFIG_SDH_BFIN=y ++CONFIG_SDH_BFIN_MISSING_CMD_PULLUP_WORKAROUND=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_NTFS_FS=m ++CONFIG_NTFS_RW=y ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-ACVILON_defconfig b/arch/blackfin/configs/BF561-ACVILON_defconfig +new file mode 100644 +index 000000000000..e9f3ba783a4e +--- /dev/null ++++ b/arch/blackfin/configs/BF561-ACVILON_defconfig +@@ -0,0 +1,149 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_BF_REV_0_5=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_BFIN561_ACVILON=y ++# CONFIG_BF561_COREB is not set ++CONFIG_CLKIN_HZ=12000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_DMA_UNCACHED_4M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_0=0x99b2 ++CONFIG_BANK_1=0x3350 ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_SYN_COOKIES=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_PLATRAM=y ++CONFIG_MTD_PHRAM=y ++CONFIG_MTD_BLOCK2MTD=y ++CONFIG_MTD_NAND=y ++CONFIG_MTD_NAND_PLATFORM=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=2 ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=y ++# CONFIG_SCSI_PROC_FS is not set ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMSC911X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_PIO=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_PCA_PLATFORM=y ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_GPIO_PCF857X=y ++CONFIG_SENSORS_LM75=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=y ++CONFIG_SND=y ++CONFIG_SND_MIXER_OSS=y ++CONFIG_SND_PCM_OSS=y ++# CONFIG_SND_DRIVERS is not set ++# CONFIG_SND_USB is not set ++CONFIG_SND_SOC=y ++CONFIG_SND_BF5XX_I2S=y ++CONFIG_SND_BF5XX_SPORT_NUM=1 ++CONFIG_USB=y ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_MON=y ++CONFIG_USB_STORAGE=y ++CONFIG_USB_SERIAL=y ++CONFIG_USB_SERIAL_FTDI_SIO=y ++CONFIG_USB_SERIAL_PL2303=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_DS1307=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_FAT_DEFAULT_CODEPAGE=866 ++CONFIG_FAT_DEFAULT_IOCHARSET="cp1251" ++CONFIG_NTFS_FS=y ++CONFIG_CONFIGFS_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_JFFS2_COMPRESSION_OPTIONS=y ++# CONFIG_JFFS2_ZLIB is not set ++CONFIG_JFFS2_LZO=y ++# CONFIG_JFFS2_RTIME is not set ++CONFIG_JFFS2_CMODE_FAVOURLZO=y ++CONFIG_CRAMFS=y ++CONFIG_MINIX_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_DEFAULT="cp1251" ++CONFIG_NLS_CODEPAGE_866=y ++CONFIG_NLS_CODEPAGE_1251=y ++CONFIG_NLS_KOI8_R=y ++CONFIG_NLS_UTF8=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +new file mode 100644 +index 000000000000..89b75a6c3fab +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT-SMP_defconfig +@@ -0,0 +1,112 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_SMP=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF561-EZKIT_defconfig b/arch/blackfin/configs/BF561-EZKIT_defconfig +new file mode 100644 +index 000000000000..67b3d2f419ba +--- /dev/null ++++ b/arch/blackfin/configs/BF561-EZKIT_defconfig +@@ -0,0 +1,114 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF561=y ++CONFIG_IRQ_TIMER0=10 ++CONFIG_CLKIN_HZ=30000000 ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_BFIN_EXTMEM_WRITETHROUGH=y ++CONFIG_BFIN_L2_DCACHEABLE=y ++CONFIG_BFIN_L2_WRITETHROUGH=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++CONFIG_SMC91X=y ++# CONFIG_NET_VENDOR_STMICRO is not set ++# CONFIG_WLAN is not set ++CONFIG_INPUT=m ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_JTAG_COMM=m ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set +diff --git a/arch/blackfin/configs/BF609-EZKIT_defconfig b/arch/blackfin/configs/BF609-EZKIT_defconfig +new file mode 100644 +index 000000000000..8cc75d4218fb +--- /dev/null ++++ b/arch/blackfin/configs/BF609-EZKIT_defconfig +@@ -0,0 +1,154 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_EXPERT=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF609=y ++CONFIG_PINT1_ASSIGN=0x01010000 ++CONFIG_PINT2_ASSIGN=0x07000101 ++CONFIG_PINT3_ASSIGN=0x02020303 ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++# CONFIG_APP_STACK_L1 is not set ++# CONFIG_BFIN_INS_LOWOVERHEAD is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM_BFIN_WAKE_PE12=y ++CONFIG_PM_BFIN_WAKE_PE12_POL=1 ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=y ++CONFIG_CPU_FREQ_GOV_ONDEMAND=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_IP_PNP_BOOTP=y ++CONFIG_IP_PNP_RARP=y ++# CONFIG_IPV6 is not set ++CONFIG_NETFILTER=y ++CONFIG_CAN=y ++CONFIG_CAN_BFIN=y ++CONFIG_IRDA=y ++CONFIG_IRTTY_SIR=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_FW_LOADER=m ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_CFI_STAA=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_MTD_UBI=m ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++CONFIG_NETDEVICES=y ++# CONFIG_NET_VENDOR_BROADCOM is not set ++# CONFIG_NET_VENDOR_CHELSIO is not set ++# CONFIG_NET_VENDOR_INTEL is not set ++# CONFIG_NET_VENDOR_MARVELL is not set ++# CONFIG_NET_VENDOR_MICREL is not set ++# CONFIG_NET_VENDOR_MICROCHIP is not set ++# CONFIG_NET_VENDOR_NATSEMI is not set ++# CONFIG_NET_VENDOR_SEEQ is not set ++# CONFIG_NET_VENDOR_SMSC is not set ++CONFIG_STMMAC_ETH=y ++CONFIG_STMMAC_IEEE1588=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_BFIN_ROTARY=y ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_BFIN_SIMPLE_TIMER=m ++# CONFIG_BFIN_CRC is not set ++CONFIG_BFIN_LINKPORT=y ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_ADI_V3=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_PINCTRL_MCP23S08=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_SOUND=m ++CONFIG_SND=m ++CONFIG_SND_MIXER_OSS=m ++CONFIG_SND_PCM_OSS=m ++# CONFIG_SND_DRIVERS is not set ++# CONFIG_SND_SPI is not set ++# CONFIG_SND_USB is not set ++CONFIG_SND_SOC=m ++CONFIG_USB=y ++CONFIG_USB_MUSB_HDRC=y ++CONFIG_USB_MUSB_BLACKFIN=m ++CONFIG_USB_STORAGE=y ++CONFIG_USB_GADGET=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_USB_ZERO=y ++CONFIG_MMC=y ++CONFIG_SDH_BFIN=y ++# CONFIG_IOMMU_SUPPORT is not set ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=m ++CONFIG_UBIFS_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++CONFIG_FRAME_POINTER=y ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_BFIN_PSEUDODBG_INSNS=y ++CONFIG_CRYPTO_HMAC=m ++CONFIG_CRYPTO_MD4=m ++CONFIG_CRYPTO_MD5=m ++CONFIG_CRYPTO_ARC4=m ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRYPTO_DEV_BFIN_CRC=m +diff --git a/arch/blackfin/configs/BlackStamp_defconfig b/arch/blackfin/configs/BlackStamp_defconfig +new file mode 100644 +index 000000000000..9faf0ec7007f +--- /dev/null ++++ b/arch/blackfin/configs/BlackStamp_defconfig +@@ -0,0 +1,108 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF532=y ++CONFIG_BF_REV_0_5=y ++CONFIG_BLACKSTAMP=y ++CONFIG_TIMER0=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_ROMKERNEL=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xAAC2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_BINFMT_SHARED_FLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=m ++CONFIG_MTD_CFI_AMDSTD=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_M25P80=y ++CONFIG_MTD_SPI_NOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_NBD=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_SMC91X=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_GPIO=m ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_SPI_SPIDEV=m ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V4=y ++CONFIG_SMB_FS=y ++CONFIG_CIFS=y ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_UTF8=y ++CONFIG_SYSCTL_SYSCALL_CHECK=y ++CONFIG_DEBUG_MMRS=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/CM-BF527_defconfig b/arch/blackfin/configs/CM-BF527_defconfig +new file mode 100644 +index 000000000000..4a1ad4fd7bb2 +--- /dev/null ++++ b/arch/blackfin/configs/CM-BF527_defconfig +@@ -0,0 +1,129 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF527=y ++CONFIG_BF_REV_0_1=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BFIN527_BLUETECHNIX_CM=y ++CONFIG_IRQ_USB_INT0=11 ++CONFIG_IRQ_USB_INT1=11 ++CONFIG_IRQ_USB_INT2=11 ++CONFIG_IRQ_USB_DMA=11 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0xFFC0 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_GPIO_ADDR=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_SCSI=y ++CONFIG_BLK_DEV_SD=y ++# CONFIG_SCSI_LOWLEVEL is not set ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_BLACKFIN_TWI=m ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++CONFIG_USB=m ++CONFIG_USB_ANNOUNCE_NEW_DEVICES=y ++# CONFIG_USB_DEVICE_CLASS is not set ++CONFIG_USB_OTG_BLACKLIST_HUB=y ++CONFIG_USB_MON=m ++CONFIG_USB_MUSB_HDRC=m ++CONFIG_USB_MUSB_PERIPHERAL=y ++CONFIG_USB_GADGET_MUSB_HDRC=y ++CONFIG_MUSB_PIO_ONLY=y ++CONFIG_USB_STORAGE=m ++CONFIG_USB_GADGET=m ++CONFIG_USB_ETH=m ++CONFIG_USB_MASS_STORAGE=m ++CONFIG_USB_G_SERIAL=m ++CONFIG_USB_G_PRINTER=m ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++# CONFIG_DNOTIFY is not set ++CONFIG_MSDOS_FS=y ++CONFIG_VFAT_FS=y ++CONFIG_JFFS2_FS=y ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_ISO8859_1=y ++CONFIG_DEBUG_FS=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_EARLY_PRINTK=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_ITU_T=y ++CONFIG_CRC7=y +diff --git a/arch/blackfin/configs/PNAV-10_defconfig b/arch/blackfin/configs/PNAV-10_defconfig +new file mode 100644 +index 000000000000..9d787e28bbe8 +--- /dev/null ++++ b/arch/blackfin/configs/PNAV-10_defconfig +@@ -0,0 +1,111 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_PNAV10=y ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++CONFIG_IP_CHECKSUM_L1=y ++CONFIG_SYSCALL_TAB_L1=y ++CONFIG_CPLB_SWITCH_TAB_L1=y ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=y ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_1=0x33B0 ++CONFIG_BANK_2=0x33B0 ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_BFIN_MAC_USE_L1 is not set ++CONFIG_BFIN_TX_DESC_NUM=100 ++CONFIG_BFIN_RX_DESC_NUM=100 ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_TOUCHSCREEN=y ++CONFIG_TOUCHSCREEN_AD7877=y ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++CONFIG_SERIAL_BFIN_UART1=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_FB=y ++CONFIG_FIRMWARE_EDID=y ++CONFIG_BACKLIGHT_LCD_SUPPORT=y ++CONFIG_LCD_CLASS_DEVICE=y ++CONFIG_BACKLIGHT_CLASS_DEVICE=y ++CONFIG_SOUND=y ++CONFIG_SND=m ++# CONFIG_SND_SUPPORT_OLD_API is not set ++# CONFIG_SND_VERBOSE_PROCFS is not set ++CONFIG_SOUND_PRIME=y ++# CONFIG_HID is not set ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_DEBUG_HUNT_FOR_ZERO is not set ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++# CONFIG_ACCESS_CHECK is not set ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/blackfin/configs/SRV1_defconfig b/arch/blackfin/configs/SRV1_defconfig +new file mode 100644 +index 000000000000..225df32dc9a8 +--- /dev/null ++++ b/arch/blackfin/configs/SRV1_defconfig +@@ -0,0 +1,88 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_SYSVIPC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++CONFIG_KALLSYMS_ALL=y ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_IOSCHED_DEADLINE is not set ++CONFIG_PREEMPT=y ++CONFIG_BF537=y ++CONFIG_IRQ_TIMER0=12 ++CONFIG_BOOT_LOAD=0x400000 ++CONFIG_CLKIN_HZ=22118400 ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_DMA_UNCACHED_2M=y ++CONFIG_C_CDPRIO=y ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_PM=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_IPV6 is not set ++CONFIG_IRDA=m ++CONFIG_IRLAN=m ++CONFIG_IRCOMM=m ++CONFIG_IRDA_CACHE_LAST_LSAP=y ++CONFIG_IRTTY_SIR=m ++# CONFIG_WIRELESS is not set ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_JEDECPROBE=m ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_UCLINUX=y ++CONFIG_MTD_NAND=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_MISC_DEVICES=y ++CONFIG_EEPROM_AT25=m ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++CONFIG_INPUT_EVDEV=m ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=y ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_HWMON=m ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++# CONFIG_DNOTIFY is not set ++CONFIG_JFFS2_FS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3=y ++CONFIG_SMB_FS=m ++CONFIG_DEBUG_KERNEL=y ++# CONFIG_DEBUG_BUGVERBOSE is not set ++CONFIG_DEBUG_INFO=y ++# CONFIG_DEBUG_BFIN_NO_KERN_HWTRACE is not set ++CONFIG_CPLB_INFO=y +diff --git a/arch/blackfin/configs/TCM-BF518_defconfig b/arch/blackfin/configs/TCM-BF518_defconfig +new file mode 100644 +index 000000000000..425c24e43c34 +--- /dev/null ++++ b/arch/blackfin/configs/TCM-BF518_defconfig +@@ -0,0 +1,131 @@ ++CONFIG_EXPERIMENTAL=y ++CONFIG_KERNEL_LZMA=y ++CONFIG_SYSVIPC=y ++CONFIG_IKCONFIG=y ++CONFIG_IKCONFIG_PROC=y ++CONFIG_LOG_BUF_SHIFT=14 ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_RD_GZIP is not set ++CONFIG_RD_LZMA=y ++CONFIG_EXPERT=y ++# CONFIG_SYSCTL_SYSCALL is not set ++# CONFIG_ELF_CORE is not set ++# CONFIG_FUTEX is not set ++# CONFIG_SIGNALFD is not set ++# CONFIG_TIMERFD is not set ++# CONFIG_EVENTFD is not set ++# CONFIG_AIO is not set ++CONFIG_SLAB=y ++CONFIG_MMAP_ALLOW_UNINITIALIZED=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++# CONFIG_LBDAF is not set ++# CONFIG_BLK_DEV_BSG is not set ++# CONFIG_IOSCHED_DEADLINE is not set ++# CONFIG_IOSCHED_CFQ is not set ++CONFIG_PREEMPT=y ++CONFIG_BF518=y ++CONFIG_BF_REV_0_1=y ++CONFIG_BFIN518F_TCM=y ++CONFIG_IRQ_TIMER0=12 ++# CONFIG_CYCLES_CLOCKSOURCE is not set ++# CONFIG_SCHEDULE_L1 is not set ++# CONFIG_MEMSET_L1 is not set ++# CONFIG_MEMCPY_L1 is not set ++# CONFIG_SYS_BFIN_SPINLOCK_L1 is not set ++CONFIG_NOMMU_INITIAL_TRIM_EXCESS=0 ++CONFIG_BFIN_GPTIMERS=m ++CONFIG_C_CDPRIO=y ++CONFIG_BANK_3=0x99B2 ++CONFIG_BINFMT_FLAT=y ++CONFIG_BINFMT_ZFLAT=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_INET=y ++CONFIG_IP_PNP=y ++# CONFIG_INET_XFRM_MODE_TRANSPORT is not set ++# CONFIG_INET_XFRM_MODE_TUNNEL is not set ++# CONFIG_INET_XFRM_MODE_BEET is not set ++# CONFIG_INET_LRO is not set ++# CONFIG_INET_DIAG is not set ++# CONFIG_IPV6 is not set ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++# CONFIG_FW_LOADER is not set ++CONFIG_MTD=y ++CONFIG_MTD_CMDLINE_PARTS=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_ADV_OPTIONS=y ++CONFIG_MTD_CFI_GEOMETRY=y ++# CONFIG_MTD_MAP_BANK_WIDTH_1 is not set ++# CONFIG_MTD_MAP_BANK_WIDTH_4 is not set ++# CONFIG_MTD_CFI_I2 is not set ++CONFIG_MTD_CFI_INTELEXT=y ++CONFIG_MTD_RAM=y ++CONFIG_MTD_ROM=m ++CONFIG_MTD_PHYSMAP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_NETDEVICES=y ++CONFIG_NET_ETHERNET=y ++CONFIG_BFIN_MAC=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++# CONFIG_SERIO is not set ++# CONFIG_DEVKMEM is not set ++CONFIG_BFIN_JTAG_COMM=m ++CONFIG_SERIAL_BFIN=y ++CONFIG_SERIAL_BFIN_CONSOLE=y ++CONFIG_SERIAL_BFIN_UART0=y ++# CONFIG_LEGACY_PTYS is not set ++# CONFIG_HW_RANDOM is not set ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++CONFIG_I2C_BLACKFIN_TWI=y ++CONFIG_I2C_BLACKFIN_TWI_CLK_KHZ=100 ++CONFIG_SPI=y ++CONFIG_SPI_BFIN5XX=y ++CONFIG_GPIOLIB=y ++CONFIG_GPIO_SYSFS=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_BFIN_WDT=y ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_MMC=y ++CONFIG_MMC_DEBUG=y ++CONFIG_MMC_SPI=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_BFIN=y ++CONFIG_EXT2_FS=y ++# CONFIG_DNOTIFY is not set ++CONFIG_VFAT_FS=m ++# CONFIG_MISC_FILESYSTEMS is not set ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_ROOT_NFS=y ++CONFIG_NLS_CODEPAGE_437=m ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_UTF8=m ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_SHIRQ=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_INFO=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++# CONFIG_FTRACE is not set ++CONFIG_DEBUG_MMRS=y ++CONFIG_DEBUG_HWERR=y ++CONFIG_EXACT_HWERR=y ++CONFIG_DEBUG_DOUBLEFAULT=y ++CONFIG_DEBUG_BFIN_HWTRACE_COMPRESSION_ONE=y ++CONFIG_EARLY_PRINTK=y ++CONFIG_CPLB_INFO=y ++CONFIG_CRYPTO=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++CONFIG_CRC_CCITT=m +diff --git a/arch/mips/configs/fuloong2e_defconfig b/arch/mips/configs/fuloong2e_defconfig +index 499f51498ecb..f7cb39b0662c 100644 +--- a/arch/mips/configs/fuloong2e_defconfig ++++ b/arch/mips/configs/fuloong2e_defconfig +@@ -2,7 +2,8 @@ CONFIG_MACH_LOONGSON64=y + CONFIG_64BIT=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_LOCALVERSION="-fuloong2e" + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y +diff --git a/arch/mips/configs/gpr_defconfig b/arch/mips/configs/gpr_defconfig +index 55438fc9991e..db03ef4f737d 100644 +--- a/arch/mips/configs/gpr_defconfig ++++ b/arch/mips/configs/gpr_defconfig +@@ -1,7 +1,8 @@ + CONFIG_MIPS_ALCHEMY=y + CONFIG_MIPS_GPR=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +diff --git a/arch/mips/configs/ip22_defconfig b/arch/mips/configs/ip22_defconfig +index 7ddfb4ef9479..93e439ad3fce 100644 +--- a/arch/mips/configs/ip22_defconfig ++++ b/arch/mips/configs/ip22_defconfig +@@ -4,7 +4,8 @@ CONFIG_CPU_R5000=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/ip28_defconfig b/arch/mips/configs/ip28_defconfig +index d0a4c2cfacf8..6f0600e99c25 100644 +--- a/arch/mips/configs/ip28_defconfig ++++ b/arch/mips/configs/ip28_defconfig +@@ -1,6 +1,7 @@ + CONFIG_SGI_IP28=y + CONFIG_ARC_CONSOLE=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_IKCONFIG=y + CONFIG_IKCONFIG_PROC=y +diff --git a/arch/mips/configs/jazz_defconfig b/arch/mips/configs/jazz_defconfig +index 9ad1c94376c8..1d62ce7ff5dc 100644 +--- a/arch/mips/configs/jazz_defconfig ++++ b/arch/mips/configs/jazz_defconfig +@@ -1,6 +1,7 @@ + CONFIG_MACH_JAZZ=y + CONFIG_OLIVETTI_M700=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y +diff --git a/arch/mips/configs/mtx1_defconfig b/arch/mips/configs/mtx1_defconfig +index c3d0d0a6e044..aa3426d5f7d7 100644 +--- a/arch/mips/configs/mtx1_defconfig ++++ b/arch/mips/configs/mtx1_defconfig +@@ -1,6 +1,7 @@ + CONFIG_MIPS_ALCHEMY=y + CONFIG_MIPS_MTX1=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y +diff --git a/arch/mips/configs/nlm_xlr_defconfig b/arch/mips/configs/nlm_xlr_defconfig +index c4477a4d40c1..95caf0af665f 100644 +--- a/arch/mips/configs/nlm_xlr_defconfig ++++ b/arch/mips/configs/nlm_xlr_defconfig +@@ -5,7 +5,7 @@ CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 + CONFIG_SMP=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_KEXEC=y + CONFIG_CROSS_COMPILE="" + # CONFIG_LOCALVERSION_AUTO is not set +diff --git a/arch/mips/configs/pic32mzda_defconfig b/arch/mips/configs/pic32mzda_defconfig +index 41190c2036e6..3728897ab2b2 100644 +--- a/arch/mips/configs/pic32mzda_defconfig ++++ b/arch/mips/configs/pic32mzda_defconfig +@@ -1,7 +1,7 @@ + CONFIG_MACH_PIC32=y + CONFIG_DTB_PIC32_MZDA_SK=y + CONFIG_HZ_100=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_SECCOMP is not set + CONFIG_SYSVIPC=y + CONFIG_NO_HZ=y +diff --git a/arch/mips/configs/pistachio_defconfig b/arch/mips/configs/pistachio_defconfig +index b22a3cf149b6..cfffca3d37f4 100644 +--- a/arch/mips/configs/pistachio_defconfig ++++ b/arch/mips/configs/pistachio_defconfig +@@ -5,7 +5,7 @@ CONFIG_MIPS_CPS=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=32768 + CONFIG_ZSMALLOC=y + CONFIG_NR_CPUS=4 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_LOCALVERSION_AUTO is not set + CONFIG_DEFAULT_HOSTNAME="localhost" + CONFIG_SYSVIPC=y +diff --git a/arch/mips/configs/pnx8335_stb225_defconfig b/arch/mips/configs/pnx8335_stb225_defconfig +index e73cdb08fc6e..dc62fa8d6065 100644 +--- a/arch/mips/configs/pnx8335_stb225_defconfig ++++ b/arch/mips/configs/pnx8335_stb225_defconfig +@@ -3,7 +3,7 @@ CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_HZ_128=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_SECCOMP is not set + # CONFIG_LOCALVERSION_AUTO is not set + # CONFIG_SWAP is not set +diff --git a/arch/mips/configs/rm200_defconfig b/arch/mips/configs/rm200_defconfig +index 5f71aa598b06..767f1999ead0 100644 +--- a/arch/mips/configs/rm200_defconfig ++++ b/arch/mips/configs/rm200_defconfig +@@ -2,7 +2,8 @@ CONFIG_SNI_RM=y + CONFIG_CPU_LITTLE_ENDIAN=y + CONFIG_ARC_CONSOLE=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y + CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y +diff --git a/arch/parisc/configs/712_defconfig b/arch/parisc/configs/712_defconfig +index ccc109761f44..a6a5b0b7a9c9 100644 +--- a/arch/parisc/configs/712_defconfig ++++ b/arch/parisc/configs/712_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_GSC_LASI=y + # CONFIG_PDC_CHASSIS is not set + CONFIG_BINFMT_MISC=m +diff --git a/arch/parisc/configs/c3000_defconfig b/arch/parisc/configs/c3000_defconfig +index 8d41a73bd71b..b8e0a6662ff9 100644 +--- a/arch/parisc/configs/c3000_defconfig ++++ b/arch/parisc/configs/c3000_defconfig +@@ -13,7 +13,7 @@ CONFIG_MODULES=y + CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + CONFIG_PA8X00=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + # CONFIG_GSC is not set + CONFIG_PCI=y + CONFIG_PCI_LBA=y +diff --git a/arch/parisc/configs/default_defconfig b/arch/parisc/configs/default_defconfig +index 52c9050a7c5c..8d86d2e989f4 100644 +--- a/arch/parisc/configs/default_defconfig ++++ b/arch/parisc/configs/default_defconfig +@@ -14,7 +14,7 @@ CONFIG_MODULE_UNLOAD=y + CONFIG_MODULE_FORCE_UNLOAD=y + # CONFIG_BLK_DEV_BSG is not set + CONFIG_PA7100LC=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_IOMMU_CCIO=y + CONFIG_GSC_LASI=y + CONFIG_GSC_WAX=y +diff --git a/arch/powerpc/configs/c2k_defconfig b/arch/powerpc/configs/c2k_defconfig +new file mode 100644 +index 000000000000..04fee07ea6c5 +--- /dev/null ++++ b/arch/powerpc/configs/c2k_defconfig +@@ -0,0 +1,389 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_PROFILING=y ++CONFIG_OPROFILE=m ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODVERSIONS=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++# CONFIG_PPC_CHRP is not set ++# CONFIG_PPC_PMAC is not set ++CONFIG_EMBEDDED6xx=y ++CONFIG_PPC_C2K=y ++CONFIG_CPU_FREQ=y ++CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE=y ++CONFIG_CPU_FREQ_GOV_PERFORMANCE=y ++CONFIG_CPU_FREQ_GOV_POWERSAVE=m ++CONFIG_CPU_FREQ_GOV_ONDEMAND=m ++CONFIG_GEN_RTC=y ++CONFIG_HIGHMEM=y ++CONFIG_PREEMPT=y ++CONFIG_BINFMT_MISC=y ++CONFIG_PM=y ++CONFIG_PCI_MSI=y ++CONFIG_HOTPLUG_PCI=y ++CONFIG_HOTPLUG_PCI_SHPC=m ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_NET_KEY=m ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_IP_PNP=y ++CONFIG_IP_PNP_DHCP=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_NETFILTER=y ++# CONFIG_NETFILTER_XT_MATCH_SCTP is not set ++CONFIG_IP_NF_IPTABLES=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=m ++CONFIG_IP_NF_TARGET_REJECT=m ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_IP6_NF_IPTABLES=m ++CONFIG_IP6_NF_MATCH_EUI64=m ++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_IP_SCTP=m ++CONFIG_ATM=m ++CONFIG_ATM_CLIP=m ++CONFIG_ATM_LANE=m ++CONFIG_ATM_BR2684=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_ATM=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_IND=y ++CONFIG_BT=m ++CONFIG_BT_RFCOMM=m ++CONFIG_BT_RFCOMM_TTY=y ++CONFIG_BT_BNEP=m ++CONFIG_BT_BNEP_MC_FILTER=y ++CONFIG_BT_BNEP_PROTO_FILTER=y ++CONFIG_BT_HIDP=m ++CONFIG_BT_HCIUART=m ++CONFIG_BT_HCIUART_H4=y ++CONFIG_BT_HCIUART_BCSP=y ++CONFIG_BT_HCIBCM203X=m ++CONFIG_BT_HCIBFUSB=m ++CONFIG_BT_HCIVHCI=m ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_MTD=y ++CONFIG_MTD_BLOCK=y ++CONFIG_MTD_CFI=y ++CONFIG_MTD_CFI_AMDSTD=y ++CONFIG_MTD_COMPLEX_MAPPINGS=y ++CONFIG_MTD_PHYSMAP_OF=y ++CONFIG_BLK_DEV_LOOP=m ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_NBD=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_SCSI=m ++CONFIG_BLK_DEV_SD=m ++CONFIG_CHR_DEV_ST=m ++CONFIG_CHR_DEV_OSST=m ++CONFIG_BLK_DEV_SR=m ++CONFIG_BLK_DEV_SR_VENDOR=y ++CONFIG_CHR_DEV_SG=m ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_ISCSI_ATTRS=m ++CONFIG_BLK_DEV_3W_XXXX_RAID=m ++CONFIG_SCSI_3W_9XXX=m ++CONFIG_SCSI_ACARD=m ++CONFIG_SCSI_AACRAID=m ++CONFIG_SCSI_AIC7XXX=m ++CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 ++CONFIG_AIC7XXX_RESET_DELAY_MS=15000 ++# CONFIG_AIC7XXX_DEBUG_ENABLE is not set ++# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_AIC79XX=m ++CONFIG_AIC79XX_CMDS_PER_DEVICE=4 ++CONFIG_AIC79XX_RESET_DELAY_MS=15000 ++# CONFIG_AIC79XX_DEBUG_ENABLE is not set ++# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set ++CONFIG_SCSI_ARCMSR=m ++CONFIG_MEGARAID_NEWGEN=y ++CONFIG_MEGARAID_MM=m ++CONFIG_MEGARAID_MAILBOX=m ++CONFIG_MEGARAID_SAS=m ++CONFIG_SCSI_GDTH=m ++CONFIG_SCSI_IPS=m ++CONFIG_SCSI_INITIO=m ++CONFIG_SCSI_SYM53C8XX_2=m ++CONFIG_SCSI_QLOGIC_1280=m ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_NETCONSOLE=m ++CONFIG_TUN=m ++# CONFIG_ATM_DRIVERS is not set ++CONFIG_MV643XX_ETH=y ++CONFIG_VITESSE_PHY=y ++CONFIG_INPUT_EVDEV=y ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++CONFIG_INPUT_MISC=y ++CONFIG_INPUT_UINPUT=m ++# CONFIG_SERIO is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_SERIAL_MPSC=y ++CONFIG_SERIAL_MPSC_CONSOLE=y ++CONFIG_NVRAM=m ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++CONFIG_I2C=m ++CONFIG_I2C_CHARDEV=m ++CONFIG_I2C_MV64XXX=m ++CONFIG_HWMON=m ++CONFIG_SENSORS_ADM1021=m ++CONFIG_SENSORS_ADM1025=m ++CONFIG_SENSORS_ADM1026=m ++CONFIG_SENSORS_ADM1031=m ++CONFIG_SENSORS_DS1621=m ++CONFIG_SENSORS_GL518SM=m ++CONFIG_SENSORS_MAX1619=m ++CONFIG_SENSORS_LM75=m ++CONFIG_SENSORS_LM77=m ++CONFIG_SENSORS_LM78=m ++CONFIG_SENSORS_LM80=m ++CONFIG_SENSORS_LM83=m ++CONFIG_SENSORS_LM85=m ++CONFIG_SENSORS_LM87=m ++CONFIG_SENSORS_LM90=m ++CONFIG_SENSORS_PCF8591=m ++CONFIG_SENSORS_VIA686A=m ++CONFIG_SENSORS_W83781D=m ++CONFIG_SENSORS_W83L785TS=m ++CONFIG_WATCHDOG=y ++CONFIG_SOFT_WATCHDOG=m ++CONFIG_PCIPCWATCHDOG=m ++CONFIG_WDTPCI=m ++CONFIG_USBPCWATCHDOG=m ++# CONFIG_VGA_CONSOLE is not set ++CONFIG_USB=m ++CONFIG_USB_MON=m ++CONFIG_USB_EHCI_HCD=m ++CONFIG_USB_EHCI_ROOT_HUB_TT=y ++CONFIG_USB_OHCI_HCD=m ++CONFIG_USB_OHCI_HCD_PPC_OF_BE=y ++CONFIG_USB_UHCI_HCD=m ++CONFIG_USB_ACM=m ++CONFIG_USB_PRINTER=m ++CONFIG_USB_STORAGE=m ++CONFIG_USB_STORAGE_DATAFAB=m ++CONFIG_USB_STORAGE_FREECOM=m ++CONFIG_USB_STORAGE_ISD200=m ++CONFIG_USB_STORAGE_SDDR09=m ++CONFIG_USB_STORAGE_SDDR55=m ++CONFIG_USB_STORAGE_JUMPSHOT=m ++CONFIG_USB_MDC800=m ++CONFIG_USB_MICROTEK=m ++CONFIG_USB_SERIAL=m ++CONFIG_USB_SERIAL_GENERIC=y ++CONFIG_USB_SERIAL_BELKIN=m ++CONFIG_USB_SERIAL_WHITEHEAT=m ++CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m ++CONFIG_USB_SERIAL_EMPEG=m ++CONFIG_USB_SERIAL_FTDI_SIO=m ++CONFIG_USB_SERIAL_VISOR=m ++CONFIG_USB_SERIAL_IPAQ=m ++CONFIG_USB_SERIAL_IR=m ++CONFIG_USB_SERIAL_EDGEPORT=m ++CONFIG_USB_SERIAL_EDGEPORT_TI=m ++CONFIG_USB_SERIAL_KEYSPAN_PDA=m ++CONFIG_USB_SERIAL_KEYSPAN=m ++CONFIG_USB_SERIAL_KLSI=m ++CONFIG_USB_SERIAL_KOBIL_SCT=m ++CONFIG_USB_SERIAL_MCT_U232=m ++CONFIG_USB_SERIAL_PL2303=m ++CONFIG_USB_SERIAL_SAFE=m ++CONFIG_USB_SERIAL_SAFE_PADDED=y ++CONFIG_USB_SERIAL_CYBERJACK=m ++CONFIG_USB_SERIAL_XIRCOM=m ++CONFIG_USB_SERIAL_OMNINET=m ++CONFIG_USB_EMI62=m ++CONFIG_USB_RIO500=m ++CONFIG_USB_LEGOTOWER=m ++CONFIG_USB_LCD=m ++CONFIG_USB_LED=m ++CONFIG_USB_TEST=m ++CONFIG_USB_ATM=m ++CONFIG_USB_SPEEDTOUCH=m ++CONFIG_INFINIBAND=m ++CONFIG_INFINIBAND_USER_MAD=m ++CONFIG_INFINIBAND_USER_ACCESS=m ++CONFIG_INFINIBAND_MTHCA=m ++CONFIG_INFINIBAND_IPOIB=m ++CONFIG_INFINIBAND_IPOIB_CM=y ++CONFIG_INFINIBAND_SRP=m ++CONFIG_DMADEVICES=y ++CONFIG_EXT4_FS=m ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_QUOTA=y ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_HFS_FS=m ++CONFIG_HFSPLUS_FS=m ++CONFIG_JFFS2_FS=y ++CONFIG_CRAMFS=m ++CONFIG_VXFS_FS=m ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_ROOT_NFS=y ++CONFIG_CIFS=m ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC_T10DIF=m ++CONFIG_DEBUG_INFO=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_KERNEL=y ++CONFIG_DEBUG_STACK_USAGE=y ++CONFIG_DEBUG_HIGHMEM=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_DETECT_HUNG_TASK=y ++CONFIG_DEBUG_SPINLOCK=y ++CONFIG_BOOTX_TEXT=y ++CONFIG_PPC_EARLY_DEBUG=y ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m +diff --git a/arch/powerpc/configs/ppc6xx_defconfig b/arch/powerpc/configs/ppc6xx_defconfig +index 7ee736f20774..8663c0043a56 100644 +--- a/arch/powerpc/configs/ppc6xx_defconfig ++++ b/arch/powerpc/configs/ppc6xx_defconfig +@@ -74,7 +74,7 @@ CONFIG_QE_GPIO=y + CONFIG_MCU_MPC8349EMITX=y + CONFIG_HIGHMEM=y + CONFIG_HZ_1000=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_BINFMT_MISC=y + CONFIG_HIBERNATION=y + CONFIG_PM_DEBUG=y +diff --git a/arch/score/configs/spct6600_defconfig b/arch/score/configs/spct6600_defconfig +new file mode 100644 +index 000000000000..46434ca1fa10 +--- /dev/null ++++ b/arch/score/configs/spct6600_defconfig +@@ -0,0 +1,84 @@ ++CONFIG_HZ_100=y ++CONFIG_PREEMPT=y ++CONFIG_EXPERIMENTAL=y ++# CONFIG_LOCALVERSION_AUTO is not set ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_LOG_BUF_SHIFT=12 ++CONFIG_SYSFS_DEPRECATED_V2=y ++CONFIG_BLK_DEV_INITRD=y ++# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set ++CONFIG_EXPERT=y ++# CONFIG_KALLSYMS is not set ++# CONFIG_HOTPLUG is not set ++CONFIG_SLAB=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_MODULE_FORCE_UNLOAD=y ++# CONFIG_BLK_DEV_BSG is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_UNIX=y ++CONFIG_NET_KEY=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_ARPD=y ++# CONFIG_INET_LRO is not set ++# CONFIG_IPV6 is not set ++# CONFIG_STANDALONE is not set ++# CONFIG_PREVENT_FIRMWARE_BUILD is not set ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=y ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_COUNT=1 ++# CONFIG_MISC_DEVICES is not set ++CONFIG_NETDEVICES=y ++# CONFIG_NETDEV_1000 is not set ++# CONFIG_NETDEV_10000 is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++CONFIG_SERIAL_NONSTANDARD=y ++CONFIG_STALDRV=y ++# CONFIG_HW_RANDOM is not set ++CONFIG_RAW_DRIVER=y ++CONFIG_MAX_RAW_DEVS=8192 ++# CONFIG_HWMON is not set ++# CONFIG_VGA_CONSOLE is not set ++# CONFIG_HID_SUPPORT is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT3_FS=y ++# CONFIG_EXT3_DEFAULTS_TO_ORDERED is not set ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_AUTOFS_FS=y ++CONFIG_AUTOFS4_FS=y ++CONFIG_PROC_KCORE=y ++# CONFIG_PROC_PAGE_MONITOR is not set ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_NFS_FS=y ++CONFIG_NFS_V3=y ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=y ++CONFIG_NFSD=y ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++# CONFIG_RCU_CPU_STALL_DETECTOR is not set ++CONFIG_SECURITY=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_CRYPTO_NULL=y ++CONFIG_CRYPTO_CRYPTD=y ++CONFIG_CRYPTO_SEQIV=y ++CONFIG_CRYPTO_MD4=y ++CONFIG_CRYPTO_MICHAEL_MIC=y ++# CONFIG_CRYPTO_ANSI_CPRNG is not set ++# CONFIG_CRYPTO_HW is not set ++CONFIG_CRC_CCITT=y ++CONFIG_CRC16=y ++CONFIG_LIBCRC32C=y +diff --git a/arch/sh/configs/se7712_defconfig b/arch/sh/configs/se7712_defconfig +index 5a1097641247..eb5fbf554e7f 100644 +--- a/arch/sh/configs/se7712_defconfig ++++ b/arch/sh/configs/se7712_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=66666666 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda1" + CONFIG_NET=y +diff --git a/arch/sh/configs/se7721_defconfig b/arch/sh/configs/se7721_defconfig +index 9c0ef13bee10..cbaa65c8bf9e 100644 +--- a/arch/sh/configs/se7721_defconfig ++++ b/arch/sh/configs/se7721_defconfig +@@ -23,7 +23,7 @@ CONFIG_FLATMEM_MANUAL=y + CONFIG_SH_7721_SOLUTION_ENGINE=y + CONFIG_SH_PCLK_FREQ=33333333 + CONFIG_HEARTBEAT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC0,115200 root=/dev/sda2" + CONFIG_NET=y +diff --git a/arch/sh/configs/titan_defconfig b/arch/sh/configs/titan_defconfig +index ceb48e9b70f4..1a69eda6610c 100644 +--- a/arch/sh/configs/titan_defconfig ++++ b/arch/sh/configs/titan_defconfig +@@ -20,7 +20,7 @@ CONFIG_SH_TITAN=y + CONFIG_SH_PCLK_FREQ=30000000 + CONFIG_SH_DMA=y + CONFIG_SH_DMA_API=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_CMDLINE_OVERWRITE=y + CONFIG_CMDLINE="console=ttySC1,38400N81 root=/dev/nfs ip=:::::eth1:autoconf rw" + CONFIG_PCI=y +diff --git a/arch/sparc/configs/sparc64_defconfig b/arch/sparc/configs/sparc64_defconfig +index 4d4e1cc6402f..04bea1d28ba7 100644 +--- a/arch/sparc/configs/sparc64_defconfig ++++ b/arch/sparc/configs/sparc64_defconfig +@@ -22,7 +22,7 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_NUMA=y + CONFIG_DEFAULT_MMAP_MIN_ADDR=8192 +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_SUN_LDOMS=y + CONFIG_PCI=y + CONFIG_PCI_MSI=y +diff --git a/arch/tile/configs/tilegx_defconfig b/arch/tile/configs/tilegx_defconfig +new file mode 100644 +index 000000000000..939c63ba7e6e +--- /dev/null ++++ b/arch/tile/configs/tilegx_defconfig +@@ -0,0 +1,411 @@ ++CONFIG_TILEGX=y ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_FHANDLE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_KPROBES=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_NR_CPUS=100 ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_TILE_PCI_IO=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_SCSI_SAS_ATA=y ++CONFIG_ISCSI_TCP=m ++CONFIG_SCSI_MVSAS=y ++# CONFIG_SCSI_MVSAS_DEBUG is not set ++CONFIG_SCSI_MVSAS_TASKLET=y ++CONFIG_ATA=y ++CONFIG_SATA_AHCI=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_TARGET_CORE=m ++CONFIG_TCM_IBLOCK=m ++CONFIG_TCM_FILEIO=m ++CONFIG_TCM_PSCSI=m ++CONFIG_LOOPBACK_TARGET=m ++CONFIG_ISCSI_TARGET=m ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++CONFIG_SKY2=y ++CONFIG_PTP_1588_CLOCK_TILEGX=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_SERIAL_TILEGX=y ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++CONFIG_DRM=m ++CONFIG_DRM_TDFX=m ++CONFIG_DRM_R128=m ++CONFIG_DRM_MGA=m ++CONFIG_DRM_VIA=m ++CONFIG_DRM_SAVAGE=m ++CONFIG_USB=y ++CONFIG_USB_EHCI_HCD=y ++CONFIG_USB_OHCI_HCD=y ++CONFIG_USB_STORAGE=y ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y ++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_KGDB=y ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m +diff --git a/arch/tile/configs/tilepro_defconfig b/arch/tile/configs/tilepro_defconfig +new file mode 100644 +index 000000000000..e8c4003cbd81 +--- /dev/null ++++ b/arch/tile/configs/tilepro_defconfig +@@ -0,0 +1,524 @@ ++CONFIG_SYSVIPC=y ++CONFIG_POSIX_MQUEUE=y ++CONFIG_AUDIT=y ++CONFIG_NO_HZ=y ++CONFIG_HIGH_RES_TIMERS=y ++CONFIG_BSD_PROCESS_ACCT=y ++CONFIG_BSD_PROCESS_ACCT_V3=y ++CONFIG_TASKSTATS=y ++CONFIG_TASK_DELAY_ACCT=y ++CONFIG_TASK_XACCT=y ++CONFIG_TASK_IO_ACCOUNTING=y ++CONFIG_LOG_BUF_SHIFT=19 ++CONFIG_CGROUPS=y ++CONFIG_CGROUP_DEBUG=y ++CONFIG_CGROUP_DEVICE=y ++CONFIG_CPUSETS=y ++CONFIG_CGROUP_CPUACCT=y ++CONFIG_CGROUP_SCHED=y ++CONFIG_RT_GROUP_SCHED=y ++CONFIG_BLK_CGROUP=y ++CONFIG_NAMESPACES=y ++CONFIG_RELAY=y ++CONFIG_BLK_DEV_INITRD=y ++CONFIG_RD_XZ=y ++CONFIG_SYSCTL_SYSCALL=y ++CONFIG_EMBEDDED=y ++# CONFIG_COMPAT_BRK is not set ++CONFIG_PROFILING=y ++CONFIG_MODULES=y ++CONFIG_MODULE_FORCE_LOAD=y ++CONFIG_MODULE_UNLOAD=y ++CONFIG_BLK_DEV_INTEGRITY=y ++CONFIG_PARTITION_ADVANCED=y ++CONFIG_OSF_PARTITION=y ++CONFIG_AMIGA_PARTITION=y ++CONFIG_MAC_PARTITION=y ++CONFIG_BSD_DISKLABEL=y ++CONFIG_MINIX_SUBPARTITION=y ++CONFIG_SOLARIS_X86_PARTITION=y ++CONFIG_UNIXWARE_DISKLABEL=y ++CONFIG_SGI_PARTITION=y ++CONFIG_SUN_PARTITION=y ++CONFIG_KARMA_PARTITION=y ++CONFIG_CFQ_GROUP_IOSCHED=y ++CONFIG_HZ_100=y ++# CONFIG_COMPACTION is not set ++CONFIG_PREEMPT=y ++CONFIG_PCI_DEBUG=y ++# CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS is not set ++CONFIG_BINFMT_MISC=y ++CONFIG_NET=y ++CONFIG_PACKET=y ++CONFIG_UNIX=y ++CONFIG_XFRM_USER=y ++CONFIG_XFRM_SUB_POLICY=y ++CONFIG_XFRM_STATISTICS=y ++CONFIG_NET_KEY=m ++CONFIG_NET_KEY_MIGRATE=y ++CONFIG_INET=y ++CONFIG_IP_MULTICAST=y ++CONFIG_IP_ADVANCED_ROUTER=y ++CONFIG_IP_MULTIPLE_TABLES=y ++CONFIG_IP_ROUTE_MULTIPATH=y ++CONFIG_IP_ROUTE_VERBOSE=y ++CONFIG_NET_IPIP=m ++CONFIG_IP_MROUTE=y ++CONFIG_IP_PIMSM_V1=y ++CONFIG_IP_PIMSM_V2=y ++CONFIG_SYN_COOKIES=y ++CONFIG_INET_AH=m ++CONFIG_INET_ESP=m ++CONFIG_INET_IPCOMP=m ++CONFIG_INET_XFRM_MODE_TRANSPORT=m ++CONFIG_INET_XFRM_MODE_TUNNEL=m ++CONFIG_INET_XFRM_MODE_BEET=m ++CONFIG_INET_DIAG=m ++CONFIG_TCP_CONG_ADVANCED=y ++CONFIG_TCP_CONG_HSTCP=m ++CONFIG_TCP_CONG_HYBLA=m ++CONFIG_TCP_CONG_SCALABLE=m ++CONFIG_TCP_CONG_LP=m ++CONFIG_TCP_CONG_VENO=m ++CONFIG_TCP_CONG_YEAH=m ++CONFIG_TCP_CONG_ILLINOIS=m ++CONFIG_TCP_MD5SIG=y ++CONFIG_IPV6=y ++CONFIG_IPV6_ROUTER_PREF=y ++CONFIG_IPV6_ROUTE_INFO=y ++CONFIG_IPV6_OPTIMISTIC_DAD=y ++CONFIG_INET6_AH=m ++CONFIG_INET6_ESP=m ++CONFIG_INET6_IPCOMP=m ++CONFIG_IPV6_MIP6=m ++CONFIG_INET6_XFRM_MODE_TRANSPORT=m ++CONFIG_INET6_XFRM_MODE_TUNNEL=m ++CONFIG_INET6_XFRM_MODE_BEET=m ++CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m ++CONFIG_IPV6_SIT=m ++CONFIG_IPV6_TUNNEL=m ++CONFIG_IPV6_MULTIPLE_TABLES=y ++CONFIG_IPV6_MROUTE=y ++CONFIG_IPV6_PIMSM_V2=y ++CONFIG_NETLABEL=y ++CONFIG_NETFILTER=y ++CONFIG_NF_CONNTRACK=m ++CONFIG_NF_CONNTRACK_SECMARK=y ++CONFIG_NF_CONNTRACK_ZONES=y ++CONFIG_NF_CONNTRACK_EVENTS=y ++CONFIG_NF_CT_PROTO_DCCP=m ++CONFIG_NF_CT_PROTO_UDPLITE=m ++CONFIG_NF_CONNTRACK_AMANDA=m ++CONFIG_NF_CONNTRACK_FTP=m ++CONFIG_NF_CONNTRACK_H323=m ++CONFIG_NF_CONNTRACK_IRC=m ++CONFIG_NF_CONNTRACK_NETBIOS_NS=m ++CONFIG_NF_CONNTRACK_PPTP=m ++CONFIG_NF_CONNTRACK_SANE=m ++CONFIG_NF_CONNTRACK_SIP=m ++CONFIG_NF_CONNTRACK_TFTP=m ++CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m ++CONFIG_NETFILTER_XT_TARGET_CONNMARK=m ++CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m ++CONFIG_NETFILTER_XT_TARGET_DSCP=m ++CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m ++CONFIG_NETFILTER_XT_TARGET_MARK=m ++CONFIG_NETFILTER_XT_TARGET_NFLOG=m ++CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m ++CONFIG_NETFILTER_XT_TARGET_NOTRACK=m ++CONFIG_NETFILTER_XT_TARGET_TEE=m ++CONFIG_NETFILTER_XT_TARGET_TPROXY=m ++CONFIG_NETFILTER_XT_TARGET_TRACE=m ++CONFIG_NETFILTER_XT_TARGET_SECMARK=m ++CONFIG_NETFILTER_XT_TARGET_TCPMSS=m ++CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m ++CONFIG_NETFILTER_XT_MATCH_CLUSTER=m ++CONFIG_NETFILTER_XT_MATCH_COMMENT=m ++CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m ++CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_CONNMARK=m ++CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m ++CONFIG_NETFILTER_XT_MATCH_DCCP=m ++CONFIG_NETFILTER_XT_MATCH_DSCP=m ++CONFIG_NETFILTER_XT_MATCH_ESP=m ++CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m ++CONFIG_NETFILTER_XT_MATCH_HELPER=m ++CONFIG_NETFILTER_XT_MATCH_IPRANGE=m ++CONFIG_NETFILTER_XT_MATCH_IPVS=m ++CONFIG_NETFILTER_XT_MATCH_LENGTH=m ++CONFIG_NETFILTER_XT_MATCH_LIMIT=m ++CONFIG_NETFILTER_XT_MATCH_MAC=m ++CONFIG_NETFILTER_XT_MATCH_MARK=m ++CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m ++CONFIG_NETFILTER_XT_MATCH_OSF=m ++CONFIG_NETFILTER_XT_MATCH_OWNER=m ++CONFIG_NETFILTER_XT_MATCH_POLICY=m ++CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m ++CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m ++CONFIG_NETFILTER_XT_MATCH_QUOTA=m ++CONFIG_NETFILTER_XT_MATCH_RATEEST=m ++CONFIG_NETFILTER_XT_MATCH_REALM=m ++CONFIG_NETFILTER_XT_MATCH_RECENT=m ++CONFIG_NETFILTER_XT_MATCH_SOCKET=m ++CONFIG_NETFILTER_XT_MATCH_STATE=m ++CONFIG_NETFILTER_XT_MATCH_STATISTIC=m ++CONFIG_NETFILTER_XT_MATCH_STRING=m ++CONFIG_NETFILTER_XT_MATCH_TCPMSS=m ++CONFIG_NETFILTER_XT_MATCH_TIME=m ++CONFIG_NETFILTER_XT_MATCH_U32=m ++CONFIG_IP_VS=m ++CONFIG_IP_VS_IPV6=y ++CONFIG_IP_VS_PROTO_TCP=y ++CONFIG_IP_VS_PROTO_UDP=y ++CONFIG_IP_VS_PROTO_ESP=y ++CONFIG_IP_VS_PROTO_AH=y ++CONFIG_IP_VS_PROTO_SCTP=y ++CONFIG_IP_VS_RR=m ++CONFIG_IP_VS_WRR=m ++CONFIG_IP_VS_LC=m ++CONFIG_IP_VS_WLC=m ++CONFIG_IP_VS_LBLC=m ++CONFIG_IP_VS_LBLCR=m ++CONFIG_IP_VS_SED=m ++CONFIG_IP_VS_NQ=m ++CONFIG_NF_CONNTRACK_IPV4=m ++# CONFIG_NF_CONNTRACK_PROC_COMPAT is not set ++CONFIG_IP_NF_IPTABLES=y ++CONFIG_IP_NF_MATCH_AH=m ++CONFIG_IP_NF_MATCH_ECN=m ++CONFIG_IP_NF_MATCH_TTL=m ++CONFIG_IP_NF_FILTER=y ++CONFIG_IP_NF_TARGET_REJECT=y ++CONFIG_IP_NF_MANGLE=m ++CONFIG_IP_NF_TARGET_ECN=m ++CONFIG_IP_NF_TARGET_TTL=m ++CONFIG_IP_NF_RAW=m ++CONFIG_IP_NF_SECURITY=m ++CONFIG_IP_NF_ARPTABLES=m ++CONFIG_IP_NF_ARPFILTER=m ++CONFIG_IP_NF_ARP_MANGLE=m ++CONFIG_NF_CONNTRACK_IPV6=m ++CONFIG_IP6_NF_MATCH_AH=m ++CONFIG_IP6_NF_MATCH_EUI64=m ++CONFIG_IP6_NF_MATCH_FRAG=m ++CONFIG_IP6_NF_MATCH_OPTS=m ++CONFIG_IP6_NF_MATCH_HL=m ++CONFIG_IP6_NF_MATCH_IPV6HEADER=m ++CONFIG_IP6_NF_MATCH_MH=m ++CONFIG_IP6_NF_MATCH_RT=m ++CONFIG_IP6_NF_TARGET_HL=m ++CONFIG_IP6_NF_FILTER=m ++CONFIG_IP6_NF_TARGET_REJECT=m ++CONFIG_IP6_NF_MANGLE=m ++CONFIG_IP6_NF_RAW=m ++CONFIG_IP6_NF_SECURITY=m ++CONFIG_BRIDGE_NF_EBTABLES=m ++CONFIG_BRIDGE_EBT_BROUTE=m ++CONFIG_BRIDGE_EBT_T_FILTER=m ++CONFIG_BRIDGE_EBT_T_NAT=m ++CONFIG_BRIDGE_EBT_802_3=m ++CONFIG_BRIDGE_EBT_AMONG=m ++CONFIG_BRIDGE_EBT_ARP=m ++CONFIG_BRIDGE_EBT_IP=m ++CONFIG_BRIDGE_EBT_IP6=m ++CONFIG_BRIDGE_EBT_LIMIT=m ++CONFIG_BRIDGE_EBT_MARK=m ++CONFIG_BRIDGE_EBT_PKTTYPE=m ++CONFIG_BRIDGE_EBT_STP=m ++CONFIG_BRIDGE_EBT_VLAN=m ++CONFIG_BRIDGE_EBT_ARPREPLY=m ++CONFIG_BRIDGE_EBT_DNAT=m ++CONFIG_BRIDGE_EBT_MARK_T=m ++CONFIG_BRIDGE_EBT_REDIRECT=m ++CONFIG_BRIDGE_EBT_SNAT=m ++CONFIG_BRIDGE_EBT_LOG=m ++CONFIG_BRIDGE_EBT_ULOG=m ++CONFIG_BRIDGE_EBT_NFLOG=m ++CONFIG_RDS=m ++CONFIG_RDS_TCP=m ++CONFIG_BRIDGE=m ++CONFIG_VLAN_8021Q=m ++CONFIG_VLAN_8021Q_GVRP=y ++CONFIG_PHONET=m ++CONFIG_NET_SCHED=y ++CONFIG_NET_SCH_CBQ=m ++CONFIG_NET_SCH_HTB=m ++CONFIG_NET_SCH_HFSC=m ++CONFIG_NET_SCH_PRIO=m ++CONFIG_NET_SCH_MULTIQ=m ++CONFIG_NET_SCH_RED=m ++CONFIG_NET_SCH_SFQ=m ++CONFIG_NET_SCH_TEQL=m ++CONFIG_NET_SCH_TBF=m ++CONFIG_NET_SCH_GRED=m ++CONFIG_NET_SCH_DSMARK=m ++CONFIG_NET_SCH_NETEM=m ++CONFIG_NET_SCH_DRR=m ++CONFIG_NET_SCH_INGRESS=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_TCINDEX=m ++CONFIG_NET_CLS_ROUTE4=m ++CONFIG_NET_CLS_FW=m ++CONFIG_NET_CLS_U32=m ++CONFIG_CLS_U32_PERF=y ++CONFIG_CLS_U32_MARK=y ++CONFIG_NET_CLS_RSVP=m ++CONFIG_NET_CLS_RSVP6=m ++CONFIG_NET_CLS_FLOW=m ++CONFIG_NET_CLS_CGROUP=y ++CONFIG_NET_EMATCH=y ++CONFIG_NET_EMATCH_CMP=m ++CONFIG_NET_EMATCH_NBYTE=m ++CONFIG_NET_EMATCH_U32=m ++CONFIG_NET_EMATCH_META=m ++CONFIG_NET_EMATCH_TEXT=m ++CONFIG_NET_CLS_ACT=y ++CONFIG_NET_ACT_POLICE=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_GACT_PROB=y ++CONFIG_NET_ACT_MIRRED=m ++CONFIG_NET_ACT_IPT=m ++CONFIG_NET_ACT_NAT=m ++CONFIG_NET_ACT_PEDIT=m ++CONFIG_NET_ACT_SIMP=m ++CONFIG_NET_ACT_SKBEDIT=m ++CONFIG_NET_CLS_IND=y ++CONFIG_DCB=y ++CONFIG_DNS_RESOLVER=y ++# CONFIG_WIRELESS is not set ++CONFIG_UEVENT_HELPER_PATH="/sbin/hotplug" ++CONFIG_DEVTMPFS=y ++CONFIG_DEVTMPFS_MOUNT=y ++CONFIG_CONNECTOR=y ++CONFIG_BLK_DEV_LOOP=y ++CONFIG_BLK_DEV_CRYPTOLOOP=m ++CONFIG_BLK_DEV_SX8=m ++CONFIG_BLK_DEV_RAM=y ++CONFIG_BLK_DEV_RAM_SIZE=16384 ++CONFIG_ATA_OVER_ETH=m ++CONFIG_RAID_ATTRS=m ++CONFIG_BLK_DEV_SD=y ++CONFIG_SCSI_CONSTANTS=y ++CONFIG_SCSI_LOGGING=y ++CONFIG_ATA=y ++CONFIG_SATA_SIL24=y ++# CONFIG_ATA_SFF is not set ++CONFIG_MD=y ++CONFIG_BLK_DEV_MD=y ++CONFIG_MD_LINEAR=m ++CONFIG_MD_RAID0=m ++CONFIG_MD_RAID1=m ++CONFIG_MD_RAID10=m ++CONFIG_MD_RAID456=m ++CONFIG_MD_FAULTY=m ++CONFIG_BLK_DEV_DM=m ++CONFIG_DM_DEBUG=y ++CONFIG_DM_CRYPT=m ++CONFIG_DM_SNAPSHOT=m ++CONFIG_DM_MIRROR=m ++CONFIG_DM_LOG_USERSPACE=m ++CONFIG_DM_ZERO=m ++CONFIG_DM_MULTIPATH=m ++CONFIG_DM_MULTIPATH_QL=m ++CONFIG_DM_MULTIPATH_ST=m ++CONFIG_DM_DELAY=m ++CONFIG_DM_UEVENT=y ++CONFIG_FUSION=y ++CONFIG_FUSION_SAS=y ++CONFIG_NETDEVICES=y ++CONFIG_BONDING=m ++CONFIG_DUMMY=m ++CONFIG_IFB=m ++CONFIG_MACVLAN=m ++CONFIG_MACVTAP=m ++CONFIG_NETCONSOLE=m ++CONFIG_NETCONSOLE_DYNAMIC=y ++CONFIG_TUN=y ++CONFIG_VETH=m ++CONFIG_NET_DSA_MV88E6060=y ++CONFIG_NET_DSA_MV88E6XXX=y ++# CONFIG_NET_VENDOR_3COM is not set ++CONFIG_E1000E=y ++# CONFIG_WLAN is not set ++# CONFIG_INPUT_MOUSEDEV is not set ++# CONFIG_INPUT_KEYBOARD is not set ++# CONFIG_INPUT_MOUSE is not set ++# CONFIG_SERIO is not set ++# CONFIG_VT is not set ++# CONFIG_LEGACY_PTYS is not set ++CONFIG_HW_RANDOM=y ++CONFIG_HW_RANDOM_TIMERIOMEM=m ++CONFIG_I2C=y ++CONFIG_I2C_CHARDEV=y ++# CONFIG_HWMON is not set ++CONFIG_WATCHDOG=y ++CONFIG_WATCHDOG_NOWAYOUT=y ++# CONFIG_VGA_ARB is not set ++# CONFIG_USB_SUPPORT is not set ++CONFIG_EDAC=y ++CONFIG_RTC_CLASS=y ++CONFIG_RTC_DRV_TILE=y ++CONFIG_EXT2_FS=y ++CONFIG_EXT2_FS_XATTR=y ++CONFIG_EXT2_FS_POSIX_ACL=y ++CONFIG_EXT2_FS_SECURITY=y ++CONFIG_EXT2_FS_XIP=y ++CONFIG_EXT3_FS=y ++CONFIG_EXT3_FS_POSIX_ACL=y ++CONFIG_EXT3_FS_SECURITY=y ++CONFIG_EXT4_FS=y ++CONFIG_EXT4_FS_POSIX_ACL=y ++CONFIG_EXT4_FS_SECURITY=y ++CONFIG_XFS_FS=y ++CONFIG_XFS_QUOTA=y ++CONFIG_XFS_POSIX_ACL=y ++CONFIG_GFS2_FS=m ++CONFIG_GFS2_FS_LOCKING_DLM=y ++CONFIG_BTRFS_FS=m ++CONFIG_BTRFS_FS_POSIX_ACL=y ++CONFIG_QUOTA=y ++CONFIG_QUOTA_NETLINK_INTERFACE=y ++# CONFIG_PRINT_QUOTA_WARNING is not set ++CONFIG_QFMT_V2=y ++CONFIG_AUTOFS4_FS=m ++CONFIG_FUSE_FS=y ++CONFIG_CUSE=m ++CONFIG_FSCACHE=m ++CONFIG_FSCACHE_STATS=y ++CONFIG_CACHEFILES=m ++CONFIG_ISO9660_FS=m ++CONFIG_JOLIET=y ++CONFIG_ZISOFS=y ++CONFIG_UDF_FS=m ++CONFIG_MSDOS_FS=m ++CONFIG_VFAT_FS=m ++CONFIG_FAT_DEFAULT_IOCHARSET="ascii" ++CONFIG_PROC_KCORE=y ++CONFIG_TMPFS=y ++CONFIG_TMPFS_POSIX_ACL=y ++CONFIG_HUGETLBFS=y ++CONFIG_CONFIGFS_FS=m ++CONFIG_ECRYPT_FS=m ++CONFIG_CRAMFS=m ++CONFIG_SQUASHFS=m ++CONFIG_NFS_FS=m ++CONFIG_NFS_V3_ACL=y ++CONFIG_NFS_V4=m ++CONFIG_NFS_V4_1=y ++CONFIG_NFS_FSCACHE=y ++CONFIG_NFSD=m ++CONFIG_NFSD_V3_ACL=y ++CONFIG_NFSD_V4=y ++CONFIG_CIFS=m ++CONFIG_CIFS_STATS=y ++CONFIG_CIFS_WEAK_PW_HASH=y ++CONFIG_CIFS_UPCALL=y ++CONFIG_CIFS_XATTR=y ++CONFIG_CIFS_POSIX=y ++CONFIG_CIFS_DFS_UPCALL=y ++CONFIG_CIFS_FSCACHE=y ++CONFIG_NLS=y ++CONFIG_NLS_DEFAULT="utf8" ++CONFIG_NLS_CODEPAGE_437=y ++CONFIG_NLS_CODEPAGE_737=m ++CONFIG_NLS_CODEPAGE_775=m ++CONFIG_NLS_CODEPAGE_850=m ++CONFIG_NLS_CODEPAGE_852=m ++CONFIG_NLS_CODEPAGE_855=m ++CONFIG_NLS_CODEPAGE_857=m ++CONFIG_NLS_CODEPAGE_860=m ++CONFIG_NLS_CODEPAGE_861=m ++CONFIG_NLS_CODEPAGE_862=m ++CONFIG_NLS_CODEPAGE_863=m ++CONFIG_NLS_CODEPAGE_864=m ++CONFIG_NLS_CODEPAGE_865=m ++CONFIG_NLS_CODEPAGE_866=m ++CONFIG_NLS_CODEPAGE_869=m ++CONFIG_NLS_CODEPAGE_936=m ++CONFIG_NLS_CODEPAGE_950=m ++CONFIG_NLS_CODEPAGE_932=m ++CONFIG_NLS_CODEPAGE_949=m ++CONFIG_NLS_CODEPAGE_874=m ++CONFIG_NLS_ISO8859_8=m ++CONFIG_NLS_CODEPAGE_1250=m ++CONFIG_NLS_CODEPAGE_1251=m ++CONFIG_NLS_ASCII=y ++CONFIG_NLS_ISO8859_1=m ++CONFIG_NLS_ISO8859_2=m ++CONFIG_NLS_ISO8859_3=m ++CONFIG_NLS_ISO8859_4=m ++CONFIG_NLS_ISO8859_5=m ++CONFIG_NLS_ISO8859_6=m ++CONFIG_NLS_ISO8859_7=m ++CONFIG_NLS_ISO8859_9=m ++CONFIG_NLS_ISO8859_13=m ++CONFIG_NLS_ISO8859_14=m ++CONFIG_NLS_ISO8859_15=m ++CONFIG_NLS_KOI8_R=m ++CONFIG_NLS_KOI8_U=m ++CONFIG_NLS_UTF8=m ++CONFIG_DLM=m ++CONFIG_DLM_DEBUG=y ++CONFIG_DYNAMIC_DEBUG=y ++CONFIG_DEBUG_INFO=y ++CONFIG_DEBUG_INFO_REDUCED=y ++# CONFIG_ENABLE_WARN_DEPRECATED is not set ++CONFIG_FRAME_WARN=2048 ++CONFIG_STRIP_ASM_SYMS=y ++CONFIG_DEBUG_FS=y ++CONFIG_HEADERS_CHECK=y ++# CONFIG_FRAME_POINTER is not set ++CONFIG_DEBUG_FORCE_WEAK_PER_CPU=y ++CONFIG_MAGIC_SYSRQ=y ++CONFIG_DEBUG_VM=y ++CONFIG_DEBUG_MEMORY_INIT=y ++CONFIG_DEBUG_STACKOVERFLOW=y ++CONFIG_LOCKUP_DETECTOR=y ++CONFIG_SCHEDSTATS=y ++CONFIG_TIMER_STATS=y ++CONFIG_DEBUG_LIST=y ++CONFIG_DEBUG_CREDENTIALS=y ++CONFIG_RCU_CPU_STALL_TIMEOUT=60 ++CONFIG_ASYNC_RAID6_TEST=m ++CONFIG_SECURITY=y ++CONFIG_SECURITYFS=y ++CONFIG_SECURITY_NETWORK=y ++CONFIG_SECURITY_NETWORK_XFRM=y ++CONFIG_SECURITY_SELINUX=y ++CONFIG_SECURITY_SELINUX_BOOTPARAM=y ++CONFIG_SECURITY_SELINUX_DISABLE=y ++CONFIG_CRYPTO_PCRYPT=m ++CONFIG_CRYPTO_CRYPTD=m ++CONFIG_CRYPTO_TEST=m ++CONFIG_CRYPTO_CCM=m ++CONFIG_CRYPTO_GCM=m ++CONFIG_CRYPTO_CTS=m ++CONFIG_CRYPTO_LRW=m ++CONFIG_CRYPTO_PCBC=m ++CONFIG_CRYPTO_XTS=m ++CONFIG_CRYPTO_HMAC=y ++CONFIG_CRYPTO_XCBC=m ++CONFIG_CRYPTO_VMAC=m ++CONFIG_CRYPTO_MICHAEL_MIC=m ++CONFIG_CRYPTO_RMD128=m ++CONFIG_CRYPTO_RMD160=m ++CONFIG_CRYPTO_RMD256=m ++CONFIG_CRYPTO_RMD320=m ++CONFIG_CRYPTO_SHA1=y ++CONFIG_CRYPTO_SHA512=m ++CONFIG_CRYPTO_TGR192=m ++CONFIG_CRYPTO_WP512=m ++CONFIG_CRYPTO_ANUBIS=m ++CONFIG_CRYPTO_BLOWFISH=m ++CONFIG_CRYPTO_CAMELLIA=m ++CONFIG_CRYPTO_CAST5=m ++CONFIG_CRYPTO_CAST6=m ++CONFIG_CRYPTO_FCRYPT=m ++CONFIG_CRYPTO_KHAZAD=m ++CONFIG_CRYPTO_SEED=m ++CONFIG_CRYPTO_SERPENT=m ++CONFIG_CRYPTO_TEA=m ++CONFIG_CRYPTO_TWOFISH=m ++CONFIG_CRYPTO_LZO=m ++CONFIG_CRC_CCITT=m ++CONFIG_CRC7=m +diff --git a/arch/x86/configs/i386_defconfig b/arch/x86/configs/i386_defconfig +index 0eb9f92f3717..e5890ae917e5 100644 +--- a/arch/x86/configs/i386_defconfig ++++ b/arch/x86/configs/i386_defconfig +@@ -41,7 +41,7 @@ CONFIG_SMP=y + CONFIG_X86_GENERIC=y + CONFIG_HPET_TIMER=y + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_X86_REBOOTFIXUPS=y +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..4368ba4f7967 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -40,7 +40,7 @@ CONFIG_SMP=y + CONFIG_CALGARY_IOMMU=y + CONFIG_NR_CPUS=64 + CONFIG_SCHED_SMT=y +-CONFIG_PREEMPT_VOLUNTARY=y ++CONFIG_PREEMPT=y + CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y + CONFIG_X86_MCE=y + CONFIG_MICROCODE=y +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index cd1655122ec0..9cf10230d5fb 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -1,7 +1,7 @@ + + choice + prompt "Preemption Model" +- default PREEMPT_NONE ++ default PREEMPT + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" +@@ -17,7 +17,7 @@ config PREEMPT_NONE + latencies. + + config PREEMPT_VOLUNTARY +- bool "Voluntary Kernel Preemption (Desktop)" ++ bool "Voluntary Kernel Preemption (Nothing)" + depends on !ARCH_NO_PREEMPT + help + This option reduces the latency of the kernel by adding more +@@ -32,7 +32,8 @@ config PREEMPT_VOLUNTARY + applications to run more 'smoothly' even when the system is + under load. + +- Select this if you are building a kernel for a desktop system. ++ Select this for no system in particular (choose Preemptible ++ instead on a desktop if you know what's good for you). + + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" +@@ -57,4 +58,4 @@ config PREEMPT + endchoice + + config PREEMPT_COUNT +- bool +\ No newline at end of file ++ bool +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch new file mode 100644 index 00000000..0aaae235 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch @@ -0,0 +1,48 @@ +From d67d0504370871bea9e73c69c840fb3d0a88d9cb Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Fri, 12 May 2017 13:07:37 +1000 +Subject: [PATCH 04/16] Expose vmsplit for our poor 32 bit users. + +--- + arch/x86/Kconfig | 12 ++++++------ + 1 file changed, 6 insertions(+), 6 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 88bfccabcf3b..cfa268364ec7 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1495,7 +1495,7 @@ config HIGHMEM64G + endchoice + + choice +- prompt "Memory split" if EXPERT ++ prompt "Memory split" + default VMSPLIT_3G + depends on X86_32 + ---help--- +@@ -1515,17 +1515,17 @@ choice + option alone! + + config VMSPLIT_3G +- bool "3G/1G user/kernel split" ++ bool "Default 896MB lowmem (3G/1G user/kernel split)" + config VMSPLIT_3G_OPT + depends on !X86_PAE +- bool "3G/1G user/kernel split (for full 1G low memory)" ++ bool "1GB lowmem (3G/1G user/kernel split)" + config VMSPLIT_2G +- bool "2G/2G user/kernel split" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_2G_OPT + depends on !X86_PAE +- bool "2G/2G user/kernel split (for full 2G low memory)" ++ bool "2GB lowmem (2G/2G user/kernel split)" + config VMSPLIT_1G +- bool "1G/3G user/kernel split" ++ bool "3GB lowmem (1G/3G user/kernel split)" + endchoice + + config PAGE_OFFSET +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch new file mode 100644 index 00000000..500dd852 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch @@ -0,0 +1,153 @@ +From 552f25751a108c7e185b82aa3110d43bfe1e59b1 Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Sat, 12 Aug 2017 11:53:39 +1000 +Subject: [PATCH 05/16] Create highres timeout variants of schedule_timeout + functions. + +--- + include/linux/freezer.h | 1 + + include/linux/sched.h | 31 ++++++++++++++++-- + kernel/time/hrtimer.c | 71 +++++++++++++++++++++++++++++++++++++++++ + 3 files changed, 101 insertions(+), 2 deletions(-) + +diff --git a/include/linux/freezer.h b/include/linux/freezer.h +index 21f5aa0b217f..ee9b46394fdf 100644 +--- a/include/linux/freezer.h ++++ b/include/linux/freezer.h +@@ -297,6 +297,7 @@ static inline void set_freezable(void) {} + #define wait_event_freezekillable_unsafe(wq, condition) \ + wait_event_killable(wq, condition) + ++#define pm_freezing (false) + #endif /* !CONFIG_FREEZER */ + + #endif /* FREEZER_H_INCLUDED */ +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1f8cea1436b4..1cd022304c64 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -211,13 +211,40 @@ struct task_group; + + extern void scheduler_tick(void); + +-#define MAX_SCHEDULE_TIMEOUT LONG_MAX +- ++#define MAX_SCHEDULE_TIMEOUT LONG_MAX + extern long schedule_timeout(long timeout); + extern long schedule_timeout_interruptible(long timeout); + extern long schedule_timeout_killable(long timeout); + extern long schedule_timeout_uninterruptible(long timeout); + extern long schedule_timeout_idle(long timeout); ++ ++#ifdef CONFIG_HIGH_RES_TIMERS ++extern long schedule_msec_hrtimeout(long timeout); ++extern long schedule_min_hrtimeout(void); ++extern long schedule_msec_hrtimeout_interruptible(long timeout); ++extern long schedule_msec_hrtimeout_uninterruptible(long timeout); ++#else ++static inline long schedule_msec_hrtimeout(long timeout) ++{ ++ return schedule_timeout(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_min_hrtimeout(void) ++{ ++ return schedule_timeout(1); ++} ++ ++static inline long schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ return schedule_timeout_interruptible(msecs_to_jiffies(timeout)); ++} ++ ++static inline long schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ return schedule_timeout_uninterruptible(msecs_to_jiffies(timeout)); ++} ++#endif ++ + asmlinkage void schedule(void); + extern void schedule_preempt_disabled(void); + +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index e1a549c9e399..12735724cce4 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2026,3 +2026,74 @@ int __sched schedule_hrtimeout(ktime_t *expires, + return schedule_hrtimeout_range(expires, 0, mode); + } + EXPORT_SYMBOL_GPL(schedule_hrtimeout); ++ ++/* ++ * As per schedule_hrtimeout but taskes a millisecond value and returns how ++ * many milliseconds are left. ++ */ ++long __sched schedule_msec_hrtimeout(long timeout) ++{ ++ struct hrtimer_sleeper t; ++ int delta, secs, jiffs; ++ ktime_t expires; ++ ++ if (!timeout) { ++ __set_current_state(TASK_RUNNING); ++ return 0; ++ } ++ ++ jiffs = msecs_to_jiffies(timeout); ++ /* ++ * If regular timer resolution is adequate or hrtimer resolution is not ++ * (yet) better than Hz, as would occur during startup, use regular ++ * timers. ++ */ ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ return schedule_timeout(jiffs); ++ ++ secs = timeout / 1000; ++ delta = (timeout % 1000) * NSEC_PER_MSEC; ++ expires = ktime_set(secs, delta); ++ ++ hrtimer_init_on_stack(&t.timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ hrtimer_set_expires_range_ns(&t.timer, expires, delta); ++ ++ hrtimer_init_sleeper(&t, current); ++ ++ hrtimer_start_expires(&t.timer, HRTIMER_MODE_REL); ++ ++ if (likely(t.task)) ++ schedule(); ++ ++ hrtimer_cancel(&t.timer); ++ destroy_hrtimer_on_stack(&t.timer); ++ ++ __set_current_state(TASK_RUNNING); ++ ++ expires = hrtimer_expires_remaining(&t.timer); ++ timeout = ktime_to_ms(expires); ++ return timeout < 0 ? 0 : timeout; ++} ++ ++EXPORT_SYMBOL(schedule_msec_hrtimeout); ++ ++long __sched schedule_min_hrtimeout(void) ++{ ++ return schedule_msec_hrtimeout(1); ++} ++ ++EXPORT_SYMBOL(schedule_min_hrtimeout); ++ ++long __sched schedule_msec_hrtimeout_interruptible(long timeout) ++{ ++ __set_current_state(TASK_INTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_interruptible); ++ ++long __sched schedule_msec_hrtimeout_uninterruptible(long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_msec_hrtimeout(timeout); ++} ++EXPORT_SYMBOL(schedule_msec_hrtimeout_uninterruptible); +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch new file mode 100644 index 00000000..aa348475 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch @@ -0,0 +1,49 @@ +From 65ea992f1da66b8c0a5554776d1350417b9107cb Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Sat, 5 Nov 2016 09:27:36 +1100 +Subject: [PATCH 06/16] Special case calls of schedule_timeout(1) to use the + min hrtimeout of 1ms, working around low Hz resolutions. + +--- + kernel/time/timer.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 981eaddff95b..ae942d459aa2 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1801,6 +1801,18 @@ signed long __sched schedule_timeout(signed long timeout) + + expire = timeout + jiffies; + ++#ifdef CONFIG_HIGH_RES_TIMERS ++ if (timeout == 1 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ /* ++ * Special case 1 as being a request for the minimum timeout ++ * and use highres timers to timeout after 1ms to workaround ++ * the granularity of low Hz tick timers. ++ */ ++ if (!schedule_min_hrtimeout()) ++ return 0; ++ goto out_timeout; ++ } ++#endif + timer.task = current; + timer_setup_on_stack(&timer.timer, process_timeout, 0); + __mod_timer(&timer.timer, expire, 0); +@@ -1809,10 +1821,10 @@ signed long __sched schedule_timeout(signed long timeout) + + /* Remove the timer from the object tracker */ + destroy_timer_on_stack(&timer.timer); +- ++out_timeout: + timeout = expire - jiffies; + +- out: ++out: + return timeout < 0 ? 0 : timeout; + } + EXPORT_SYMBOL(schedule_timeout); +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch new file mode 100644 index 00000000..e9edcd12 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0007-Convert-msleep-to-use-hrtimers-when-active.patch @@ -0,0 +1,54 @@ +From 7b74daf29a88f3314af306509bd40d45c34f11c7 Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Fri, 4 Nov 2016 09:25:54 +1100 +Subject: [PATCH 07/16] Convert msleep to use hrtimers when active. + +--- + kernel/time/timer.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index ae942d459aa2..542c13d98950 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1965,7 +1965,19 @@ void __init init_timers(void) + */ + void msleep(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ /* ++ * Use high resolution timers where the resolution of tick based ++ * timers is inadequate. ++ */ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ while (msecs) ++ msecs = schedule_msec_hrtimeout_uninterruptible(msecs); ++ return; ++ } ++ timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -1979,7 +1991,15 @@ EXPORT_SYMBOL(msleep); + */ + unsigned long msleep_interruptible(unsigned int msecs) + { +- unsigned long timeout = msecs_to_jiffies(msecs) + 1; ++ int jiffs = msecs_to_jiffies(msecs); ++ unsigned long timeout; ++ ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ while (msecs && !signal_pending(current)) ++ msecs = schedule_msec_hrtimeout_interruptible(msecs); ++ return msecs; ++ } ++ timeout = msecs_to_jiffies(msecs) + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch new file mode 100644 index 00000000..f4b441d8 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch @@ -0,0 +1,1009 @@ +From 8a679ba5279cbff1a8e4c47b55ac4bd6d66289f8 Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Mon, 20 Feb 2017 13:28:30 +1100 +Subject: [PATCH 08/16] Replace all schedule timeout(1) with + schedule_min_hrtimeout() + +--- + drivers/block/swim.c | 6 +- + drivers/bluetooth/hci_qca.c | 2 +- + drivers/char/ipmi/ipmi_msghandler.c | 2 +- + drivers/char/ipmi/ipmi_ssif.c | 2 +- + drivers/char/snsc.c | 4 +- + drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c | 2 +- + drivers/gpu/drm/vmwgfx/vmwgfx_irq.c | 2 +- + drivers/media/pci/ivtv/ivtv-ioctl.c | 2 +- + drivers/media/pci/ivtv/ivtv-streams.c | 2 +- + drivers/mfd/ucb1x00-core.c | 2 +- + drivers/misc/sgi-xp/xpc_channel.c | 2 +- + drivers/net/caif/caif_hsi.c | 2 +- + drivers/net/can/usb/peak_usb/pcan_usb.c | 2 +- + drivers/net/usb/lan78xx.c | 2 +- + drivers/net/usb/usbnet.c | 2 +- + drivers/scsi/fnic/fnic_scsi.c | 4 +- + drivers/scsi/snic/snic_scsi.c | 2 +- + .../staging/comedi/drivers/ni_mio_common.c | 2 +- + drivers/staging/lustre/lnet/lnet/lib-eq.c | 426 ++++++++++++++++++ + drivers/staging/rts5208/rtsx.c | 2 +- + drivers/staging/speakup/speakup_acntpc.c | 4 +- + drivers/staging/speakup/speakup_apollo.c | 2 +- + drivers/staging/speakup/speakup_decext.c | 2 +- + drivers/staging/speakup/speakup_decpc.c | 2 +- + drivers/staging/speakup/speakup_dectlk.c | 2 +- + drivers/staging/speakup/speakup_dtlk.c | 4 +- + drivers/staging/speakup/speakup_keypc.c | 4 +- + drivers/staging/speakup/synth.c | 14 +- + .../staging/unisys/visornic/visornic_main.c | 6 +- + drivers/video/fbdev/omap/hwa742.c | 2 +- + drivers/video/fbdev/pxafb.c | 2 +- + fs/btrfs/extent-tree.c | 2 +- + fs/btrfs/inode-map.c | 2 +- + sound/usb/line6/pcm.c | 2 +- + 34 files changed, 471 insertions(+), 51 deletions(-) + create mode 100644 drivers/staging/lustre/lnet/lnet/lib-eq.c + +diff --git a/drivers/block/swim.c b/drivers/block/swim.c +index 0e31884a9519..16fcfbde31d5 100644 +--- a/drivers/block/swim.c ++++ b/drivers/block/swim.c +@@ -332,7 +332,7 @@ static inline void swim_motor(struct swim __iomem *base, + if (swim_readbit(base, MOTOR_ON)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + } else if (action == OFF) { + swim_action(base, MOTOR_OFF); +@@ -351,7 +351,7 @@ static inline void swim_eject(struct swim __iomem *base) + if (!swim_readbit(base, DISK_IN)) + break; + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + swim_select(base, RELAX); + } +@@ -375,7 +375,7 @@ static inline int swim_step(struct swim __iomem *base) + for (wait = 0; wait < HZ; wait++) { + + current->state = TASK_INTERRUPTIBLE; +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + swim_select(base, RELAX); + if (!swim_readbit(base, STEP)) +diff --git a/drivers/bluetooth/hci_qca.c b/drivers/bluetooth/hci_qca.c +index 2fee65886d50..4ca0bae3df58 100644 +--- a/drivers/bluetooth/hci_qca.c ++++ b/drivers/bluetooth/hci_qca.c +@@ -980,7 +980,7 @@ static int qca_set_baudrate(struct hci_dev *hdev, uint8_t baudrate) + * then host can communicate with new baudrate to controller + */ + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(BAUDRATE_SETTLE_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((BAUDRATE_SETTLE_TIMEOUT_MS)); + set_current_state(TASK_RUNNING); + + if (qcadev->btsoc_type == QCA_WCN3990) +diff --git a/drivers/char/ipmi/ipmi_msghandler.c b/drivers/char/ipmi/ipmi_msghandler.c +index 7fc9612070a1..5a7f8a879001 100644 +--- a/drivers/char/ipmi/ipmi_msghandler.c ++++ b/drivers/char/ipmi/ipmi_msghandler.c +@@ -3453,7 +3453,7 @@ static void cleanup_smi_msgs(struct ipmi_smi *intf) + /* Current message first, to preserve order */ + while (intf->curr_msg && !list_empty(&intf->waiting_rcv_msgs)) { + /* Wait for the message to clear out. */ +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + /* No need for locks, the interface is down. */ +diff --git a/drivers/char/ipmi/ipmi_ssif.c b/drivers/char/ipmi/ipmi_ssif.c +index 29e67a80fb20..73bd0eca5fe5 100644 +--- a/drivers/char/ipmi/ipmi_ssif.c ++++ b/drivers/char/ipmi/ipmi_ssif.c +@@ -1208,7 +1208,7 @@ static void shutdown_ssif(void *send_info) + + /* make sure the driver is not looking for flags any more. */ + while (ssif_info->ssif_state != SSIF_NORMAL) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + + ssif_info->stopping = true; + del_timer_sync(&ssif_info->retry_timer); +diff --git a/drivers/char/snsc.c b/drivers/char/snsc.c +index 5918ea7499bb..5228e78df804 100644 +--- a/drivers/char/snsc.c ++++ b/drivers/char/snsc.c +@@ -198,7 +198,7 @@ scdrv_read(struct file *file, char __user *buf, size_t count, loff_t *f_pos) + add_wait_queue(&sd->sd_rq, &wait); + spin_unlock_irqrestore(&sd->sd_rlock, flags); + +- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); + + remove_wait_queue(&sd->sd_rq, &wait); + if (signal_pending(current)) { +@@ -294,7 +294,7 @@ scdrv_write(struct file *file, const char __user *buf, + add_wait_queue(&sd->sd_wq, &wait); + spin_unlock_irqrestore(&sd->sd_wlock, flags); + +- schedule_timeout(msecs_to_jiffies(SCDRV_TIMEOUT)); ++ schedule_msec_hrtimeout((SCDRV_TIMEOUT)); + + remove_wait_queue(&sd->sd_wq, &wait); + if (signal_pending(current)) { +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +index d0fd147ef75f..730ae4fe6b85 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_fifo.c +@@ -235,7 +235,7 @@ static int vmw_fifo_wait_noirq(struct vmw_private *dev_priv, + DRM_ERROR("SVGA device lockup.\n"); + break; + } +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + if (interruptible && signal_pending(current)) { + ret = -ERESTARTSYS; + break; +diff --git a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +index c3ad4478266b..7e2a29d56459 100644 +--- a/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c ++++ b/drivers/gpu/drm/vmwgfx/vmwgfx_irq.c +@@ -202,7 +202,7 @@ int vmw_fallback_wait(struct vmw_private *dev_priv, + break; + } + if (lazy) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + else if ((++count & 0x0F) == 0) { + /** + * FIXME: Use schedule_hr_timeout here for +diff --git a/drivers/media/pci/ivtv/ivtv-ioctl.c b/drivers/media/pci/ivtv/ivtv-ioctl.c +index 4cdc6d2be85d..22c0803cbff3 100644 +--- a/drivers/media/pci/ivtv/ivtv-ioctl.c ++++ b/drivers/media/pci/ivtv/ivtv-ioctl.c +@@ -1154,7 +1154,7 @@ void ivtv_s_std_dec(struct ivtv *itv, v4l2_std_id std) + TASK_UNINTERRUPTIBLE); + if ((read_reg(IVTV_REG_DEC_LINE_FIELD) >> 16) < 100) + break; +- schedule_timeout(msecs_to_jiffies(25)); ++ schedule_msec_hrtimeout((25)); + } + finish_wait(&itv->vsync_waitq, &wait); + mutex_lock(&itv->serialize_lock); +diff --git a/drivers/media/pci/ivtv/ivtv-streams.c b/drivers/media/pci/ivtv/ivtv-streams.c +index d27c6df97566..e9ffc4eeb478 100644 +--- a/drivers/media/pci/ivtv/ivtv-streams.c ++++ b/drivers/media/pci/ivtv/ivtv-streams.c +@@ -834,7 +834,7 @@ int ivtv_stop_v4l2_encode_stream(struct ivtv_stream *s, int gop_end) + while (!test_bit(IVTV_F_I_EOS, &itv->i_flags) && + time_before(jiffies, + then + msecs_to_jiffies(2000))) { +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + } + + /* To convert jiffies to ms, we must multiply by 1000 +diff --git a/drivers/mfd/ucb1x00-core.c b/drivers/mfd/ucb1x00-core.c +index d6fb2e1a759a..7ac951b84beb 100644 +--- a/drivers/mfd/ucb1x00-core.c ++++ b/drivers/mfd/ucb1x00-core.c +@@ -253,7 +253,7 @@ unsigned int ucb1x00_adc_read(struct ucb1x00 *ucb, int adc_channel, int sync) + break; + /* yield to other processes */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + return UCB_ADC_DAT(val); +diff --git a/drivers/misc/sgi-xp/xpc_channel.c b/drivers/misc/sgi-xp/xpc_channel.c +index 05a890ce2ab8..f6eb97bc3a2c 100644 +--- a/drivers/misc/sgi-xp/xpc_channel.c ++++ b/drivers/misc/sgi-xp/xpc_channel.c +@@ -834,7 +834,7 @@ xpc_allocate_msg_wait(struct xpc_channel *ch) + + atomic_inc(&ch->n_on_msg_allocate_wq); + prepare_to_wait(&ch->msg_allocate_wq, &wait, TASK_INTERRUPTIBLE); +- ret = schedule_timeout(1); ++ ret = schedule_min_hrtimeout(); + finish_wait(&ch->msg_allocate_wq, &wait); + atomic_dec(&ch->n_on_msg_allocate_wq); + +diff --git a/drivers/net/caif/caif_hsi.c b/drivers/net/caif/caif_hsi.c +index 433a14b9f731..4d197a99472b 100644 +--- a/drivers/net/caif/caif_hsi.c ++++ b/drivers/net/caif/caif_hsi.c +@@ -939,7 +939,7 @@ static void cfhsi_wake_down(struct work_struct *work) + break; + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + retry--; + } + +diff --git a/drivers/net/can/usb/peak_usb/pcan_usb.c b/drivers/net/can/usb/peak_usb/pcan_usb.c +index 13238a72a338..fc51ae55c63f 100644 +--- a/drivers/net/can/usb/peak_usb/pcan_usb.c ++++ b/drivers/net/can/usb/peak_usb/pcan_usb.c +@@ -250,7 +250,7 @@ static int pcan_usb_write_mode(struct peak_usb_device *dev, u8 onoff) + } else { + /* the PCAN-USB needs time to init */ + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(PCAN_USB_STARTUP_TIMEOUT)); ++ schedule_msec_hrtimeout((PCAN_USB_STARTUP_TIMEOUT)); + } + + return err; +diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c +index c3c9ba44e2a1..1bc66289699f 100644 +--- a/drivers/net/usb/lan78xx.c ++++ b/drivers/net/usb/lan78xx.c +@@ -2681,7 +2681,7 @@ static void lan78xx_terminate_urbs(struct lan78xx_net *dev) + while (!skb_queue_empty(&dev->rxq) && + !skb_queue_empty(&dev->txq) && + !skb_queue_empty(&dev->done)) { +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + netif_dbg(dev, ifdown, dev->net, + "waited for %d urb completions\n", temp); +diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c +index 770aa624147f..9384de186bf9 100644 +--- a/drivers/net/usb/usbnet.c ++++ b/drivers/net/usb/usbnet.c +@@ -770,7 +770,7 @@ static void wait_skb_queue_empty(struct sk_buff_head *q) + spin_lock_irqsave(&q->lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(&q->lock, flags); +- schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ++ schedule_msec_hrtimeout((UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); + spin_lock_irqsave(&q->lock, flags); + } +diff --git a/drivers/scsi/fnic/fnic_scsi.c b/drivers/scsi/fnic/fnic_scsi.c +index 8cbd3c9f0b4c..7e3f9baa4ac6 100644 +--- a/drivers/scsi/fnic/fnic_scsi.c ++++ b/drivers/scsi/fnic/fnic_scsi.c +@@ -217,7 +217,7 @@ int fnic_fw_reset_handler(struct fnic *fnic) + + /* wait for io cmpl */ + while (atomic_read(&fnic->in_flight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + spin_lock_irqsave(&fnic->wq_copy_lock[0], flags); + +@@ -2255,7 +2255,7 @@ static int fnic_clean_pending_aborts(struct fnic *fnic, + } + } + +- schedule_timeout(msecs_to_jiffies(2 * fnic->config.ed_tov)); ++ schedule_msec_hrtimeout((2 * fnic->config.ed_tov)); + + /* walk again to check, if IOs are still pending in fw */ + if (fnic_is_abts_pending(fnic, lr_sc)) +diff --git a/drivers/scsi/snic/snic_scsi.c b/drivers/scsi/snic/snic_scsi.c +index d9b2e46424aa..4a313a0f2039 100644 +--- a/drivers/scsi/snic/snic_scsi.c ++++ b/drivers/scsi/snic/snic_scsi.c +@@ -2354,7 +2354,7 @@ snic_reset(struct Scsi_Host *shost, struct scsi_cmnd *sc) + + /* Wait for all the IOs that are entered in Qcmd */ + while (atomic_read(&snic->ios_inflight)) +- schedule_timeout(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout((1)); + + ret = snic_issue_hba_reset(snic, sc); + if (ret) { +diff --git a/drivers/staging/comedi/drivers/ni_mio_common.c b/drivers/staging/comedi/drivers/ni_mio_common.c +index 4dee2fc37aed..2bb1c1157636 100644 +--- a/drivers/staging/comedi/drivers/ni_mio_common.c ++++ b/drivers/staging/comedi/drivers/ni_mio_common.c +@@ -4650,7 +4650,7 @@ static int cs5529_wait_for_idle(struct comedi_device *dev) + if ((status & NI67XX_CAL_STATUS_BUSY) == 0) + break; + set_current_state(TASK_INTERRUPTIBLE); +- if (schedule_timeout(1)) ++ if (schedule_min_hrtimeout()) + return -EIO; + } + if (i == timeout) { +diff --git a/drivers/staging/lustre/lnet/lnet/lib-eq.c b/drivers/staging/lustre/lnet/lnet/lib-eq.c +new file mode 100644 +index 000000000000..8cca151741b2 +--- /dev/null ++++ b/drivers/staging/lustre/lnet/lnet/lib-eq.c +@@ -0,0 +1,426 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * GPL HEADER START ++ * ++ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 only, ++ * as published by the Free Software Foundation. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License version 2 for more details (a copy is included ++ * in the LICENSE file that accompanied this code). ++ * ++ * You should have received a copy of the GNU General Public License ++ * version 2 along with this program; If not, see ++ * http://www.gnu.org/licenses/gpl-2.0.html ++ * ++ * GPL HEADER END ++ */ ++/* ++ * Copyright (c) 2003, 2010, Oracle and/or its affiliates. All rights reserved. ++ * Use is subject to license terms. ++ * ++ * Copyright (c) 2012, Intel Corporation. ++ */ ++/* ++ * This file is part of Lustre, http://www.lustre.org/ ++ * Lustre is a trademark of Sun Microsystems, Inc. ++ * ++ * lnet/lnet/lib-eq.c ++ * ++ * Library level Event queue management routines ++ */ ++ ++#define DEBUG_SUBSYSTEM S_LNET ++ ++#include <linux/lnet/lib-lnet.h> ++ ++/** ++ * Create an event queue that has room for \a count number of events. ++ * ++ * The event queue is circular and older events will be overwritten by new ++ * ones if they are not removed in time by the user using the functions ++ * LNetEQGet(), LNetEQWait(), or LNetEQPoll(). It is up to the user to ++ * determine the appropriate size of the event queue to prevent this loss ++ * of events. Note that when EQ handler is specified in \a callback, no ++ * event loss can happen, since the handler is run for each event deposited ++ * into the EQ. ++ * ++ * \param count The number of events to be stored in the event queue. It ++ * will be rounded up to the next power of two. ++ * \param callback A handler function that runs when an event is deposited ++ * into the EQ. The constant value LNET_EQ_HANDLER_NONE can be used to ++ * indicate that no event handler is desired. ++ * \param handle On successful return, this location will hold a handle for ++ * the newly created EQ. ++ * ++ * \retval 0 On success. ++ * \retval -EINVAL If an parameter is not valid. ++ * \retval -ENOMEM If memory for the EQ can't be allocated. ++ * ++ * \see lnet_eq_handler_t for the discussion on EQ handler semantics. ++ */ ++int ++LNetEQAlloc(unsigned int count, lnet_eq_handler_t callback, ++ struct lnet_handle_eq *handle) ++{ ++ struct lnet_eq *eq; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ /* ++ * We need count to be a power of 2 so that when eq_{enq,deq}_seq ++ * overflow, they don't skip entries, so the queue has the same ++ * apparent capacity at all times ++ */ ++ if (count) ++ count = roundup_pow_of_two(count); ++ ++ if (callback != LNET_EQ_HANDLER_NONE && count) ++ CWARN("EQ callback is guaranteed to get every event, do you still want to set eqcount %d for polling event which will have locking overhead? Please contact with developer to confirm\n", count); ++ ++ /* ++ * count can be 0 if only need callback, we can eliminate ++ * overhead of enqueue event ++ */ ++ if (!count && callback == LNET_EQ_HANDLER_NONE) ++ return -EINVAL; ++ ++ eq = kzalloc(sizeof(*eq), GFP_NOFS); ++ if (!eq) ++ return -ENOMEM; ++ ++ if (count) { ++ eq->eq_events = kvmalloc_array(count, sizeof(struct lnet_event), ++ GFP_KERNEL | __GFP_ZERO); ++ if (!eq->eq_events) ++ goto failed; ++ /* ++ * NB allocator has set all event sequence numbers to 0, ++ * so all them should be earlier than eq_deq_seq ++ */ ++ } ++ ++ eq->eq_deq_seq = 1; ++ eq->eq_enq_seq = 1; ++ eq->eq_size = count; ++ eq->eq_callback = callback; ++ ++ eq->eq_refs = cfs_percpt_alloc(lnet_cpt_table(), ++ sizeof(*eq->eq_refs[0])); ++ if (!eq->eq_refs) ++ goto failed; ++ ++ /* MUST hold both exclusive lnet_res_lock */ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ lnet_res_lh_initialize(&the_lnet.ln_eq_container, &eq->eq_lh); ++ list_add(&eq->eq_list, &the_lnet.ln_eq_container.rec_active); ++ ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ lnet_eq2handle(handle, eq); ++ return 0; ++ ++failed: ++ kvfree(eq->eq_events); ++ ++ if (eq->eq_refs) ++ cfs_percpt_free(eq->eq_refs); ++ ++ kfree(eq); ++ return -ENOMEM; ++} ++EXPORT_SYMBOL(LNetEQAlloc); ++ ++/** ++ * Release the resources associated with an event queue if it's idle; ++ * otherwise do nothing and it's up to the user to try again. ++ * ++ * \param eqh A handle for the event queue to be released. ++ * ++ * \retval 0 If the EQ is not in use and freed. ++ * \retval -ENOENT If \a eqh does not point to a valid EQ. ++ * \retval -EBUSY If the EQ is still in use by some MDs. ++ */ ++int ++LNetEQFree(struct lnet_handle_eq eqh) ++{ ++ struct lnet_eq *eq; ++ struct lnet_event *events = NULL; ++ int **refs = NULL; ++ int *ref; ++ int rc = 0; ++ int size = 0; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ lnet_res_lock(LNET_LOCK_EX); ++ /* ++ * NB: hold lnet_eq_wait_lock for EQ link/unlink, so we can do ++ * both EQ lookup and poll event with only lnet_eq_wait_lock ++ */ ++ lnet_eq_wait_lock(); ++ ++ eq = lnet_handle2eq(&eqh); ++ if (!eq) { ++ rc = -ENOENT; ++ goto out; ++ } ++ ++ cfs_percpt_for_each(ref, i, eq->eq_refs) { ++ LASSERT(*ref >= 0); ++ if (!*ref) ++ continue; ++ ++ CDEBUG(D_NET, "Event equeue (%d: %d) busy on destroy.\n", ++ i, *ref); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ /* stash for free after lock dropped */ ++ events = eq->eq_events; ++ size = eq->eq_size; ++ refs = eq->eq_refs; ++ ++ lnet_res_lh_invalidate(&eq->eq_lh); ++ list_del(&eq->eq_list); ++ kfree(eq); ++ out: ++ lnet_eq_wait_unlock(); ++ lnet_res_unlock(LNET_LOCK_EX); ++ ++ kvfree(events); ++ if (refs) ++ cfs_percpt_free(refs); ++ ++ return rc; ++} ++EXPORT_SYMBOL(LNetEQFree); ++ ++void ++lnet_eq_enqueue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ /* MUST called with resource lock hold but w/o lnet_eq_wait_lock */ ++ int index; ++ ++ if (!eq->eq_size) { ++ LASSERT(eq->eq_callback != LNET_EQ_HANDLER_NONE); ++ eq->eq_callback(ev); ++ return; ++ } ++ ++ lnet_eq_wait_lock(); ++ ev->sequence = eq->eq_enq_seq++; ++ ++ LASSERT(eq->eq_size == LOWEST_BIT_SET(eq->eq_size)); ++ index = ev->sequence & (eq->eq_size - 1); ++ ++ eq->eq_events[index] = *ev; ++ ++ if (eq->eq_callback != LNET_EQ_HANDLER_NONE) ++ eq->eq_callback(ev); ++ ++ /* Wake anyone waiting in LNetEQPoll() */ ++ if (waitqueue_active(&the_lnet.ln_eq_waitq)) ++ wake_up_all(&the_lnet.ln_eq_waitq); ++ lnet_eq_wait_unlock(); ++} ++ ++static int ++lnet_eq_dequeue_event(struct lnet_eq *eq, struct lnet_event *ev) ++{ ++ int new_index = eq->eq_deq_seq & (eq->eq_size - 1); ++ struct lnet_event *new_event = &eq->eq_events[new_index]; ++ int rc; ++ ++ /* must called with lnet_eq_wait_lock hold */ ++ if (LNET_SEQ_GT(eq->eq_deq_seq, new_event->sequence)) ++ return 0; ++ ++ /* We've got a new event... */ ++ *ev = *new_event; ++ ++ CDEBUG(D_INFO, "event: %p, sequence: %lu, eq->size: %u\n", ++ new_event, eq->eq_deq_seq, eq->eq_size); ++ ++ /* ...but did it overwrite an event we've not seen yet? */ ++ if (eq->eq_deq_seq == new_event->sequence) { ++ rc = 1; ++ } else { ++ /* ++ * don't complain with CERROR: some EQs are sized small ++ * anyway; if it's important, the caller should complain ++ */ ++ CDEBUG(D_NET, "Event Queue Overflow: eq seq %lu ev seq %lu\n", ++ eq->eq_deq_seq, new_event->sequence); ++ rc = -EOVERFLOW; ++ } ++ ++ eq->eq_deq_seq = new_event->sequence + 1; ++ return rc; ++} ++ ++/** ++ * A nonblocking function that can be used to get the next event in an EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. The event is removed from the queue. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 0 No pending event in the EQ. ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++/** ++ * Block the calling process until there is an event in the EQ. ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully. This function returns the next event ++ * in the EQ and removes it from the EQ. ++ * ++ * \param eventq A handle for the event queue. ++ * \param event On successful return (1 or -EOVERFLOW), this location will ++ * hold the next event in the EQ. ++ * ++ * \retval 1 Indicates success. ++ * \retval -ENOENT If \a eventq does not point to a valid EQ. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ has been dropped due to limited space in the EQ. ++ */ ++ ++static int ++lnet_eq_wait_locked(int *timeout_ms, long state) ++__must_hold(&the_lnet.ln_eq_wait_lock) ++{ ++ int tms = *timeout_ms; ++ int wait; ++ wait_queue_entry_t wl; ++ unsigned long now; ++ ++ if (!tms) ++ return -ENXIO; /* don't want to wait and no new event */ ++ ++ init_waitqueue_entry(&wl, current); ++ set_current_state(state); ++ add_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ lnet_eq_wait_unlock(); ++ ++ if (tms < 0) { ++ schedule(); ++ } else { ++ now = jiffies; ++ schedule_msec_hrtimeout((tms)); ++ tms -= jiffies_to_msecs(jiffies - now); ++ if (tms < 0) /* no more wait but may have new event */ ++ tms = 0; ++ } ++ ++ wait = tms; /* might need to call here again */ ++ *timeout_ms = tms; ++ ++ lnet_eq_wait_lock(); ++ remove_wait_queue(&the_lnet.ln_eq_waitq, &wl); ++ ++ return wait; ++} ++ ++/** ++ * Block the calling process until there's an event from a set of EQs or ++ * timeout happens. ++ * ++ * If an event handler is associated with the EQ, the handler will run before ++ * this function returns successfully, in which case the corresponding event ++ * is consumed. ++ * ++ * LNetEQPoll() provides a timeout to allow applications to poll, block for a ++ * fixed period, or block indefinitely. ++ * ++ * \param eventqs,neq An array of EQ handles, and size of the array. ++ * \param timeout_ms Time in milliseconds to wait for an event to occur on ++ * one of the EQs. The constant LNET_TIME_FOREVER can be used to indicate an ++ * infinite timeout. ++ * \param interruptible, if true, use TASK_INTERRUPTIBLE, else TASK_NOLOAD ++ * \param event,which On successful return (1 or -EOVERFLOW), \a event will ++ * hold the next event in the EQs, and \a which will contain the index of the ++ * EQ from which the event was taken. ++ * ++ * \retval 0 No pending event in the EQs after timeout. ++ * \retval 1 Indicates success. ++ * \retval -EOVERFLOW Indicates success (i.e., an event is returned) and that ++ * at least one event between this event and the last event obtained from the ++ * EQ indicated by \a which has been dropped due to limited space in the EQ. ++ * \retval -ENOENT If there's an invalid handle in \a eventqs. ++ */ ++int ++LNetEQPoll(struct lnet_handle_eq *eventqs, int neq, int timeout_ms, ++ int interruptible, ++ struct lnet_event *event, int *which) ++{ ++ int wait = 1; ++ int rc; ++ int i; ++ ++ LASSERT(the_lnet.ln_refcount > 0); ++ ++ if (neq < 1) ++ return -ENOENT; ++ ++ lnet_eq_wait_lock(); ++ ++ for (;;) { ++ for (i = 0; i < neq; i++) { ++ struct lnet_eq *eq = lnet_handle2eq(&eventqs[i]); ++ ++ if (!eq) { ++ lnet_eq_wait_unlock(); ++ return -ENOENT; ++ } ++ ++ rc = lnet_eq_dequeue_event(eq, event); ++ if (rc) { ++ lnet_eq_wait_unlock(); ++ *which = i; ++ return rc; ++ } ++ } ++ ++ if (!wait) ++ break; ++ ++ /* ++ * return value of lnet_eq_wait_locked: ++ * -1 : did nothing and it's sure no new event ++ * 1 : sleep inside and wait until new event ++ * 0 : don't want to wait anymore, but might have new event ++ * so need to call dequeue again ++ */ ++ wait = lnet_eq_wait_locked(&timeout_ms, ++ interruptible ? TASK_INTERRUPTIBLE ++ : TASK_NOLOAD); ++ if (wait < 0) /* no new event */ ++ break; ++ } ++ ++ lnet_eq_wait_unlock(); ++ return 0; ++} +diff --git a/drivers/staging/rts5208/rtsx.c b/drivers/staging/rts5208/rtsx.c +index 69e6abe14abf..7d23e214ac21 100644 +--- a/drivers/staging/rts5208/rtsx.c ++++ b/drivers/staging/rts5208/rtsx.c +@@ -507,7 +507,7 @@ static int rtsx_polling_thread(void *__dev) + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(POLLING_INTERVAL)); ++ schedule_msec_hrtimeout((POLLING_INTERVAL)); + + /* lock the device pointers */ + mutex_lock(&dev->dev_mutex); +diff --git a/drivers/staging/speakup/speakup_acntpc.c b/drivers/staging/speakup/speakup_acntpc.c +index 28519754b2f0..a96805bbec5c 100644 +--- a/drivers/staging/speakup/speakup_acntpc.c ++++ b/drivers/staging/speakup/speakup_acntpc.c +@@ -198,7 +198,7 @@ static void do_catch_up(struct spk_synth *synth) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -226,7 +226,7 @@ static void do_catch_up(struct spk_synth *synth) + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_apollo.c b/drivers/staging/speakup/speakup_apollo.c +index 0877b4044c28..627102d048c1 100644 +--- a/drivers/staging/speakup/speakup_apollo.c ++++ b/drivers/staging/speakup/speakup_apollo.c +@@ -165,7 +165,7 @@ static void do_catch_up(struct spk_synth *synth) + if (!synth->io_ops->synth_out(synth, ch)) { + synth->io_ops->tiocmset(0, UART_MCR_RTS); + synth->io_ops->tiocmset(UART_MCR_RTS, 0); +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +diff --git a/drivers/staging/speakup/speakup_decext.c b/drivers/staging/speakup/speakup_decext.c +index 3741c0fcf5bb..bff857b4aa5f 100644 +--- a/drivers/staging/speakup/speakup_decext.c ++++ b/drivers/staging/speakup/speakup_decext.c +@@ -176,7 +176,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full() || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_decpc.c b/drivers/staging/speakup/speakup_decpc.c +index 6649309e0342..c60e4712d817 100644 +--- a/drivers/staging/speakup/speakup_decpc.c ++++ b/drivers/staging/speakup/speakup_decpc.c +@@ -394,7 +394,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (dt_sendchar(ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dectlk.c b/drivers/staging/speakup/speakup_dectlk.c +index a144f28ee1a8..c34764fafe2b 100644 +--- a/drivers/staging/speakup/speakup_dectlk.c ++++ b/drivers/staging/speakup/speakup_dectlk.c +@@ -244,7 +244,7 @@ static void do_catch_up(struct spk_synth *synth) + if (ch == '\n') + ch = 0x0D; + if (synth_full_val || !synth->io_ops->synth_out(synth, ch)) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + continue; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/speakup/speakup_dtlk.c b/drivers/staging/speakup/speakup_dtlk.c +index dbebed0eeeec..6d83c13ca4a6 100644 +--- a/drivers/staging/speakup/speakup_dtlk.c ++++ b/drivers/staging/speakup/speakup_dtlk.c +@@ -211,7 +211,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -227,7 +227,7 @@ static void do_catch_up(struct spk_synth *synth) + delay_time_val = delay_time->u.n.value; + jiffy_delta_val = jiffy_delta->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies + jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/speakup_keypc.c b/drivers/staging/speakup/speakup_keypc.c +index 3901734982a4..4e8a7a98b46d 100644 +--- a/drivers/staging/speakup/speakup_keypc.c ++++ b/drivers/staging/speakup/speakup_keypc.c +@@ -199,7 +199,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags); + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth_full()) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout((full_time_val)); + continue; + } + set_current_state(TASK_RUNNING); +@@ -232,7 +232,7 @@ spin_lock_irqsave(&speakup_info.spinlock, flags); + jiffy_delta_val = jiffy_delta->u.n.value; + delay_time_val = delay_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); +- schedule_timeout(msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout((delay_time_val)); + jiff_max = jiffies+jiffy_delta_val; + } + } +diff --git a/drivers/staging/speakup/synth.c b/drivers/staging/speakup/synth.c +index 25f259ee4ffc..b9721103e651 100644 +--- a/drivers/staging/speakup/synth.c ++++ b/drivers/staging/speakup/synth.c +@@ -93,12 +93,8 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (ch == '\n') + ch = synth->procspeech; +- if (unicode) +- ret = synth->io_ops->synth_out_unicode(synth, ch); +- else +- ret = synth->io_ops->synth_out(synth, ch); +- if (!ret) { +- schedule_timeout(msecs_to_jiffies(full_time_val)); ++ if (!synth->io_ops->synth_out(synth, ch)) { ++ schedule_msec_hrtimeout(full_time_val); + continue; + } + if (time_after_eq(jiffies, jiff_max) && (ch == SPACE)) { +@@ -108,11 +104,9 @@ static void _spk_do_catch_up(struct spk_synth *synth, int unicode) + full_time_val = full_time->u.n.value; + spin_unlock_irqrestore(&speakup_info.spinlock, flags); + if (synth->io_ops->synth_out(synth, synth->procspeech)) +- schedule_timeout( +- msecs_to_jiffies(delay_time_val)); ++ schedule_msec_hrtimeout(delay_time_val); + else +- schedule_timeout( +- msecs_to_jiffies(full_time_val)); ++ schedule_msec_hrtimeout(full_time_val); + jiff_max = jiffies + jiffy_delta_val; + } + set_current_state(TASK_RUNNING); +diff --git a/drivers/staging/unisys/visornic/visornic_main.c b/drivers/staging/unisys/visornic/visornic_main.c +index 3647b8f1ed28..9fb26ccc2b3b 100644 +--- a/drivers/staging/unisys/visornic/visornic_main.c ++++ b/drivers/staging/unisys/visornic/visornic_main.c +@@ -549,7 +549,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +@@ -560,7 +560,7 @@ static int visornic_disable_with_timeout(struct net_device *netdev, + while (1) { + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- schedule_timeout(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + if (atomic_read(&devdata->usage)) + break; +@@ -714,7 +714,7 @@ static int visornic_enable_with_timeout(struct net_device *netdev, + } + set_current_state(TASK_INTERRUPTIBLE); + spin_unlock_irqrestore(&devdata->priv_lock, flags); +- wait += schedule_timeout(msecs_to_jiffies(10)); ++ wait += schedule_msec_hrtimeout((10)); + spin_lock_irqsave(&devdata->priv_lock, flags); + } + +diff --git a/drivers/video/fbdev/omap/hwa742.c b/drivers/video/fbdev/omap/hwa742.c +index 6199d4806193..7c7165f2dad4 100644 +--- a/drivers/video/fbdev/omap/hwa742.c ++++ b/drivers/video/fbdev/omap/hwa742.c +@@ -926,7 +926,7 @@ static void hwa742_resume(void) + if (hwa742_read_reg(HWA742_PLL_DIV_REG) & (1 << 7)) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(5)); ++ schedule_msec_hrtimeout((5)); + } + hwa742_set_update_mode(hwa742.update_mode_before_suspend); + } +diff --git a/drivers/video/fbdev/pxafb.c b/drivers/video/fbdev/pxafb.c +index bbed039617a4..681ae041ea77 100644 +--- a/drivers/video/fbdev/pxafb.c ++++ b/drivers/video/fbdev/pxafb.c +@@ -1287,7 +1287,7 @@ static int pxafb_smart_thread(void *arg) + mutex_unlock(&fbi->ctrlr_lock); + + set_current_state(TASK_INTERRUPTIBLE); +- schedule_timeout(msecs_to_jiffies(30)); ++ schedule_msec_hrtimeout((30)); + } + + pr_debug("%s(): task ending\n", __func__); +diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c +index 2d9074295d7f..7df3e60e4e89 100644 +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -5905,7 +5905,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes) + flush = BTRFS_RESERVE_FLUSH_LIMIT; + + if (btrfs_transaction_in_commit(fs_info)) +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } + + if (delalloc_lock) +diff --git a/fs/btrfs/inode-map.c b/fs/btrfs/inode-map.c +index ffca2abf13d0..89b2a7f7397e 100644 +--- a/fs/btrfs/inode-map.c ++++ b/fs/btrfs/inode-map.c +@@ -75,7 +75,7 @@ static int caching_kthread(void *data) + btrfs_release_path(path); + root->ino_cache_progress = last; + up_read(&fs_info->commit_root_sem); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + goto again; + } else + continue; +diff --git a/sound/usb/line6/pcm.c b/sound/usb/line6/pcm.c +index 72c6f8e82a7e..46d8c2a148ad 100644 +--- a/sound/usb/line6/pcm.c ++++ b/sound/usb/line6/pcm.c +@@ -131,7 +131,7 @@ static void line6_wait_clear_audio_urbs(struct snd_line6_pcm *line6pcm, + if (!alive) + break; + set_current_state(TASK_UNINTERRUPTIBLE); +- schedule_timeout(1); ++ schedule_min_hrtimeout(); + } while (--timeout > 0); + if (alive) + dev_err(line6pcm->line6->ifcdev, +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch new file mode 100644 index 00000000..0ddf57af --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch @@ -0,0 +1,311 @@ +From cd03bffabeee4f4c8438969b3b4d184d0d0bb81b Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Mon, 20 Feb 2017 13:30:07 +1100 +Subject: [PATCH 09/16] Replace all calls to schedule_timeout_interruptible of + potentially under 50ms to use schedule_msec_hrtimeout_interruptible. + +--- + drivers/hwmon/fam15h_power.c | 2 +- + drivers/iio/light/tsl2563.c | 6 +----- + drivers/media/i2c/msp3400-driver.c | 4 ++-- + drivers/media/pci/ivtv/ivtv-gpio.c | 6 +++--- + drivers/media/radio/radio-mr800.c | 2 +- + drivers/media/radio/radio-tea5777.c | 2 +- + drivers/media/radio/tea575x.c | 2 +- + drivers/parport/ieee1284.c | 2 +- + drivers/parport/ieee1284_ops.c | 2 +- + drivers/platform/x86/intel_ips.c | 8 ++++---- + net/core/pktgen.c | 2 +- + sound/soc/codecs/wm8350.c | 12 ++++++------ + sound/soc/codecs/wm8900.c | 2 +- + sound/soc/codecs/wm9713.c | 4 ++-- + 14 files changed, 26 insertions(+), 30 deletions(-) + +diff --git a/drivers/hwmon/fam15h_power.c b/drivers/hwmon/fam15h_power.c +index 9545a346044f..c24cf1302ec7 100644 +--- a/drivers/hwmon/fam15h_power.c ++++ b/drivers/hwmon/fam15h_power.c +@@ -237,7 +237,7 @@ static ssize_t power1_average_show(struct device *dev, + prev_ptsc[cu] = data->cpu_sw_pwr_ptsc[cu]; + } + +- leftover = schedule_timeout_interruptible(msecs_to_jiffies(data->power_period)); ++ leftover = schedule_msec_hrtimeout_interruptible((data->power_period)); + if (leftover) + return 0; + +diff --git a/drivers/iio/light/tsl2563.c b/drivers/iio/light/tsl2563.c +index 6bbb0b1e6032..f4b83648c405 100644 +--- a/drivers/iio/light/tsl2563.c ++++ b/drivers/iio/light/tsl2563.c +@@ -282,11 +282,7 @@ static void tsl2563_wait_adc(struct tsl2563_chip *chip) + default: + delay = 402; + } +- /* +- * TODO: Make sure that we wait at least required delay but why we +- * have to extend it one tick more? +- */ +- schedule_timeout_interruptible(msecs_to_jiffies(delay) + 2); ++ schedule_msec_hrtimeout_interruptible(delay + 1); + } + + static int tsl2563_adjust_gainlevel(struct tsl2563_chip *chip, u16 adc) +diff --git a/drivers/media/i2c/msp3400-driver.c b/drivers/media/i2c/msp3400-driver.c +index 3db966db83eb..f0fab7676f72 100644 +--- a/drivers/media/i2c/msp3400-driver.c ++++ b/drivers/media/i2c/msp3400-driver.c +@@ -179,7 +179,7 @@ static int msp_read(struct i2c_client *client, int dev, int addr) + break; + dev_warn(&client->dev, "I/O error #%d (read 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +@@ -220,7 +220,7 @@ static int msp_write(struct i2c_client *client, int dev, int addr, int val) + break; + dev_warn(&client->dev, "I/O error #%d (write 0x%02x/0x%02x)\n", err, + dev, addr); +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + if (err == 3) { + dev_warn(&client->dev, "resetting chip, sound will go off.\n"); +diff --git a/drivers/media/pci/ivtv/ivtv-gpio.c b/drivers/media/pci/ivtv/ivtv-gpio.c +index f752f3993687..23372af61ebf 100644 +--- a/drivers/media/pci/ivtv/ivtv-gpio.c ++++ b/drivers/media/pci/ivtv/ivtv-gpio.c +@@ -117,7 +117,7 @@ void ivtv_reset_ir_gpio(struct ivtv *itv) + curout = (curout & ~0xF) | 1; + write_reg(curout, IVTV_REG_GPIO_OUT); + /* We could use something else for smaller time */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + curout |= 2; + write_reg(curout, IVTV_REG_GPIO_OUT); + curdir &= ~0x80; +@@ -137,11 +137,11 @@ int ivtv_reset_tuner_gpio(void *dev, int component, int cmd, int value) + curout = read_reg(IVTV_REG_GPIO_OUT); + curout &= ~(1 << itv->card->xceive_pin); + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + + curout |= 1 << itv->card->xceive_pin; + write_reg(curout, IVTV_REG_GPIO_OUT); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible((1)); + return 0; + } + +diff --git a/drivers/media/radio/radio-mr800.c b/drivers/media/radio/radio-mr800.c +index 0f292c6ba338..9d7f22fe1ca8 100644 +--- a/drivers/media/radio/radio-mr800.c ++++ b/drivers/media/radio/radio-mr800.c +@@ -378,7 +378,7 @@ static int vidioc_s_hw_freq_seek(struct file *file, void *priv, + retval = -ENODATA; + break; + } +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + retval = -ERESTARTSYS; + break; + } +diff --git a/drivers/media/radio/radio-tea5777.c b/drivers/media/radio/radio-tea5777.c +index 04ed1a5d1177..d593d28dc286 100644 +--- a/drivers/media/radio/radio-tea5777.c ++++ b/drivers/media/radio/radio-tea5777.c +@@ -245,7 +245,7 @@ static int radio_tea5777_update_read_reg(struct radio_tea5777 *tea, int wait) + } + + if (wait) { +- if (schedule_timeout_interruptible(msecs_to_jiffies(wait))) ++ if (schedule_msec_hrtimeout_interruptible((wait))) + return -ERESTARTSYS; + } + +diff --git a/drivers/media/radio/tea575x.c b/drivers/media/radio/tea575x.c +index 7412fe1b10c6..92dce75e6ce9 100644 +--- a/drivers/media/radio/tea575x.c ++++ b/drivers/media/radio/tea575x.c +@@ -416,7 +416,7 @@ int snd_tea575x_s_hw_freq_seek(struct file *file, struct snd_tea575x *tea, + for (;;) { + if (time_after(jiffies, timeout)) + break; +- if (schedule_timeout_interruptible(msecs_to_jiffies(10))) { ++ if (schedule_msec_hrtimeout_interruptible((10))) { + /* some signal arrived, stop search */ + tea->val &= ~TEA575X_BIT_SEARCH; + snd_tea575x_set_freq(tea); +diff --git a/drivers/parport/ieee1284.c b/drivers/parport/ieee1284.c +index f12b9da69255..6ca6eecbdb2d 100644 +--- a/drivers/parport/ieee1284.c ++++ b/drivers/parport/ieee1284.c +@@ -208,7 +208,7 @@ int parport_wait_peripheral(struct parport *port, + /* parport_wait_event didn't time out, but the + * peripheral wasn't actually ready either. + * Wait for another 10ms. */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + } + } + +diff --git a/drivers/parport/ieee1284_ops.c b/drivers/parport/ieee1284_ops.c +index 5d41dda6da4e..34705f6b423f 100644 +--- a/drivers/parport/ieee1284_ops.c ++++ b/drivers/parport/ieee1284_ops.c +@@ -537,7 +537,7 @@ size_t parport_ieee1284_ecp_read_data (struct parport *port, + /* Yield the port for a while. */ + if (count && dev->port->irq != PARPORT_IRQ_NONE) { + parport_release (dev); +- schedule_timeout_interruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_interruptible((40)); + parport_claim_or_block (dev); + } + else +diff --git a/drivers/platform/x86/intel_ips.c b/drivers/platform/x86/intel_ips.c +index c5ece7ef08c6..9256fb502545 100644 +--- a/drivers/platform/x86/intel_ips.c ++++ b/drivers/platform/x86/intel_ips.c +@@ -809,7 +809,7 @@ static int ips_adjust(void *data) + ips_gpu_lower(ips); + + sleep: +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_ADJUST_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_ADJUST_PERIOD)); + } while (!kthread_should_stop()); + + dev_dbg(ips->dev, "ips-adjust thread stopped\n"); +@@ -985,7 +985,7 @@ static int ips_monitor(void *data) + seqno_timestamp = get_jiffies_64(); + + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + + /* Collect an initial average */ + for (i = 0; i < IPS_SAMPLE_COUNT; i++) { +@@ -1012,7 +1012,7 @@ static int ips_monitor(void *data) + mchp_samples[i] = mchp; + } + +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + if (kthread_should_stop()) + break; + } +@@ -1039,7 +1039,7 @@ static int ips_monitor(void *data) + * us to reduce the sample frequency if the CPU and GPU are idle. + */ + old_cpu_power = thm_readl(THM_CEC); +- schedule_timeout_interruptible(msecs_to_jiffies(IPS_SAMPLE_PERIOD)); ++ schedule_msec_hrtimeout_interruptible((IPS_SAMPLE_PERIOD)); + last_sample_period = IPS_SAMPLE_PERIOD; + + timer_setup(&ips->timer, monitor_timeout, TIMER_DEFERRABLE); +diff --git a/net/core/pktgen.c b/net/core/pktgen.c +index 7f6938405fa1..369ad3eca2a3 100644 +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -1900,7 +1900,7 @@ static void pktgen_mark_device(const struct pktgen_net *pn, const char *ifname) + mutex_unlock(&pktgen_thread_lock); + pr_debug("%s: waiting for %s to disappear....\n", + __func__, ifname); +- schedule_timeout_interruptible(msecs_to_jiffies(msec_per_try)); ++ schedule_msec_hrtimeout_interruptible((msec_per_try)); + mutex_lock(&pktgen_thread_lock); + + if (++i >= max_tries) { +diff --git a/sound/soc/codecs/wm8350.c b/sound/soc/codecs/wm8350.c +index e92ebe52d485..88791ebb6df0 100644 +--- a/sound/soc/codecs/wm8350.c ++++ b/sound/soc/codecs/wm8350.c +@@ -236,10 +236,10 @@ static void wm8350_pga_work(struct work_struct *work) + out2->ramp == WM8350_RAMP_UP) { + /* delay is longer over 0dB as increases are larger */ + if (i >= WM8350_OUTn_0dB) +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (2)); + else +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (1)); + } else + udelay(50); /* doesn't matter if we delay longer */ +@@ -1123,7 +1123,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + (platform->dis_out4 << 6)); + + /* wait for discharge */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + cap_discharge_msecs)); + +@@ -1139,7 +1139,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + WM8350_VBUFEN); + + /* wait for vmid */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_charge_msecs)); + +@@ -1190,7 +1190,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + wm8350_reg_write(wm8350, WM8350_POWER_MGMT_1, pm1); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform-> + vmid_discharge_msecs)); + +@@ -1208,7 +1208,7 @@ static int wm8350_set_bias_level(struct snd_soc_component *component, + pm1 | WM8350_OUTPUT_DRAIN_EN); + + /* wait */ +- schedule_timeout_interruptible(msecs_to_jiffies ++ schedule_msec_hrtimeout_interruptible( + (platform->drain_msecs)); + + pm1 &= ~WM8350_BIASEN; +diff --git a/sound/soc/codecs/wm8900.c b/sound/soc/codecs/wm8900.c +index 1a14e902949d..68f17d9877ec 100644 +--- a/sound/soc/codecs/wm8900.c ++++ b/sound/soc/codecs/wm8900.c +@@ -1112,7 +1112,7 @@ static int wm8900_set_bias_level(struct snd_soc_component *component, + /* Need to let things settle before stopping the clock + * to ensure that restart works, see "Stopping the + * master clock" in the datasheet. */ +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_write(component, WM8900_REG_POWER2, + WM8900_REG_POWER2_SYSCLK_ENA); + break; +diff --git a/sound/soc/codecs/wm9713.c b/sound/soc/codecs/wm9713.c +index 643863bb32e0..fc318d71a8a3 100644 +--- a/sound/soc/codecs/wm9713.c ++++ b/sound/soc/codecs/wm9713.c +@@ -203,7 +203,7 @@ static int wm9713_voice_shutdown(struct snd_soc_dapm_widget *w, + + /* Gracefully shut down the voice interface. */ + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0200); +- schedule_timeout_interruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_interruptible(1); + snd_soc_component_update_bits(component, AC97_HANDSET_RATE, 0x0f00, 0x0f00); + snd_soc_component_update_bits(component, AC97_EXTENDED_MID, 0x1000, 0x1000); + +@@ -872,7 +872,7 @@ static int wm9713_set_pll(struct snd_soc_component *component, + wm9713->pll_in = freq_in; + + /* wait 10ms AC97 link frames for the link to stabilise */ +- schedule_timeout_interruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_interruptible((10)); + return 0; + } + +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch new file mode 100644 index 00000000..3eb7c889 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch @@ -0,0 +1,160 @@ +From ba3f464ce9dd28a8999f56b327b458f869258a1a Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Mon, 20 Feb 2017 13:30:32 +1100 +Subject: [PATCH 10/16] Replace all calls to schedule_timeout_uninterruptible + of potentially under 50ms to use schedule_msec_hrtimeout_uninterruptible + +--- + drivers/media/pci/cx18/cx18-gpio.c | 4 ++-- + drivers/net/wireless/intel/ipw2x00/ipw2100.c | 4 ++-- + drivers/rtc/rtc-wm8350.c | 6 +++--- + drivers/scsi/lpfc/lpfc_scsi.c | 2 +- + sound/pci/maestro3.c | 4 ++-- + sound/soc/codecs/rt5631.c | 4 ++-- + sound/soc/soc-dapm.c | 2 +- + 7 files changed, 13 insertions(+), 13 deletions(-) + +diff --git a/drivers/media/pci/cx18/cx18-gpio.c b/drivers/media/pci/cx18/cx18-gpio.c +index 012859e6dc7b..206bd08265a5 100644 +--- a/drivers/media/pci/cx18/cx18-gpio.c ++++ b/drivers/media/pci/cx18/cx18-gpio.c +@@ -90,11 +90,11 @@ static void gpio_reset_seq(struct cx18 *cx, u32 active_lo, u32 active_hi, + + /* Assert */ + gpio_update(cx, mask, ~active_lo); +- schedule_timeout_uninterruptible(msecs_to_jiffies(assert_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((assert_msecs)); + + /* Deassert */ + gpio_update(cx, mask, ~active_hi); +- schedule_timeout_uninterruptible(msecs_to_jiffies(recovery_msecs)); ++ schedule_msec_hrtimeout_uninterruptible((recovery_msecs)); + } + + /* +diff --git a/drivers/net/wireless/intel/ipw2x00/ipw2100.c b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +index 910db46db6a1..497b01ab32d4 100644 +--- a/drivers/net/wireless/intel/ipw2x00/ipw2100.c ++++ b/drivers/net/wireless/intel/ipw2x00/ipw2100.c +@@ -830,7 +830,7 @@ static int ipw2100_hw_send_command(struct ipw2100_priv *priv, + * doesn't seem to have as many firmware restart cycles... + * + * As a test, we're sticking in a 1/100s delay here */ +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + return 0; + +@@ -1281,7 +1281,7 @@ static int ipw2100_start_adapter(struct ipw2100_priv *priv) + IPW_DEBUG_FW("Waiting for f/w initialization to complete...\n"); + i = 5000; + do { +- schedule_timeout_uninterruptible(msecs_to_jiffies(40)); ++ schedule_msec_hrtimeout_uninterruptible((40)); + /* Todo... wait for sync command ... */ + + read_register(priv->net_dev, IPW_REG_INTA, &inta); +diff --git a/drivers/rtc/rtc-wm8350.c b/drivers/rtc/rtc-wm8350.c +index 483c7993516b..fddbaa475066 100644 +--- a/drivers/rtc/rtc-wm8350.c ++++ b/drivers/rtc/rtc-wm8350.c +@@ -119,7 +119,7 @@ static int wm8350_rtc_settime(struct device *dev, struct rtc_time *tm) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (--retries && !(rtc_ctrl & WM8350_RTC_STS)); + + if (!retries) { +@@ -202,7 +202,7 @@ static int wm8350_rtc_stop_alarm(struct wm8350 *wm8350) + /* Wait until confirmation of stopping */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && !(rtc_ctrl & WM8350_RTC_ALMSTS)); + + if (!(rtc_ctrl & WM8350_RTC_ALMSTS)) +@@ -225,7 +225,7 @@ static int wm8350_rtc_start_alarm(struct wm8350 *wm8350) + /* Wait until confirmation */ + do { + rtc_ctrl = wm8350_reg_read(wm8350, WM8350_RTC_TIME_CONTROL); +- schedule_timeout_uninterruptible(msecs_to_jiffies(1)); ++ schedule_msec_hrtimeout_uninterruptible((1)); + } while (retries-- && rtc_ctrl & WM8350_RTC_ALMSTS); + + if (rtc_ctrl & WM8350_RTC_ALMSTS) +diff --git a/drivers/scsi/lpfc/lpfc_scsi.c b/drivers/scsi/lpfc/lpfc_scsi.c +index 5c7858e735c9..b56a01420918 100644 +--- a/drivers/scsi/lpfc/lpfc_scsi.c ++++ b/drivers/scsi/lpfc/lpfc_scsi.c +@@ -5201,7 +5201,7 @@ lpfc_reset_flush_io_context(struct lpfc_vport *vport, uint16_t tgt_id, + tgt_id, lun_id, context); + later = msecs_to_jiffies(2 * vport->cfg_devloss_tmo * 1000) + jiffies; + while (time_after(later, jiffies) && cnt) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(20)); ++ schedule_msec_hrtimeout_uninterruptible((20)); + cnt = lpfc_sli_sum_iocb(vport, tgt_id, lun_id, context); + } + if (cnt) { +diff --git a/sound/pci/maestro3.c b/sound/pci/maestro3.c +index 62962178a9d7..87e486740da0 100644 +--- a/sound/pci/maestro3.c ++++ b/sound/pci/maestro3.c +@@ -2016,7 +2016,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(0, io + GPIO_DATA); + outw(dir | GPO_PRIMARY_AC97, io + GPIO_DIRECTION); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay1)); ++ schedule_msec_hrtimeout_uninterruptible((delay1)); + + outw(GPO_PRIMARY_AC97, io + GPIO_DATA); + udelay(5); +@@ -2024,7 +2024,7 @@ static void snd_m3_ac97_reset(struct snd_m3 *chip) + outw(IO_SRAM_ENABLE | SERIAL_AC_LINK_ENABLE, io + RING_BUS_CTRL_A); + outw(~0, io + GPIO_MASK); + +- schedule_timeout_uninterruptible(msecs_to_jiffies(delay2)); ++ schedule_msec_hrtimeout_uninterruptible((delay2)); + + if (! snd_m3_try_read_vendor(chip)) + break; +diff --git a/sound/soc/codecs/rt5631.c b/sound/soc/codecs/rt5631.c +index 865f49ac38dd..3c1190dd114f 100644 +--- a/sound/soc/codecs/rt5631.c ++++ b/sound/soc/codecs/rt5631.c +@@ -419,7 +419,7 @@ static void onebit_depop_mute_stage(struct snd_soc_component *component, int ena + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + /* config one-bit depop parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x307f); + snd_soc_component_update_bits(component, RT5631_HP_OUT_VOL, +@@ -529,7 +529,7 @@ static void depop_seq_mute_stage(struct snd_soc_component *component, int enable + hp_zc = snd_soc_component_read32(component, RT5631_INT_ST_IRQ_CTRL_2); + snd_soc_component_write(component, RT5631_INT_ST_IRQ_CTRL_2, hp_zc & 0xf7ff); + if (enable) { +- schedule_timeout_uninterruptible(msecs_to_jiffies(10)); ++ schedule_msec_hrtimeout_uninterruptible((10)); + + /* config depop sequence parameter */ + rt5631_write_index(component, RT5631_SPK_INTL_CTRL, 0x302f); +diff --git a/sound/soc/soc-dapm.c b/sound/soc/soc-dapm.c +index 461d951917c0..b5eb57ba4af4 100644 +--- a/sound/soc/soc-dapm.c ++++ b/sound/soc/soc-dapm.c +@@ -131,7 +131,7 @@ static void dapm_assert_locked(struct snd_soc_dapm_context *dapm) + static void pop_wait(u32 pop_time) + { + if (pop_time) +- schedule_timeout_uninterruptible(msecs_to_jiffies(pop_time)); ++ schedule_msec_hrtimeout_uninterruptible((pop_time)); + } + + static void pop_dbg(struct device *dev, u32 pop_time, const char *fmt, ...) +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch new file mode 100644 index 00000000..680e5fcd --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch @@ -0,0 +1,69 @@ +From befdee72d814b6c302da85af524b15762e72e0cf Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Mon, 20 Feb 2017 13:32:58 +1100 +Subject: [PATCH 11/16] Don't use hrtimer overlay when pm_freezing since some + drivers still don't correctly use freezable timeouts. + +--- + kernel/time/hrtimer.c | 2 +- + kernel/time/timer.c | 9 +++++---- + 2 files changed, 6 insertions(+), 5 deletions(-) + +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index 12735724cce4..32fb7b6d9568 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -2048,7 +2048,7 @@ long __sched schedule_msec_hrtimeout(long timeout) + * (yet) better than Hz, as would occur during startup, use regular + * timers. + */ +- if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ) ++ if (jiffs > 4 || hrtimer_resolution >= NSEC_PER_SEC / HZ || pm_freezing) + return schedule_timeout(jiffs); + + secs = timeout / 1000; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index 542c13d98950..d5d0fa004d2b 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -44,6 +44,7 @@ + #include <linux/sched/debug.h> + #include <linux/slab.h> + #include <linux/compat.h> ++#include <linux/freezer.h> + + #include <linux/uaccess.h> + #include <asm/unistd.h> +@@ -1972,12 +1973,12 @@ void msleep(unsigned int msecs) + * Use high resolution timers where the resolution of tick based + * timers is inadequate. + */ +- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { + while (msecs) + msecs = schedule_msec_hrtimeout_uninterruptible(msecs); + return; + } +- timeout = msecs_to_jiffies(msecs) + 1; ++ timeout = jiffs + 1; + + while (timeout) + timeout = schedule_timeout_uninterruptible(timeout); +@@ -1994,12 +1995,12 @@ unsigned long msleep_interruptible(unsigned int msecs) + int jiffs = msecs_to_jiffies(msecs); + unsigned long timeout; + +- if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ) { ++ if (jiffs < 5 && hrtimer_resolution < NSEC_PER_SEC / HZ && !pm_freezing) { + while (msecs && !signal_pending(current)) + msecs = schedule_msec_hrtimeout_interruptible(msecs); + return msecs; + } +- timeout = msecs_to_jiffies(msecs) + 1; ++ timeout = jiffs + 1; + + while (timeout && !signal_pending(current)) + timeout = schedule_timeout_interruptible(timeout); +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch new file mode 100644 index 00000000..078a4ba0 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch @@ -0,0 +1,67 @@ +From df4136f6de5b3f45c2f4be7a3cc042903e983e0c Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Wed, 7 Dec 2016 21:13:16 +1100 +Subject: [PATCH 13/16] Make threaded IRQs optionally the default which can be + disabled. + +--- + kernel/irq/Kconfig | 17 +++++++++++++++++ + kernel/irq/manage.c | 11 +++++++++++ + 2 files changed, 28 insertions(+) + +diff --git a/kernel/irq/Kconfig b/kernel/irq/Kconfig +index 5f3e2baefca9..de3e5740679b 100644 +--- a/kernel/irq/Kconfig ++++ b/kernel/irq/Kconfig +@@ -107,6 +107,23 @@ config GENERIC_IRQ_RESERVATION_MODE + config IRQ_FORCED_THREADING + bool + ++config FORCE_IRQ_THREADING ++ bool "Make IRQ threading compulsory" ++ depends on IRQ_FORCED_THREADING ++ default n ++ ---help--- ++ ++ Make IRQ threading mandatory for any IRQ handlers that support it ++ instead of being optional and requiring the threadirqs kernel ++ parameter. Instead they can be optionally disabled with the ++ nothreadirqs kernel parameter. ++ ++ Enabling this may make some architectures not boot with runqueue ++ sharing and MuQSS. ++ ++ Enable if you are building for a desktop or low latency system, ++ otherwise say N. ++ + config SPARSE_IRQ + bool "Support sparse irq numbering" if MAY_HAVE_SPARSE_IRQ + ---help--- +diff --git a/kernel/irq/manage.c b/kernel/irq/manage.c +index fb86146037a7..b322b1a0caa0 100644 +--- a/kernel/irq/manage.c ++++ b/kernel/irq/manage.c +@@ -23,9 +23,20 @@ + #include "internals.h" + + #ifdef CONFIG_IRQ_FORCED_THREADING ++#ifdef CONFIG_FORCE_IRQ_THREADING ++__read_mostly bool force_irqthreads = true; ++#else + __read_mostly bool force_irqthreads; ++#endif + EXPORT_SYMBOL_GPL(force_irqthreads); + ++static int __init setup_noforced_irqthreads(char *arg) ++{ ++ force_irqthreads = false; ++ return 0; ++} ++early_param("nothreadirqs", setup_noforced_irqthreads); ++ + static int __init setup_forced_irqthreads(char *arg) + { + force_irqthreads = true; +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch new file mode 100644 index 00000000..33a98653 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch @@ -0,0 +1,81 @@ +From ace3d66508ad4e17da3f579eaf04c5582b8256a2 Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Wed, 7 Dec 2016 21:23:01 +1100 +Subject: [PATCH 14/16] Reinstate default Hz of 100 in combination with MuQSS + and -ck patches. + +--- + kernel/Kconfig.hz | 25 ++++++++++++++++++------- + 1 file changed, 18 insertions(+), 7 deletions(-) + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 2a202a846757..1806fcac8f14 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -4,7 +4,8 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_100 if SCHED_MUQSS ++ default HZ_250_NODEF if !SCHED_MUQSS + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -19,11 +20,18 @@ choice + config HZ_100 + bool "100 HZ" + help ++ 100 Hz is a suitable choice in combination with MuQSS which does ++ not rely on ticks for rescheduling interrupts, and is not Hz limited ++ for timeouts and sleeps from both the kernel and userspace. ++ This allows us to benefit from the lower overhead and higher ++ throughput of fewer timer ticks. ++ ++ Non-MuQSS kernels: + 100 Hz is a typical choice for servers, SMP and NUMA systems + with lots of processors that may show reduced performance if + too many timer interrupts are occurring. + +- config HZ_250 ++ config HZ_250_NODEF + bool "250 HZ" + help + 250 Hz is a good compromise choice allowing server performance +@@ -31,7 +39,10 @@ choice + on SMP and NUMA systems. If you are going to be using NTSC video + or multimedia, selected 300Hz instead. + +- config HZ_300 ++ 250 Hz is the default choice for the mainline scheduler but not ++ advantageous in combination with MuQSS. ++ ++ config HZ_300_NODEF + bool "300 HZ" + help + 300 Hz is a good compromise choice allowing server performance +@@ -39,7 +50,7 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + +- config HZ_1000 ++ config HZ_1000_NODEF + bool "1000 HZ" + help + 1000 Hz is the preferred choice for desktop systems and other +@@ -50,9 +61,9 @@ endchoice + config HZ + int + default 100 if HZ_100 +- default 250 if HZ_250 +- default 300 if HZ_300 +- default 1000 if HZ_1000 ++ default 250 if HZ_250_NODEF ++ default 300 if HZ_300_NODEF ++ default 1000 if HZ_1000_NODEF + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0014-Swap-sucks.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0014-Swap-sucks.patch new file mode 100644 index 00000000..bd68ebec --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0014-Swap-sucks.patch @@ -0,0 +1,25 @@ +From d8f6f203f5bdabdf1a5ddb6bdc9e13fae2b640b9 Mon Sep 17 00:00:00 2001 +From: Con Kolivas <kernel@kolivas.org> +Date: Sat, 12 Aug 2017 12:02:04 +1000 +Subject: [PATCH 15/16] Swap sucks. + +--- + mm/vmscan.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/mm/vmscan.c b/mm/vmscan.c +index c5ef7240cbcb..3f04308b6445 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -159,7 +159,7 @@ struct scan_control { + /* + * From 0 .. 100. Higher means more swappy. + */ +-int vm_swappiness = 60; ++int vm_swappiness = 33; + /* + * The total number of pages which are beyond the high watermark within all + * zones. +-- +2.17.1 + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch new file mode 100644 index 00000000..fbff2e6e --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-0015-unfuck-MuQSS-on-linux-4_19_10+.patch @@ -0,0 +1,14 @@ +diff -Nur a/kernel/sched/MuQSS.c b/kernel/sched/MuQSS.c +--- a/kernel/sched/MuQSS.c 2019-02-09 19:46:07.899912055 +0000 ++++ b/kernel/sched/MuQSS.c 2019-02-09 19:48:03.743622465 +0000 +@@ -1011,6 +1011,10 @@ + #define CPUIDLE_THREAD_BUSY (16) + #define CPUIDLE_DIFF_NODE (32) + ++#ifdef CONFIG_SCHED_SMT ++DEFINE_STATIC_KEY_FALSE(sched_smt_present); ++#endif ++ + /* + * The best idle CPU is chosen according to the CPUIDLE ranking above where the + * lowest value would give the most suitable CPU to schedule p onto next. The diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch new file mode 100644 index 00000000..344a8c4b --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch @@ -0,0 +1,23 @@ +diff -Naur linux-4.16.5/drivers/ata/libata-core.c linux-4.16.5-p/drivers/ata/libata-core.c +--- linux-4.16.5/drivers/ata/libata-core.c 2018-04-26 11:00:39.000000000 +0200 ++++ linux-4.16.5-p/drivers/ata/libata-core.c 2018-04-28 02:19:06.632381413 +0200 +@@ -2201,7 +2201,7 @@ + unsigned int err_mask; + + if (!ata_log_supported(dev, ATA_LOG_NCQ_SEND_RECV)) { +- ata_dev_warn(dev, "NCQ Send/Recv Log not supported\n"); ++ ata_dev_dbg(dev, "NCQ Send/Recv Log not supported\n"); + return; + } + err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_SEND_RECV, +@@ -2230,8 +2230,8 @@ + unsigned int err_mask; + + if (!ata_log_supported(dev, ATA_LOG_NCQ_NON_DATA)) { +- ata_dev_warn(dev, +- "NCQ Send/Recv Log not supported\n"); ++ ata_dev_dbg(dev, ++ "NCQ Non-Data Log not supported\n"); + return; + } + err_mask = ata_read_log_page(dev, ATA_LOG_NCQ_NON_DATA, diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch new file mode 100644 index 00000000..4e810797 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch @@ -0,0 +1,74 @@ + +Triggers on resume .. + +[ 487.909349] WARNING: CPU: 0 PID: 2125 at drivers/net/wireless/ath/ath10k/mac.c:5625 ath10k_bss_info_changed+0xb33/0xd90 [ath10k_core] +[ 487.909350] Modules linked in: ctr ccm cmac af_packet arc4 xt_tcpudp xt_state xt_conntrack nf_conntrack nf_defrag_ipv6 nf_defrag_ipv4 libcrc32c bnep iptable_filter ip_tables x_tables bpfilter nls_utf8 nls_cp437 vfat fat amdgpu chash gpu_sched joydev uvcvideo iTCO_wdt snd_hda_codec_hdmi iTCO_vendor_support videobuf2_vmalloc videobuf2_memops videobuf2_v4l2 snd_hda_codec_conexant intel_wmi_thunderbolt videobuf2_common btusb snd_hda_codec_generic radeon btrtl btbcm btintel ath10k_pci videodev bluetooth ath10k_core media i915 ttm ecdh_generic coretemp ath intel_rapl x86_pkg_temp_thermal rtsx_usb_ms intel_powerclamp mac80211 memstick kvm_intel kvmgt vfio_mdev mdev vfio_iommu_type1 vfio kvm snd_soc_skl snd_soc_skl_ipc snd_soc_sst_ipc snd_soc_sst_dsp snd_hda_ext_core snd_soc_acpi_intel_match snd_soc_acpi +[ 487.909390] cec snd_soc_core rc_core drm_kms_helper efi_pstore irqbypass cfg80211 snd_compress drm snd_pcm_dmaengine psmouse intel_gtt evdev snd_hda_intel mac_hid agpgart i2c_algo_bit snd_hda_codec r8169 pcspkr efivars fb_sys_fops syscopyarea sysfillrect sysimgblt i2c_i801 snd_hda_core hwmon i2c_core libphy snd_hwdep ideapad_laptop sparse_keymap intel_pch_thermal rfkill thermal wmi battery acpi_pad ac video pcc_cpufreq button ppdev sch_fq_codel fuse snd_pcm_oss snd_mixer_oss snd_pcm snd_seq_dummy snd_seq_oss snd_seq_midi_event snd_seq snd_seq_device snd_timer snd soundcore lp parport_pc parport binfmt_misc sg ext4 crc32c_generic crc16 mbcache jbd2 fscrypto rtsx_usb_sdmmc mmc_core rtsx_usb sr_mod sd_mod cdrom crct10dif_pclmul crc32_pclmul crc32c_intel ghash_clmulni_intel pcbc xhci_pci ahci xhci_hcd +[ 487.909436] libahci libata aesni_intel aes_x86_64 crypto_simd usbcore cryptd scsi_mod glue_helper serio_raw dm_mirror dm_region_hash dm_log dm_mod efivarfs unix sha256_mb sha256_ssse3 sha256_generic sha1_mb mcryptd sha1_ssse3 sha1_generic hmac ipv6 autofs4 +[ 487.909454] CPU: 0 PID: 2125 Comm: kworker/u8:21 Not tainted 4.19.2-fw1 #1 +[ 487.909455] Hardware name: LENOVO 80UD/LNVNB161216, BIOS 1TCN26WW(V2.07) 03/29/2018 +[ 487.909459] Workqueue: events_unbound async_run_entry_fn +[ 487.909466] RIP: 0010:ath10k_bss_info_changed+0xb33/0xd90 [ath10k_core] +[ 487.909468] Code: ff ff b8 a1 ff ff ff e9 e9 f7 ff ff b8 a1 ff ff ff e9 a5 f6 ff ff b8 a1 ff ff ff e9 ef f5 ff ff b8 a1 ff ff ff e9 4f f7 ff ff <0f> 0b e9 37 f8 ff ff b8 a1 ff ff ff e9 f6 fa ff ff b8 a1 ff ff ff +[ 487.909469] RSP: 0018:ffff9b9042e47cd8 EFLAGS: 00010282 +[ 487.909471] RAX: 00000000fffffffe RBX: ffff904af2fb1540 RCX: 0000000000000000 +[ 487.909472] RDX: ffff904af2fb18b8 RSI: ffff9b9042e47cf8 RDI: ffff904af4181380 +[ 487.909473] RBP: ffff904af4181380 R08: 0000000000200000 R09: 0000000000000000 +[ 487.909474] R10: 000000000000001f R11: ffff904ae2b4a600 R12: 0000000002000000 +[ 487.909475] R13: ffff904af4181388 R14: ffff904af2fb24d0 R15: ffff904af2fb1540 +[ 487.909477] FS: 0000000000000000(0000) GS:ffff904af7200000(0000) knlGS:0000000000000000 +[ 487.909478] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 487.909479] CR2: 00007fa6a8300510 CR3: 000000002000a002 CR4: 00000000003606f0 +[ 487.909480] Call Trace: +[ 487.909488] ? ath10k_conf_tx+0x12d/0x4c0 [ath10k_core] +[ 487.909503] ieee80211_bss_info_change_notify+0xa9/0x1c0 [mac80211] +[ 487.909521] ieee80211_reconfig+0x9d7/0x14f0 [mac80211] +[ 487.909538] wiphy_resume+0x7e/0x150 [cfg80211] +[ 487.909549] ? wiphy_namespace+0x10/0x10 [cfg80211] +[ 487.909553] dpm_run_callback+0x2e/0x130 +[ 487.909556] device_resume+0x97/0x190 +[ 487.909558] async_resume+0x19/0x40 +[ 487.909561] async_run_entry_fn+0x37/0xe0 +[ 487.909563] process_one_work+0x1e9/0x410 +[ 487.909565] worker_thread+0x2d/0x3d0 +[ 487.909567] ? process_one_work+0x410/0x410 +[ 487.909569] kthread+0x113/0x130 +[ 487.909572] ? kthread_park+0x90/0x90 +[ 487.909575] ret_from_fork+0x35/0x40 +[ 487.909577] ---[ end trace 56a3ea97193bc4c5 ]--- + +Introduced by: + +..... + +commit cd93b83ad927b2c7979e0add0343ace59328b461 +Author: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org> +Date: Wed Jul 25 10:59:39 2018 +0300 + + ath10k: support for multicast rate control + + Issues a wmi command to firmware when multicast rate change is received with the + new BSS_CHANGED_MCAST_RATE flag. Also fixes the incorrect fixed_rate setting + for CCK rates which got introduced with addition of ath10k_rates_rev2 enum. + + Tested on QCA9984 with firmware ver 10.4-3.6-00104 + + Signed-off-by: Pradeep Kumar Chitrapu <pradeepc@codeaurora.org> + Signed-off-by: Kalle Valo <kvalo@codeaurora.org> + +..... + + +diff -Naur linux-4.19.2/drivers/net/wireless/ath/ath10k/mac.c linux-4.19.2-p/drivers/net/wireless/ath/ath10k/mac.c +--- linux-4.19.2/drivers/net/wireless/ath/ath10k/mac.c 2018-11-13 20:09:00.000000000 +0100 ++++ linux-4.19.2-p/drivers/net/wireless/ath/ath10k/mac.c 2018-11-15 01:41:51.896601274 +0100 +@@ -5621,8 +5621,7 @@ + arvif->vdev_id, ret); + } + +- if (changed & BSS_CHANGED_MCAST_RATE && +- !WARN_ON(ath10k_mac_vif_chan(arvif->vif, &def))) { ++ if (changed & BSS_CHANGED_MCAST_RATE && !ath10k_mac_vif_chan(arvif->vif, &def)) { + band = def.chan->band; + rateidx = vif->bss_conf.mcast_rate[band] - 1; + diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch new file mode 100644 index 00000000..039c8fcd --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-bfq-sq-mq-v9r1-2K190204-rc1.patch @@ -0,0 +1,18511 @@ +diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt +index 8d8d8f06cab2..41d0200944f1 100644 +--- a/Documentation/block/bfq-iosched.txt ++++ b/Documentation/block/bfq-iosched.txt +@@ -1,3 +1,6 @@ ++[ THIS TREE CONTAINS ALSO THE DEV VERSION OF BFQ. ++ DETAILS AT THE END OF THIS DOCUMENT. ] ++ + BFQ (Budget Fair Queueing) + ========================== + +@@ -11,6 +14,15 @@ controllers), BFQ's main features are: + groups (switching back to time distribution when needed to keep + throughput high). + ++If bfq-mq patches have been applied, then the following three ++instances of BFQ are available (otherwise only the first instance): ++- bfq: mainline version of BFQ, for blk-mq ++- bfq-mq: development version of BFQ for blk-mq; this version contains ++ also all latest features and fixes not yet landed in mainline, plus many ++ safety checks ++- bfq-sq: BFQ for legacy blk; also this version contains latest features ++ and fixes, as well as safety checks ++ + In its default configuration, BFQ privileges latency over + throughput. So, when needed for achieving a lower latency, BFQ builds + schedules that may lead to a lower throughput. If your main or only +@@ -22,27 +34,42 @@ latency and throughput, or on how to maximize throughput. + + BFQ has a non-null overhead, which limits the maximum IOPS that a CPU + can process for a device scheduled with BFQ. To give an idea of the +-limits on slow or average CPUs, here are, first, the limits of BFQ for +-three different CPUs, on, respectively, an average laptop, an old +-desktop, and a cheap embedded system, in case full hierarchical +-support is enabled (i.e., CONFIG_BFQ_GROUP_IOSCHED is set), but ++limits on slow or average CPUs, here are, first, the limits of bfq-mq ++and bfq for three different CPUs, on, respectively, an average laptop, ++an old desktop, and a cheap embedded system, in case full hierarchical ++support is enabled (i.e., CONFIG_MQ_BFQ_GROUP_IOSCHED is set for ++bfq-mq, or CONFIG_BFQ_GROUP_IOSCHED is set for bfq), but + CONFIG_DEBUG_BLK_CGROUP is not set (Section 4-2): + - Intel i7-4850HQ: 400 KIOPS + - AMD A8-3850: 250 KIOPS + - ARM CortexTM-A53 Octa-core: 80 KIOPS + +-If CONFIG_DEBUG_BLK_CGROUP is set (and of course full hierarchical +-support is enabled), then the sustainable throughput with BFQ +-decreases, because all blkio.bfq* statistics are created and updated +-(Section 4-2). For BFQ, this leads to the following maximum +-sustainable throughputs, on the same systems as above: ++As for bfq-sq, it cannot reach the above IOPS, because of the ++inherent, lower parallelism of legacy blk and of the components within ++it (including bfq-sq itself). In particular, results with ++CONFIG_DEBUG_BLK_CGROUP unset are rather fluctuating. The limits ++reported below for the case CONFIG_DEBUG_BLK_CGROUP set will however ++provide a lower bound to the limits of bfq-sq. ++ ++Turning back to bfq-mq and bfq, If CONFIG_DEBUG_BLK_CGROUP is set (and ++of course full hierarchical support is enabled), then the sustainable ++throughput with bfq-mq and bfq decreases, because all blkio.bfq* ++statistics are created and updated (Section 4-2). For bfq-mq and bfq, ++this leads to the following maximum sustainable throughputs, on the ++same systems as above: + - Intel i7-4850HQ: 310 KIOPS + - AMD A8-3850: 200 KIOPS + - ARM CortexTM-A53 Octa-core: 56 KIOPS + +-BFQ works for multi-queue devices too. ++Finally, if CONFIG_DEBUG_BLK_CGROUP is set (and full hierarchical ++support is enabled), then bfq-sq exhibits the following limits: ++- Intel i7-4850HQ: 250 KIOPS ++- AMD A8-3850: 170 KIOPS ++- ARM CortexTM-A53 Octa-core: 45 KIOPS + +-The table of contents follow. Impatients can just jump to Section 3. ++BFQ works for multi-queue devices too (bfq and bfq-mq instances). ++ ++The table of contents follows. Impatients can just jump to Section 3. + + CONTENTS + +@@ -509,25 +536,27 @@ To get proportional sharing of bandwidth with BFQ for a given device, + BFQ must of course be the active scheduler for that device. + + Within each group directory, the names of the files associated with +-BFQ-specific cgroup parameters and stats begin with the "bfq." +-prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for +-BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group +-parameter to set the weight of a group with BFQ is blkio.bfq.weight ++BFQ-specific cgroup parameters and stats begin with the "bfq.", ++"bfq-sq." or "bfq-mq." prefix, depending on which instance of bfq you ++want to use. So, with cgroups-v1 or cgroups-v2, the full prefix for ++BFQ-specific files is "blkio.bfqX." or "io.bfqX.", where X can be "" ++(i.e., null string), "-sq" or "-mq". For example, the group parameter ++to set the weight of a group with the mainline BFQ is blkio.bfq.weight + or io.bfq.weight. + + As for cgroups-v1 (blkio controller), the exact set of stat files +-created, and kept up-to-date by bfq, depends on whether +-CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq creates all ++created, and kept up-to-date by bfq*, depends on whether ++CONFIG_DEBUG_BLK_CGROUP is set. If it is set, then bfq* creates all + the stat files documented in + Documentation/cgroup-v1/blkio-controller.txt. If, instead, +-CONFIG_DEBUG_BLK_CGROUP is not set, then bfq creates only the files +-blkio.bfq.io_service_bytes +-blkio.bfq.io_service_bytes_recursive +-blkio.bfq.io_serviced +-blkio.bfq.io_serviced_recursive ++CONFIG_DEBUG_BLK_CGROUP is not set, then bfq* creates only the files ++blkio.bfq*.io_service_bytes ++blkio.bfq*.io_service_bytes_recursive ++blkio.bfq*.io_serviced ++blkio.bfq*.io_serviced_recursive + + The value of CONFIG_DEBUG_BLK_CGROUP greatly influences the maximum +-throughput sustainable with bfq, because updating the blkio.bfq.* ++throughput sustainable with bfq*, because updating the blkio.bfq* + stats is rather costly, especially for some of the stats enabled by + CONFIG_DEBUG_BLK_CGROUP. + +@@ -536,7 +565,7 @@ Parameters to set + + For each group, there is only the following parameter to set. + +-weight (namely blkio.bfq.weight or io.bfq-weight): the weight of the ++weight (namely blkio.bfqX.weight or io.bfqX.weight): the weight of the + group inside its parent. Available values: 1..10000 (default 100). The + linear mapping between ioprio and weights, described at the beginning + of the tunable section, is still valid, but all weights higher than +@@ -559,3 +588,55 @@ applications. Unset this tunable if you need/want to control weights. + Slightly extended version: + http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- + results.pdf ++ ++---------------------------------------------------------------------- ++ ++DETAILS ON THE DEV VERSIONS IN THIS TREE ++ ++The dev version of BFQ is available for both the legacy and the ++multi-queue block layers, as two additional I/O schedulers, named, ++respectively, bfq-sq-iosched and bfq-mq-iosched (the latter is ++available if also the changes introducing bfq-mq-iosched have been ++applied). In particular, this tree contains the dev version of bfq for ++Linux mainline 4.19.0, and has been obtained from the dev version for ++Linux 4.18.0. Rebasing from 4.18 to 4.19 involved two manual ++interventions. ++ ++First, some conflicts had to be resolved, as follows: ++ ++--------------------------------------------------------------- ++ ++diff --cc Makefile ++index 7727c1bf6fa5,69fa5c0310d8..c7cbdf0ad594 ++--- a/Makefile +++++ b/Makefile ++@@@ -1,9 -1,9 +1,9 @@@ ++ # SPDX-License-Identifier: GPL-2.0 ++ VERSION = 4 ++- PATCHLEVEL = 18 +++ PATCHLEVEL = 19 ++ SUBLEVEL = 0 ++ -EXTRAVERSION = ++ +EXTRAVERSION = -bfq-mq ++- NAME = Merciless Moray +++ NAME = "People's Front" ++ ++ # *DOCUMENTATION* ++ # To see a list of typical targets execute "make help" ++diff --cc include/linux/blkdev.h ++index 897c63322bd7,6980014357d4..8c4568ea6884 ++--- a/include/linux/blkdev.h +++++ b/include/linux/blkdev.h ++@@@ -56,7 -54,7 +54,7 @@@ struct blk_stat_callback ++ * Maximum number of blkcg policies allowed to be registered concurrently. ++ * Defined here to simplify include dependency. ++ */ ++--#define BLKCG_MAX_POLS 5 ++++#define BLKCG_MAX_POLS 7 ++ ++ typedef void (rq_end_io_fn)(struct request *, blk_status_t); ++ ++--------------------------------------------------------------- ++ ++Second, the following port commit had to be made: ++port commit "block: use ktime_get_ns() instead of sched_clock() for cfq and bfq" +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..94cb28eb20ba 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -12,6 +12,11 @@ CONFIG_NO_HZ=y + CONFIG_HIGH_RES_TIMERS=y + CONFIG_LOG_BUF_SHIFT=18 + CONFIG_CGROUPS=y ++CONFIG_BLK_CGROUP=y ++CONFIG_IOSCHED_BFQ_SQ=y ++CONFIG_BFQ_SQ_GROUP_IOSCHED=y ++CONFIG_MQ_IOSCHED_BFQ=y ++CONFIG_MQ_BFQ_GROUP_IOSCHED=y + CONFIG_CGROUP_FREEZER=y + CONFIG_CPUSETS=y + CONFIG_CGROUP_CPUACCT=y +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index a4a8914bf7a4..299a6861fb90 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -40,6 +40,26 @@ config CFQ_GROUP_IOSCHED + ---help--- + Enable group IO scheduling in CFQ. + ++config IOSCHED_BFQ_SQ ++ tristate "BFQ-SQ I/O scheduler" ++ default n ++ ---help--- ++ The BFQ-SQ I/O scheduler (for legacy blk: SQ stands for ++ SingleQueue) distributes bandwidth among all processes ++ according to their weights, regardless of the device ++ parameters and with any workload. It also guarantees a low ++ latency to interactive and soft real-time applications. ++ Details in Documentation/block/bfq-iosched.txt ++ ++config BFQ_SQ_GROUP_IOSCHED ++ bool "BFQ-SQ hierarchical scheduling support" ++ depends on IOSCHED_BFQ_SQ && BLK_CGROUP ++ default n ++ ---help--- ++ ++ Enable hierarchical scheduling in BFQ-SQ, using the blkio ++ (cgroups-v1) or io (cgroups-v2) controller. ++ + choice + + prompt "Default I/O scheduler" +@@ -54,6 +74,16 @@ choice + config DEFAULT_CFQ + bool "CFQ" if IOSCHED_CFQ=y + ++ config DEFAULT_BFQ_SQ ++ bool "BFQ-SQ" if IOSCHED_BFQ_SQ=y ++ help ++ Selects BFQ-SQ as the default I/O scheduler which will be ++ used by default for all block devices. ++ The BFQ-SQ I/O scheduler aims at distributing the bandwidth ++ as desired, independently of the disk parameters and with ++ any workload. It also tries to guarantee low latency to ++ interactive and soft real-time applications. ++ + config DEFAULT_NOOP + bool "No-op" + +@@ -63,8 +93,28 @@ config DEFAULT_IOSCHED + string + default "deadline" if DEFAULT_DEADLINE + default "cfq" if DEFAULT_CFQ ++ default "bfq-sq" if DEFAULT_BFQ_SQ + default "noop" if DEFAULT_NOOP + ++config MQ_IOSCHED_BFQ ++ tristate "BFQ-MQ I/O Scheduler" ++ default y ++ ---help--- ++ BFQ I/O scheduler for BLK-MQ. BFQ-MQ distributes bandwidth ++ among all processes according to their weights, regardless of ++ the device parameters and with any workload. It also ++ guarantees a low latency to interactive and soft real-time ++ applications. Details in Documentation/block/bfq-iosched.txt ++ ++config MQ_BFQ_GROUP_IOSCHED ++ bool "BFQ-MQ hierarchical scheduling support" ++ depends on MQ_IOSCHED_BFQ && BLK_CGROUP ++ default n ++ ---help--- ++ ++ Enable hierarchical scheduling in BFQ-MQ, using the blkio ++ (cgroups-v1) or io (cgroups-v2) controller. ++ + config MQ_IOSCHED_DEADLINE + tristate "MQ deadline I/O scheduler" + default y +diff --git a/block/Makefile b/block/Makefile +index 572b33f32c07..1dd6ffdc2fee 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -25,6 +25,8 @@ obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o + obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o + bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o + obj-$(CONFIG_IOSCHED_BFQ) += bfq.o ++obj-$(CONFIG_IOSCHED_BFQ_SQ) += bfq-sq-iosched.o ++obj-$(CONFIG_MQ_IOSCHED_BFQ) += bfq-mq-iosched.o + + obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o + obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o +diff --git a/block/bfq-cgroup-included.c b/block/bfq-cgroup-included.c +new file mode 100644 +index 000000000000..15459e50cd6a +--- /dev/null ++++ b/block/bfq-cgroup-included.c +@@ -0,0 +1,1359 @@ ++/* ++ * BFQ: CGROUPS support. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ */ ++ ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ ++/* bfqg stats flags */ ++enum bfqg_stats_flags { ++ BFQG_stats_waiting = 0, ++ BFQG_stats_idling, ++ BFQG_stats_empty, ++}; ++ ++#define BFQG_FLAG_FNS(name) \ ++static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags |= (1 << BFQG_stats_##name); \ ++} \ ++static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ ++{ \ ++ stats->flags &= ~(1 << BFQG_stats_##name); \ ++} \ ++static int bfqg_stats_##name(struct bfqg_stats *stats) \ ++{ \ ++ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ ++} \ ++ ++BFQG_FLAG_FNS(waiting) ++BFQG_FLAG_FNS(idling) ++BFQG_FLAG_FNS(empty) ++#undef BFQG_FLAG_FNS ++ ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else ++/* This should be called with the queue_lock held. */ ++#endif ++static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) ++{ ++ u64 now; ++ ++ if (!bfqg_stats_waiting(stats)) ++ return; ++ ++ now = ktime_get_ns(); ++ if (now > stats->start_group_wait_time) ++ blkg_stat_add(&stats->group_wait_time, ++ now - stats->start_group_wait_time); ++ bfqg_stats_clear_waiting(stats); ++} ++ ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else ++/* This should be called with the queue_lock held. */ ++#endif ++static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_waiting(stats)) ++ return; ++ if (bfqg == curr_bfqg) ++ return; ++ stats->start_group_wait_time = ktime_get_ns(); ++ bfqg_stats_mark_waiting(stats); ++} ++ ++#ifdef BFQ_MQ ++/* This should be called with the scheduler lock held. */ ++#else ++/* This should be called with the queue_lock held. */ ++#endif ++static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) ++{ ++ u64 now; ++ ++ if (!bfqg_stats_empty(stats)) ++ return; ++ ++ now = ktime_get_ns(); ++ if (now > stats->start_empty_time) ++ blkg_stat_add(&stats->empty_time, ++ now - stats->start_empty_time); ++ bfqg_stats_clear_empty(stats); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) ++{ ++ blkg_stat_add(&bfqg->stats.dequeue, 1); ++} ++ ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (blkg_rwstat_total(&stats->queued)) ++ return; ++ ++ /* ++ * group is already marked empty. This can happen if bfqq got new ++ * request in parent group and moved to this group while being added ++ * to service tree. Just ignore the event and move on. ++ */ ++ if (bfqg_stats_empty(stats)) ++ return; ++ ++ stats->start_empty_time = ktime_get_ns(); ++ bfqg_stats_mark_empty(stats); ++} ++ ++static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ if (bfqg_stats_idling(stats)) { ++ u64 now = ktime_get_ns(); ++ ++ if (now > stats->start_idle_time) ++ blkg_stat_add(&stats->idle_time, ++ now - stats->start_idle_time); ++ bfqg_stats_clear_idling(stats); ++ } ++} ++ ++static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ stats->start_idle_time = ktime_get_ns(); ++ bfqg_stats_mark_idling(stats); ++} ++ ++static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ ++ blkg_stat_add(&stats->avg_queue_size_sum, ++ blkg_rwstat_total(&stats->queued)); ++ blkg_stat_add(&stats->avg_queue_size_samples, 1); ++ bfqg_stats_update_group_wait_time(stats); ++} ++ ++static void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, ++ unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, 1); ++ bfqg_stats_end_empty_time(&bfqg->stats); ++ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) ++ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); ++} ++ ++static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.queued, op, -1); ++} ++ ++static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) ++{ ++ blkg_rwstat_add(&bfqg->stats.merged, op, 1); ++} ++ ++static void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ u64 start_time_ns, ++ u64 io_start_time_ns, ++ unsigned int op) ++{ ++ struct bfqg_stats *stats = &bfqg->stats; ++ u64 now = ktime_get_ns(); ++ ++ if (now > io_start_time_ns) ++ blkg_rwstat_add(&stats->service_time, op, ++ now - io_start_time_ns); ++ if (io_start_time_ns > start_time_ns) ++ blkg_rwstat_add(&stats->wait_time, op, ++ io_start_time_ns - start_time_ns); ++} ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++ ++static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, ++ struct bfq_queue *bfqq, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } ++static inline void ++bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } ++static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, ++ u64 start_time_ns, ++ u64 io_start_time_ns, ++ unsigned int op) { } ++static inline void ++bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, ++ struct bfq_group *curr_bfqg) { } ++static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } ++static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } ++static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct blkcg_policy blkcg_policy_bfq; ++ ++/* ++ * blk-cgroup policy-related handlers ++ * The following functions help in converting between blk-cgroup ++ * internal structures and BFQ-specific structures. ++ */ ++ ++static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) ++{ ++ return pd ? container_of(pd, struct bfq_group, pd) : NULL; ++} ++ ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) ++{ ++ return pd_to_blkg(&bfqg->pd); ++} ++ ++static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) ++{ ++ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); ++ ++ return pd_to_bfqg(pd); ++} ++ ++/* ++ * bfq_group handlers ++ * The following functions help in navigating the bfq_group hierarchy ++ * by allowing to find the parent of a bfq_group or the bfq_group ++ * associated to a bfq_queue. ++ */ ++ ++static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) ++{ ++ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; ++ ++ return pblkg ? blkg_to_bfqg(pblkg) : NULL; ++} ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ return group_entity ? container_of(group_entity, struct bfq_group, ++ entity) : ++ bfqq->bfqd->root_group; ++} ++ ++/* ++ * The following two functions handle get and put of a bfq_group by ++ * wrapping the related blk-cgroup hooks. ++ */ ++ ++static void bfqg_get(struct bfq_group *bfqg) ++{ ++#ifdef BFQ_MQ ++ bfqg->ref++; ++#else ++ blkg_get(bfqg_to_blkg(bfqg)); ++#endif ++} ++ ++static void bfqg_put(struct bfq_group *bfqg) ++{ ++#ifdef BFQ_MQ ++ bfqg->ref--; ++ ++ BUG_ON(bfqg->ref < 0); ++ if (bfqg->ref == 0) ++ kfree(bfqg); ++#else ++ blkg_put(bfqg_to_blkg(bfqg)); ++#endif ++} ++ ++#ifdef BFQ_MQ ++static void bfqg_and_blkg_get(struct bfq_group *bfqg) ++{ ++ /* see comments in bfq_bic_update_cgroup for why refcounting bfqg */ ++ bfqg_get(bfqg); ++ ++ blkg_get(bfqg_to_blkg(bfqg)); ++} ++ ++static void bfqg_and_blkg_put(struct bfq_group *bfqg) ++{ ++ blkg_put(bfqg_to_blkg(bfqg)); ++ ++ bfqg_put(bfqg); ++} ++#endif ++ ++/* @stats = 0 */ ++static void bfqg_stats_reset(struct bfqg_stats *stats) ++{ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ /* queued stats shouldn't be cleared */ ++ blkg_rwstat_reset(&stats->merged); ++ blkg_rwstat_reset(&stats->service_time); ++ blkg_rwstat_reset(&stats->wait_time); ++ blkg_stat_reset(&stats->time); ++ blkg_stat_reset(&stats->avg_queue_size_sum); ++ blkg_stat_reset(&stats->avg_queue_size_samples); ++ blkg_stat_reset(&stats->dequeue); ++ blkg_stat_reset(&stats->group_wait_time); ++ blkg_stat_reset(&stats->idle_time); ++ blkg_stat_reset(&stats->empty_time); ++#endif ++} ++ ++/* @to += @from */ ++static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) ++{ ++ if (!to || !from) ++ return; ++ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ /* queued stats shouldn't be cleared */ ++ blkg_rwstat_add_aux(&to->merged, &from->merged); ++ blkg_rwstat_add_aux(&to->service_time, &from->service_time); ++ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); ++ blkg_stat_add_aux(&from->time, &from->time); ++ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); ++ blkg_stat_add_aux(&to->avg_queue_size_samples, ++ &from->avg_queue_size_samples); ++ blkg_stat_add_aux(&to->dequeue, &from->dequeue); ++ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); ++ blkg_stat_add_aux(&to->idle_time, &from->idle_time); ++ blkg_stat_add_aux(&to->empty_time, &from->empty_time); ++#endif ++} ++ ++/* ++ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' ++ * recursive stats can still account for the amount used by this bfqg after ++ * it's gone. ++ */ ++static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) ++{ ++ struct bfq_group *parent; ++ ++ if (!bfqg) /* root_group */ ++ return; ++ ++ parent = bfqg_parent(bfqg); ++ ++ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); ++ ++ if (unlikely(!parent)) ++ return; ++ ++ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); ++ bfqg_stats_reset(&bfqg->stats); ++} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++#ifdef BFQ_MQ ++ /* ++ * Make sure that bfqg and its associated blkg do not ++ * disappear before entity. ++ */ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting bfqg %p and blkg\n", ++ bfqg); ++ ++ bfqg_and_blkg_get(bfqg); ++#else ++ bfqg_get(bfqg); ++#endif ++ } ++ entity->parent = bfqg->my_entity; /* NULL for root group */ ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfqg_stats_exit(struct bfqg_stats *stats) ++{ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ blkg_rwstat_exit(&stats->merged); ++ blkg_rwstat_exit(&stats->service_time); ++ blkg_rwstat_exit(&stats->wait_time); ++ blkg_rwstat_exit(&stats->queued); ++ blkg_stat_exit(&stats->time); ++ blkg_stat_exit(&stats->avg_queue_size_sum); ++ blkg_stat_exit(&stats->avg_queue_size_samples); ++ blkg_stat_exit(&stats->dequeue); ++ blkg_stat_exit(&stats->group_wait_time); ++ blkg_stat_exit(&stats->idle_time); ++ blkg_stat_exit(&stats->empty_time); ++#endif ++} ++ ++static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) ++{ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ if (blkg_rwstat_init(&stats->merged, gfp) || ++ blkg_rwstat_init(&stats->service_time, gfp) || ++ blkg_rwstat_init(&stats->wait_time, gfp) || ++ blkg_rwstat_init(&stats->queued, gfp) || ++ blkg_stat_init(&stats->time, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || ++ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || ++ blkg_stat_init(&stats->dequeue, gfp) || ++ blkg_stat_init(&stats->group_wait_time, gfp) || ++ blkg_stat_init(&stats->idle_time, gfp) || ++ blkg_stat_init(&stats->empty_time, gfp)) { ++ bfqg_stats_exit(stats); ++ return -ENOMEM; ++ } ++#endif ++ ++ return 0; ++} ++ ++static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) ++{ ++ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; ++} ++ ++static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) ++{ ++ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); ++} ++ ++static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) ++{ ++ struct bfq_group_data *bgd; ++ ++ bgd = kzalloc(sizeof(*bgd), gfp); ++ if (!bgd) ++ return NULL; ++ return &bgd->pd; ++} ++ ++static void bfq_cpd_init(struct blkcg_policy_data *cpd) ++{ ++ struct bfq_group_data *d = cpd_to_bfqgd(cpd); ++ ++ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? ++ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; ++} ++ ++static void bfq_cpd_free(struct blkcg_policy_data *cpd) ++{ ++ kfree(cpd_to_bfqgd(cpd)); ++} ++ ++static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) ++{ ++ struct bfq_group *bfqg; ++ ++ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); ++ if (!bfqg) ++ return NULL; ++ ++ if (bfqg_stats_init(&bfqg->stats, gfp)) { ++ kfree(bfqg); ++ return NULL; ++ } ++#ifdef BFQ_MQ ++ /* see comments in bfq_bic_update_cgroup for why refcounting */ ++ bfqg_get(bfqg); ++#endif ++ return &bfqg->pd; ++} ++ ++static void bfq_pd_init(struct blkg_policy_data *pd) ++{ ++ struct blkcg_gq *blkg; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++ struct bfq_group_data *d; ++ ++ blkg = pd_to_blkg(pd); ++ BUG_ON(!blkg); ++ bfqg = blkg_to_bfqg(blkg); ++ bfqd = blkg->q->elevator->elevator_data; ++ BUG_ON(bfqg == bfqd->root_group); ++ entity = &bfqg->entity; ++ d = blkcg_to_bfqgd(blkg->blkcg); ++ ++ entity->orig_weight = entity->weight = entity->new_weight = d->weight; ++ entity->my_sched_data = &bfqg->sched_data; ++ bfqg->my_entity = entity; /* ++ * the root_group's will be set to NULL ++ * in bfq_init_queue() ++ */ ++ bfqg->bfqd = bfqd; ++ bfqg->active_entities = 0; ++ bfqg->rq_pos_tree = RB_ROOT; ++} ++ ++static void bfq_pd_free(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_exit(&bfqg->stats); ++#ifdef BFQ_MQ ++ bfqg_put(bfqg); ++#else ++ kfree(bfqg); ++#endif ++} ++ ++static void bfq_pd_reset_stats(struct blkg_policy_data *pd) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ ++ bfqg_stats_reset(&bfqg->stats); ++} ++ ++static void bfq_group_set_parent(struct bfq_group *bfqg, ++ struct bfq_group *parent) ++{ ++ struct bfq_entity *entity; ++ ++ BUG_ON(!parent); ++ BUG_ON(!bfqg); ++ BUG_ON(bfqg == parent); ++ ++ entity = &bfqg->entity; ++ entity->parent = parent->my_entity; ++ entity->sched_data = &parent->sched_data; ++} ++ ++static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct blkcg_gq *blkg; ++ ++ blkg = blkg_lookup(blkcg, bfqd->queue); ++ if (likely(blkg)) ++ return blkg_to_bfqg(blkg); ++ return NULL; ++} ++ ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ struct bfq_group *bfqg, *parent; ++ struct bfq_entity *entity; ++ ++ bfqg = bfq_lookup_bfqg(bfqd, blkcg); ++ ++ if (unlikely(!bfqg)) ++ return NULL; ++ ++ /* ++ * Update chain of bfq_groups as we might be handling a leaf group ++ * which, along with some of its relatives, has not been hooked yet ++ * to the private hierarchy of BFQ. ++ */ ++ entity = &bfqg->entity; ++ for_each_entity(entity) { ++ bfqg = container_of(entity, struct bfq_group, entity); ++ BUG_ON(!bfqg); ++ if (bfqg != bfqd->root_group) { ++ parent = bfqg_parent(bfqg); ++ if (!parent) ++ parent = bfqd->root_group; ++ BUG_ON(!parent); ++ bfq_group_set_parent(bfqg, parent); ++ } ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq); ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/** ++ * bfq_bfqq_move - migrate @bfqq to @bfqg. ++ * @bfqd: queue descriptor. ++ * @bfqq: the queue to move. ++ * @bfqg: the group to move to. ++ * ++ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating ++ * it on the new one. Avoid putting the entity on the old group idle tree. ++ * ++#ifdef BFQ_MQ ++ * Must be called under the scheduler lock, to make sure that the blkg ++ * owning @bfqg does not disappear (see comments in ++ * bfq_bic_update_cgroup on guaranteeing the consistency of blkg ++ * objects). ++#else ++ * Must be called under the queue lock; the cgroup owning @bfqg must ++ * not disappear (by now this just means that we are called under ++ * rcu_read_lock()). ++#endif ++ */ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); ++ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) ++ && entity->on_st && ++ bfqq != bfqd->in_service_queue); ++ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); ++ ++ /* If bfqq is empty, then bfq_bfqq_expire also invokes ++ * bfq_del_bfqq_busy, thereby removing bfqq and its entity ++ * from data structures related to current group. Otherwise we ++ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as ++ * we do below. ++ */ ++ if (bfqq == bfqd->in_service_queue) ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ ++ if (bfq_bfqq_busy(bfqq)) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ else if (entity->on_st) { ++ BUG_ON(&bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); ++ } ++#ifdef BFQ_MQ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); ++ ++ bfqg_and_blkg_put(bfqq_group(bfqq)); ++#else ++ bfqg_put(bfqq_group(bfqq)); ++#endif ++ ++ entity->parent = bfqg->my_entity; ++ entity->sched_data = &bfqg->sched_data; ++#ifdef BFQ_MQ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "getting blkg and bfqg %p\n", bfqg); ++ ++ /* pin down bfqg and its associated blkg */ ++ bfqg_and_blkg_get(bfqg); ++#else ++ bfqg_get(bfqg); ++#endif ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); ++ if (bfq_bfqq_busy(bfqq)) { ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ bfq_activate_bfqq(bfqd, bfqq); ++ } ++ ++ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) ++ && &bfq_entity_service_tree(entity)->idle != ++ entity->tree); ++} ++ ++/** ++ * __bfq_bic_change_cgroup - move @bic to @cgroup. ++ * @bfqd: the queue descriptor. ++ * @bic: the bic to move. ++ * @blkcg: the blk-cgroup to move to. ++ * ++#ifdef BFQ_MQ ++ * Move bic to blkcg, assuming that bfqd->lock is held; which makes ++ * sure that the reference to cgroup is valid across the call (see ++ * comments in bfq_bic_update_cgroup on this issue) ++#else ++ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller ++ * has to make sure that the reference to cgroup is valid across the call. ++#endif ++ * ++ * NOTE: an alternative approach might have been to store the current ++ * cgroup in bfqq and getting a reference to it, reducing the lookup ++ * time here, at the price of slightly more complex code. ++ */ ++static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct blkcg *blkcg) ++{ ++ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); ++ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); ++ struct bfq_group *bfqg; ++ struct bfq_entity *entity; ++ ++ bfqg = bfq_find_set_group(bfqd, blkcg); ++ ++ if (unlikely(!bfqg)) ++ bfqg = bfqd->root_group; ++ ++ if (async_bfqq) { ++ entity = &async_bfqq->entity; ++ ++ if (entity->sched_data != &bfqg->sched_data) { ++ bic_set_bfqq(bic, NULL, 0); ++ bfq_log_bfqq(bfqd, async_bfqq, ++ "%p %d", ++ async_bfqq, ++ async_bfqq->ref); ++ bfq_put_queue(async_bfqq); ++ } ++ } ++ ++ if (sync_bfqq) { ++ entity = &sync_bfqq->entity; ++ if (entity->sched_data != &bfqg->sched_data) ++ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); ++ } ++ ++ return bfqg; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_group *bfqg = NULL; ++ uint64_t serial_nr; ++ ++ rcu_read_lock(); ++ serial_nr = bio_blkcg(bio)->css.serial_nr; ++ ++ /* ++ * Check whether blkcg has changed. The condition may trigger ++ * spuriously on a newly created cic but there's no harm. ++ */ ++ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) ++ goto out; ++ ++ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); ++#ifdef BFQ_MQ ++ /* ++ * Update blkg_path for bfq_log_* functions. We cache this ++ * path, and update it here, for the following ++ * reasons. Operations on blkg objects in blk-cgroup are ++ * protected with the request_queue lock, and not with the ++ * lock that protects the instances of this scheduler ++ * (bfqd->lock). This exposes BFQ to the following sort of ++ * race. ++ * ++ * The blkg_lookup performed in bfq_get_queue, protected ++ * through rcu, may happen to return the address of a copy of ++ * the original blkg. If this is the case, then the ++ * bfqg_and_blkg_get performed in bfq_get_queue, to pin down ++ * the blkg, is useless: it does not prevent blk-cgroup code ++ * from destroying both the original blkg and all objects ++ * directly or indirectly referred by the copy of the ++ * blkg. ++ * ++ * On the bright side, destroy operations on a blkg invoke, as ++ * a first step, hooks of the scheduler associated with the ++ * blkg. And these hooks are executed with bfqd->lock held for ++ * BFQ. As a consequence, for any blkg associated with the ++ * request queue this instance of the scheduler is attached ++ * to, we are guaranteed that such a blkg is not destroyed, and ++ * that all the pointers it contains are consistent, while we ++ * are holding bfqd->lock. A blkg_lookup performed with ++ * bfqd->lock held then returns a fully consistent blkg, which ++ * remains consistent until this lock is held. ++ * ++ * Thanks to the last fact, and to the fact that: (1) bfqg has ++ * been obtained through a blkg_lookup in the above ++ * assignment, and (2) bfqd->lock is being held, here we can ++ * safely use the policy data for the involved blkg (i.e., the ++ * field bfqg->pd) to get to the blkg associated with bfqg, ++ * and then we can safely use any field of blkg. After we ++ * release bfqd->lock, even just getting blkg through this ++ * bfqg may cause dangling references to be traversed, as ++ * bfqg->pd may not exist any more. ++ * ++ * In view of the above facts, here we cache, in the bfqg, any ++ * blkg data we may need for this bic, and for its associated ++ * bfq_queue. As of now, we need to cache only the path of the ++ * blkg, which is used in the bfq_log_* functions. ++ * ++ * Finally, note that bfqg itself needs to be protected from ++ * destruction on the blkg_free of the original blkg (which ++ * invokes bfq_pd_free). We use an additional private ++ * refcounter for bfqg, to let it disappear only after no ++ * bfq_queue refers to it any longer. ++ */ ++ blkg_path(bfqg_to_blkg(bfqg), bfqg->blkg_path, sizeof(bfqg->blkg_path)); ++#endif ++ bic->blkcg_serial_nr = serial_nr; ++out: ++ rcu_read_unlock(); ++} ++ ++/** ++ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. ++ * @st: the service tree being flushed. ++ */ ++static void bfq_flush_idle_tree(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *entity = st->first_idle; ++ ++ for (; entity ; entity = st->first_idle) ++ __bfq_deactivate_entity(entity, false); ++} ++ ++/** ++ * bfq_reparent_leaf_entity - move leaf entity to the root_group. ++ * @bfqd: the device data structure with the root group. ++ * @entity: the entity to move. ++ */ ++static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ BUG_ON(!bfqq); ++ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); ++} ++ ++/** ++ * bfq_reparent_active_entities - move to the root group all active ++ * entities. ++ * @bfqd: the device data structure with the root group. ++ * @bfqg: the group to move from. ++ * @st: the service tree with the entities. ++ */ ++static void bfq_reparent_active_entities(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ struct bfq_service_tree *st) ++{ ++ struct rb_root *active = &st->active; ++ struct bfq_entity *entity = NULL; ++ ++ if (!RB_EMPTY_ROOT(&st->active)) ++ entity = bfq_entity_of(rb_first(active)); ++ ++ for (; entity ; entity = bfq_entity_of(rb_first(active))) ++ bfq_reparent_leaf_entity(bfqd, entity); ++ ++ if (bfqg->sched_data.in_service_entity) ++ bfq_reparent_leaf_entity(bfqd, ++ bfqg->sched_data.in_service_entity); ++} ++ ++/** ++ * bfq_pd_offline - deactivate the entity associated with @pd, ++ * and reparent its children entities. ++ * @pd: descriptor of the policy going offline. ++ * ++ * blkio already grabs the queue_lock for us, so no need to use ++ * RCU-based magic ++ */ ++static void bfq_pd_offline(struct blkg_policy_data *pd) ++{ ++ struct bfq_service_tree *st; ++ struct bfq_group *bfqg; ++ struct bfq_data *bfqd; ++ struct bfq_entity *entity; ++#ifdef BFQ_MQ ++ unsigned long flags; ++#endif ++ int i; ++ ++ BUG_ON(!pd); ++ bfqg = pd_to_bfqg(pd); ++ BUG_ON(!bfqg); ++ bfqd = bfqg->bfqd; ++ BUG_ON(bfqd && !bfqd->root_group); ++ ++ entity = bfqg->my_entity; ++ ++#ifdef BFQ_MQ ++ spin_lock_irqsave(&bfqd->lock, flags); ++#endif ++ ++ if (!entity) /* root group */ ++ goto put_async_queues; ++ ++ /* ++ * Empty all service_trees belonging to this group before ++ * deactivating the group itself. ++ */ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { ++ BUG_ON(!bfqg->sched_data.service_tree); ++ st = bfqg->sched_data.service_tree + i; ++ /* ++ * The idle tree may still contain bfq_queues belonging ++ * to exited task because they never migrated to a different ++ * cgroup from the one being destroyed now. ++ */ ++ bfq_flush_idle_tree(st); ++ ++ /* ++ * It may happen that some queues are still active ++ * (busy) upon group destruction (if the corresponding ++ * processes have been forced to terminate). We move ++ * all the leaf entities corresponding to these queues ++ * to the root_group. ++ * Also, it may happen that the group has an entity ++ * in service, which is disconnected from the active ++ * tree: it must be moved, too. ++ * There is no need to put the sync queues, as the ++ * scheduler has taken no reference. ++ */ ++ bfq_reparent_active_entities(bfqd, bfqg, st); ++ BUG_ON(!RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); ++ } ++ BUG_ON(bfqg->sched_data.next_in_service); ++ BUG_ON(bfqg->sched_data.in_service_entity); ++ ++ __bfq_deactivate_entity(entity, false); ++ ++put_async_queues: ++ bfq_put_async_queues(bfqd, bfqg); ++ ++#ifdef BFQ_MQ ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++#endif ++ /* ++ * @blkg is going offline and will be ignored by ++ * blkg_[rw]stat_recursive_sum(). Transfer stats to the parent so ++ * that they don't get lost. If IOs complete after this point, the ++ * stats for them will be lost. Oh well... ++ */ ++ bfqg_stats_xfer_dead(bfqg); ++} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ struct blkcg_gq *blkg; ++ ++ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ BUG_ON(!bfqg); ++ ++ bfq_end_wr_async_queues(bfqd, bfqg); ++ } ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static int bfq_io_show_weight(struct seq_file *sf, void *v) ++{ ++ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ unsigned int val = 0; ++ ++ if (bfqgd) ++ val = bfqgd->weight; ++ ++ seq_printf(sf, "%u\n", val); ++ ++ return 0; ++} ++ ++static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, ++ struct cftype *cftype, ++ u64 val) ++{ ++ struct blkcg *blkcg = css_to_blkcg(css); ++ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); ++ struct blkcg_gq *blkg; ++ int ret = -ERANGE; ++ ++ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) ++ return ret; ++ ++ ret = 0; ++ spin_lock_irq(&blkcg->lock); ++ bfqgd->weight = (unsigned short)val; ++ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { ++ struct bfq_group *bfqg = blkg_to_bfqg(blkg); ++ ++ if (!bfqg) ++ continue; ++ /* ++ * Setting the prio_changed flag of the entity ++ * to 1 with new_weight == weight would re-set ++ * the value of the weight to its ioprio mapping. ++ * Set the flag only if necessary. ++ */ ++ if ((unsigned short)val != bfqg->entity.new_weight) { ++ bfqg->entity.new_weight = (unsigned short)val; ++ /* ++ * Make sure that the above new value has been ++ * stored in bfqg->entity.new_weight before ++ * setting the prio_changed flag. In fact, ++ * this flag may be read asynchronously (in ++ * critical sections protected by a different ++ * lock than that held here), and finding this ++ * flag set may cause the execution of the code ++ * for updating parameters whose value may ++ * depend also on bfqg->entity.new_weight (in ++ * __bfq_entity_update_weight_prio). ++ * This barrier makes sure that the new value ++ * of bfqg->entity.new_weight is correctly ++ * seen in that code. ++ */ ++ smp_wmb(); ++ bfqg->entity.prio_changed = 1; ++ } ++ } ++ spin_unlock_irq(&blkcg->lock); ++ ++ return ret; ++} ++ ++static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, ++ char *buf, size_t nbytes, ++ loff_t off) ++{ ++ u64 weight; ++ /* First unsigned long found in the file is used */ ++ int ret = kstrtoull(strim(buf), 0, &weight); ++ ++ if (ret) ++ return ret; ++ ++ ret = bfq_io_set_weight_legacy(of_css(of), NULL, weight); ++ return ret ?: nbytes; ++} ++ ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++static int bfqg_print_stat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, ++ &blkcg_policy_bfq, seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, off); ++ return __blkg_prfill_u64(sf, pd, sum); ++} ++ ++static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), ++ &blkcg_policy_bfq, ++ off); ++ return __blkg_prfill_rwstat(sf, pd, &sum); ++} ++ ++static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, false); ++ return 0; ++} ++ ++static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, ++ seq_cft(sf)->private, true); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, ++ int off) ++{ ++ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); ++ return 0; ++} ++ ++static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, ++ offsetof(struct blkcg_gq, stat_bytes)); ++ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + ++ atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); ++ ++ return __blkg_prfill_u64(sf, pd, sum >> 9); ++} ++ ++static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, ++ false); ++ return 0; ++} ++ ++ ++static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, ++ struct blkg_policy_data *pd, int off) ++{ ++ struct bfq_group *bfqg = pd_to_bfqg(pd); ++ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); ++ u64 v = 0; ++ ++ if (samples) { ++ v = blkg_stat_read(&bfqg->stats.avg_queue_size_sum); ++ v = div64_u64(v, samples); ++ } ++ __blkg_prfill_u64(sf, pd, v); ++ return 0; ++} ++ ++/* print avg_queue_size */ ++static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) ++{ ++ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), ++ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, ++ 0, false); ++ return 0; ++} ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ ++ ++static struct bfq_group * ++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ int ret; ++ ++ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); ++ if (ret) ++ return NULL; ++ ++ return blkg_to_bfqg(bfqd->queue->root_blkg); ++} ++ ++#ifdef BFQ_MQ ++#define BFQ_CGROUP_FNAME(param) "bfq-mq."#param ++#else ++#define BFQ_CGROUP_FNAME(param) "bfq-sq."#param ++#endif ++ ++static struct cftype bfq_blkcg_legacy_files[] = { ++ { ++ .name = BFQ_CGROUP_FNAME(weight), ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfq_io_show_weight, ++ .write_u64 = bfq_io_set_weight_legacy, ++ }, ++ ++ /* statistics, covers only the tasks in the bfqg */ ++ { ++ .name = BFQ_CGROUP_FNAME(io_service_bytes), ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_bytes, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_serviced), ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_ios, ++ }, ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ { ++ .name = BFQ_CGROUP_FNAME(time), ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(sectors), ++ .seq_show = bfqg_print_stat_sectors, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_service_time), ++ .private = offsetof(struct bfq_group, stats.service_time), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_wait_time), ++ .private = offsetof(struct bfq_group, stats.wait_time), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_merged), ++ .private = offsetof(struct bfq_group, stats.merged), ++ .seq_show = bfqg_print_rwstat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_queued), ++ .private = offsetof(struct bfq_group, stats.queued), ++ .seq_show = bfqg_print_rwstat, ++ }, ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ ++ ++ /* the same statictics which cover the bfqg and its descendants */ ++ { ++ .name = BFQ_CGROUP_FNAME(io_service_bytes_recursive), ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_bytes_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_serviced_recursive), ++ .private = (unsigned long)&blkcg_policy_bfq, ++ .seq_show = blkg_print_stat_ios_recursive, ++ }, ++#ifdef CONFIG_DEBUG_BLK_CGROUP ++ { ++ .name = BFQ_CGROUP_FNAME(time_recursive), ++ .private = offsetof(struct bfq_group, stats.time), ++ .seq_show = bfqg_print_stat_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(sectors_recursive), ++ .seq_show = bfqg_print_stat_sectors_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_service_time_recursive), ++ .private = offsetof(struct bfq_group, stats.service_time), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_wait_time_recursive), ++ .private = offsetof(struct bfq_group, stats.wait_time), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_merged_recursive), ++ .private = offsetof(struct bfq_group, stats.merged), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(io_queued_recursive), ++ .private = offsetof(struct bfq_group, stats.queued), ++ .seq_show = bfqg_print_rwstat_recursive, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(avg_queue_size), ++ .seq_show = bfqg_print_avg_queue_size, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(group_wait_time), ++ .private = offsetof(struct bfq_group, stats.group_wait_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(idle_time), ++ .private = offsetof(struct bfq_group, stats.idle_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(empty_time), ++ .private = offsetof(struct bfq_group, stats.empty_time), ++ .seq_show = bfqg_print_stat, ++ }, ++ { ++ .name = BFQ_CGROUP_FNAME(dequeue), ++ .private = offsetof(struct bfq_group, stats.dequeue), ++ .seq_show = bfqg_print_stat, ++ }, ++#endif /* CONFIG_DEBUG_BLK_CGROUP */ ++ { } /* terminate */ ++}; ++ ++static struct cftype bfq_blkg_files[] = { ++ { ++ .name = BFQ_CGROUP_FNAME(weight), ++ .flags = CFTYPE_NOT_ON_ROOT, ++ .seq_show = bfq_io_show_weight, ++ .write = bfq_io_set_weight, ++ }, ++ {} /* terminate */ ++}; ++ ++#undef BFQ_CGROUP_FNAME ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_group *bfqg) {} ++ ++static void bfq_init_entity(struct bfq_entity *entity, ++ struct bfq_group *bfqg) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->weight = entity->new_weight; ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) { ++ bfqq->ioprio = bfqq->new_ioprio; ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ } ++ entity->sched_data = &bfqg->sched_data; ++} ++ ++static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} ++ ++static void bfq_end_wr_async(struct bfq_data *bfqd) ++{ ++ bfq_end_wr_async_queues(bfqd, bfqd->root_group); ++} ++ ++static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, ++ struct blkcg *blkcg) ++{ ++ return bfqd->root_group; ++} ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++static struct bfq_group * ++bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) ++{ ++ struct bfq_group *bfqg; ++ int i; ++ ++ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); ++ if (!bfqg) ++ return NULL; ++ ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ ++ return bfqg; ++} ++#endif +diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c +new file mode 100644 +index 000000000000..fb7bb8f08b75 +--- /dev/null ++++ b/block/bfq-ioc.c +@@ -0,0 +1,36 @@ ++/* ++ * BFQ: I/O context handling. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2010 Paolo Valente <paolo.valente@unimore.it> ++ */ ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * ++ * Queue lock must be held. ++ */ ++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc) ++{ ++ if (ioc) ++ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); ++ return NULL; ++} +diff --git a/block/bfq-mq-iosched.c b/block/bfq-mq-iosched.c +new file mode 100644 +index 000000000000..47a49d9e6512 +--- /dev/null ++++ b/block/bfq-mq-iosched.c +@@ -0,0 +1,6548 @@ ++/* ++ * Budget Fair Queueing (BFQ) I/O scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share I/O scheduler, with some extra ++ * low-latency capabilities. BFQ also supports full hierarchical ++ * scheduling through cgroups. Next paragraphs provide an introduction ++ * on BFQ inner workings. Details on BFQ benefits and usage can be ++ * found in Documentation/block/bfq-iosched.txt. ++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. ++ * ++ * In particular, BFQ schedules I/O so as to achieve the latter goal-- ++ * low latency for interactive and soft real-time applications--if the ++ * low_latency parameter is set (default configuration). To this ++ * purpose, BFQ constantly tries to detect whether the I/O requests in ++ * a bfq_queue come from an interactive or a soft real-time ++ * application. For brevity, in these cases, the queue is said to be ++ * interactive or soft real-time. In both cases, BFQ privileges the ++ * service of the queue, over that of non-interactive and ++ * non-soft-real-time queues. This privileging is performed, mainly, ++ * by raising the weight of the queue. So, for brevity, we call just ++ * weight-raising periods the time periods during which a queue is ++ * privileged, because deemed interactive or soft real-time. ++ * ++ * The detection of soft real-time queues/applications is described in ++ * detail in the comments on the function ++ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an ++ * interactive queue works as follows: a queue is deemed interactive ++ * if it is constantly non empty only for a limited time interval, ++ * after which it does become empty. The queue may be deemed ++ * interactive again (for a limited time), if it restarts being ++ * constantly non empty, provided that this happens only after the ++ * queue has remained empty for a given minimum idle time. ++ * ++ * By default, BFQ computes automatically the above maximum time ++ * interval, i.e., the time interval after which a constantly ++ * non-empty queue stops being deemed interactive. Since a queue is ++ * weight-raised while it is deemed interactive, this maximum time ++ * interval happens to coincide with the (maximum) duration of the ++ * weight-raising for interactive queues. ++ * ++ * NOTE: if the main or only goal, with a given device, is to achieve ++ * the maximum-possible throughput at all times, then do switch off ++ * all low-latency heuristics for that device, by setting low_latency ++ * to 0. ++ * ++ * BFQ is described in [1], where also a reference to the initial, ++ * more theoretical paper on BFQ can be found. The interested reader ++ * can find in the latter paper full details on the main algorithm, as ++ * well as formulas of the guarantees and formal proofs of all the ++ * properties. With respect to the version of BFQ presented in these ++ * papers, this implementation adds a few more heuristics, such as the ++ * one that guarantees a low latency to soft real-time applications, ++ * and a hierarchical extension based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O ++ * Scheduler", Proceedings of the First Workshop on Mobile System ++ * Technologies (MST-2015), May 2015. ++ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf ++ * ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include <linux/module.h> ++#include <linux/slab.h> ++#include <linux/blkdev.h> ++#include <linux/cgroup.h> ++#include <linux/elevator.h> ++#include <linux/jiffies.h> ++#include <linux/rbtree.h> ++#include <linux/ioprio.h> ++#include <linux/sbitmap.h> ++#include <linux/delay.h> ++ ++#include "blk.h" ++#include "blk-mq.h" ++#include "blk-mq-tag.h" ++#include "blk-mq-sched.h" ++#include "bfq-mq.h" ++#include "blk-wbt.h" ++ ++/* Expiration time of sync (0) and async (1) requests, in ns. */ ++static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = (16 * 1024); ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in ns. */ ++static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); ++ ++/* Minimum number of assigned budgets for which stats are safe to compute. */ ++static const int bfq_stats_min_budgets = 194; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = (16 * 1024); ++ ++/* ++ * When a sync request is dispatched, the queue that contains that ++ * request, and all the ancestor entities of that queue, are charged ++ * with the number of sectors of the request. In constrast, if the ++ * request is async, then the queue and its ancestor entities are ++ * charged with the number of sectors of the request, multiplied by ++ * the factor below. This throttles the bandwidth for async I/O, ++ * w.r.t. to sync I/O, and it is done to counter the tendency of async ++ * writes to steal I/O throughput to reads. ++ * ++ * The current value of this parameter is the result of a tuning with ++ * several hardware and software configurations. We tried to find the ++ * lowest value for which writes do not cause noticeable problems to ++ * reads. In fact, the lower this parameter, the stabler I/O control, ++ * in the following respect. The lower this parameter is, the less ++ * the bandwidth enjoyed by a group decreases ++ * - when the group does writes, w.r.t. to when it does reads; ++ * - when other groups do reads, w.r.t. to when they do writes. ++ */ ++static const int bfq_async_charge_factor = 3; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout = (HZ / 8); ++ ++/* ++ * Time limit for merging (see comments in bfq_setup_cooperator). Set ++ * to the slowest value that, in our tests, proved to be effective in ++ * removing false positives, while not causing true positives to miss ++ * queue merging. ++ * ++ * As can be deduced from the low time limit below, queue merging, if ++ * successful, happens at the very beggining of the I/O of the involved ++ * cooperating processes, as a consequence of the arrival of the very ++ * first requests from each cooperator. After that, there is very ++ * little chance to find cooperators. ++ */ ++static const unsigned long bfq_merge_time_limit = HZ/10; ++ ++#define MAX_LENGTH_REASON_NAME 25 ++ ++static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", ++"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", ++"PREEMPTED"}; ++ ++static struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ns), we consider thinktime immediate. */ ++#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. */ ++#define BFQ_HW_QUEUE_THRESHOLD 3 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 100) ++#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) ++#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ ++ (get_sdist(last_pos, rq) > \ ++ BFQQ_SEEK_THR && \ ++ (!blk_queue_nonrot(bfqd->queue) || \ ++ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) ++#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) ++ ++/* Min number of samples required to perform peak-rate update */ ++#define BFQ_RATE_MIN_SAMPLES 32 ++/* Min observation time interval required to perform a peak-rate update (ns) */ ++#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) ++/* Target observation time interval for a peak-rate update (ns) */ ++#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC ++ ++/* ++ * Shift used for peak-rate fixed precision calculations. ++ * With ++ * - the current shift: 16 positions ++ * - the current type used to store rate: u32 ++ * - the current unit of measure for rate: [sectors/usec], or, more precisely, ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, ++ * the range of rates that can be stored is ++ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = ++ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = ++ * [15, 65G] sectors/sec ++ * Which, assuming a sector size of 512B, corresponds to a range of ++ * [7.5K, 33T] B/sec ++ */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * When configured for computing the duration of the weight-raising ++ * for interactive queues automatically (see the comments at the ++ * beginning of this file), BFQ does it using the following formula: ++ * duration = (ref_rate / r) * ref_wr_duration, ++ * where r is the peak rate of the device, and ref_rate and ++ * ref_wr_duration are two reference parameters. In particular, ++ * ref_rate is the peak rate of the reference storage device (see ++ * below), and ref_wr_duration is about the maximum time needed, with ++ * BFQ and while reading two files in parallel, to load typical large ++ * applications on the reference device (see the comments on ++ * max_service_from_wr below, for more details on how ref_wr_duration ++ * is obtained). In practice, the slower/faster the device at hand ++ * is, the more/less it takes to load applications with respect to the ++ * reference device. Accordingly, the longer/shorter BFQ grants ++ * weight raising to interactive applications. ++ * ++ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), ++ * depending on whether the device is rotational or non-rotational. ++ * ++ * In the following definitions, ref_rate[0] and ref_wr_duration[0] ++ * are the reference values for a rotational device, whereas ++ * ref_rate[1] and ref_wr_duration[1] are the reference values for a ++ * non-rotational device. The reference rates are not the actual peak ++ * rates of the devices used as a reference, but slightly lower ++ * values. The reason for using slightly lower values is that the ++ * peak-rate estimator tends to yield slightly lower values than the ++ * actual peak rate (it can yield the actual peak rate only if there ++ * is only one process doing I/O, and the process does sequential ++ * I/O). ++ * ++ * The reference peak rates are measured in sectors/usec, left-shifted ++ * by BFQ_RATE_SHIFT. ++ */ ++static int ref_rate[2] = {14000, 33000}; ++/* ++ * To improve readability, a conversion function is used to initialize ++ * the following array, which entails that the array can be ++ * initialized only in a function. ++ */ ++static int ref_wr_duration[2]; ++ ++/* ++ * BFQ uses the above-detailed, time-based weight-raising mechanism to ++ * privilege interactive tasks. This mechanism is vulnerable to the ++ * following false positives: I/O-bound applications that will go on ++ * doing I/O for much longer than the duration of weight ++ * raising. These applications have basically no benefit from being ++ * weight-raised at the beginning of their I/O. On the opposite end, ++ * while being weight-raised, these applications ++ * a) unjustly steal throughput to applications that may actually need ++ * low latency; ++ * b) make BFQ uselessly perform device idling; device idling results ++ * in loss of device throughput with most flash-based storage, and may ++ * increase latencies when used purposelessly. ++ * ++ * BFQ tries to reduce these problems, by adopting the following ++ * countermeasure. To introduce this countermeasure, we need first to ++ * finish explaining how the duration of weight-raising for ++ * interactive tasks is computed. ++ * ++ * For a bfq_queue deemed as interactive, the duration of weight ++ * raising is dynamically adjusted, as a function of the estimated ++ * peak rate of the device, so as to be equal to the time needed to ++ * execute the 'largest' interactive task we benchmarked so far. By ++ * largest task, we mean the task for which each involved process has ++ * to do more I/O than for any of the other tasks we benchmarked. This ++ * reference interactive task is the start-up of LibreOffice Writer, ++ * and in this task each process/bfq_queue needs to have at most ~110K ++ * sectors transferred. ++ * ++ * This last piece of information enables BFQ to reduce the actual ++ * duration of weight-raising for at least one class of I/O-bound ++ * applications: those doing sequential or quasi-sequential I/O. An ++ * example is file copy. In fact, once started, the main I/O-bound ++ * processes of these applications usually consume the above 110K ++ * sectors in much less time than the processes of an application that ++ * is starting, because these I/O-bound processes will greedily devote ++ * almost all their CPU cycles only to their target, ++ * throughput-friendly I/O operations. This is even more true if BFQ ++ * happens to be underestimating the device peak rate, and thus ++ * overestimating the duration of weight raising. But, according to ++ * our measurements, once transferred 110K sectors, these processes ++ * have no right to be weight-raised any longer. ++ * ++ * Basing on the last consideration, BFQ ends weight-raising for a ++ * bfq_queue if the latter happens to have received an amount of ++ * service at least equal to the following constant. The constant is ++ * set to slightly more than 110K, to have a minimum safety margin. ++ * ++ * This early ending of weight-raising reduces the amount of time ++ * during which interactive false positives cause the two problems ++ * described at the beginning of these comments. ++ */ ++static const unsigned long max_service_from_wr = 120000; ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++/** ++ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. ++ * @icq: the iocontext queue. ++ */ ++static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) ++{ ++ /* bic->icq is the first member, %NULL will convert to %NULL */ ++ return container_of(icq, struct bfq_io_cq, icq); ++} ++ ++/** ++ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. ++ * @bfqd: the lookup key. ++ * @ioc: the io_context of the process doing I/O. ++ * @q: the request queue. ++ */ ++static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, ++ struct io_context *ioc, ++ struct request_queue *q) ++{ ++ if (ioc) { ++ unsigned long flags; ++ struct bfq_io_cq *icq; ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ icq = icq_to_bic(ioc_lookup_icq(ioc, q)); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return icq; ++ } ++ ++ return NULL; ++} ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, ""); ++ blk_mq_run_hw_queues(bfqd->queue, true); ++ } ++} ++ ++#define BFQ_MQ ++#include "bfq-sched.c" ++#include "bfq-cgroup-included.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (!rq1 || rq1 == rq2) ++ return rq2; ++ if (!rq2) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. ++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++/* ++ * Async I/O can easily starve sync I/O (both sync reads and sync ++ * writes), by consuming all tags. Similarly, storms of sync writes, ++ * such as those that sync(2) may trigger, can starve sync reads. ++ * Limit depths of async I/O and sync writes so as to counter both ++ * problems. ++ */ ++static void bfq_limit_depth(unsigned int op, struct blk_mq_alloc_data *data) ++{ ++ struct bfq_data *bfqd = data->q->elevator->elevator_data; ++ ++ if (op_is_sync(op) && !op_is_write(op)) ++ return; ++ ++ data->shallow_depth = ++ bfqd->word_depths[!!bfqd->wr_busy_queues][op_is_sync(op)]; ++ ++ bfq_log(bfqd, "wr_busy %d sync %d depth %u", ++ bfqd->wr_busy_queues, op_is_sync(op), ++ data->shallow_depth); ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "%llu: returning %d", ++ (unsigned long long) sector, ++ bfqq ? bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) ++{ ++ return bfqq->service_from_backlogged > 0 && ++ time_is_before_jiffies(bfqq->first_IO_time + ++ bfq_merge_time_limit); ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ /* ++ * bfqq cannot be merged any longer (see comments in ++ * bfq_setup_cooperator): no point in adding bfqq into the ++ * position tree. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) ++ return; ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_better_to_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 4) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly ++ * the last two symmetry sub-conditions above would be quite complex ++ * and time consuming. Therefore this function evaluates, instead, ++ * only the following stronger three sub-conditions, for which it is ++ * much easier to maintain the needed state: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) there are no active groups. ++ * In particular, the last condition is always true if hierarchical ++ * support or the cgroups interface are not enabled, thus no state ++ * needs to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ /* ++ * For queue weights to differ, queue_weights_tree must contain ++ * at least two nodes. ++ */ ++ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right); ++ ++ bool multiple_classes_busy = ++ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || ++ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || ++ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); ++ ++ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", ++ varied_queue_weights, multiple_classes_busy); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfq_log(bfqd, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++#endif ++ ++ return !(varied_queue_weights || multiple_classes_busy ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ || bfqd->num_groups_with_pending_reqs > 0 ++#endif ++ ); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input queue, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. ++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if the queue is already associated with a ++ * counter, which happens if: ++ * 1) a request arrival has caused the queue to become both ++ * non-weight-raised, and hence change its weight, and ++ * backlogged; in this respect, each of the two events ++ * causes an invocation of this function, ++ * 2) this is the invocation of this function caused by the ++ * second event. This second invocation is actually useless, ++ * and we handle this fact by exiting immediately. More ++ * efficient or clearer solutions might possibly be adopted. ++ */ ++ if (bfqq->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ bfqq->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ ++ /* ++ * In the unlucky event of an allocation failure, we just ++ * exit. This will cause the weight of queue to not be ++ * considered in bfq_symmetric_scenario, which, in its turn, ++ * causes the scenario to be deemed wrongly symmetric in case ++ * bfqq's weight would have been the only weight making the ++ * scenario asymmetric. On the bright side, no unbalance will ++ * however occur when bfqq becomes inactive again (the ++ * invocation of this function is triggered by an activation ++ * of queue). In fact, bfq_weights_tree_remove does nothing ++ * if !bfqq->weight_counter. ++ */ ++ if (unlikely(!bfqq->weight_counter)) ++ return; ++ ++ bfqq->weight_counter->weight = entity->weight; ++ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); ++ rb_insert_color(&bfqq->weight_counter->weights_node, root); ++ ++inc_counter: ++ bfqq->weight_counter->num_active++; ++ bfqq->ref++; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++} ++ ++/* ++ * Decrement the weight counter associated with the queue, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void __bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (!bfqq->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(bfqq->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!bfqq->weight_counter->num_active); ++ bfqq->weight_counter->num_active--; ++ ++ if (bfqq->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&bfqq->weight_counter->weights_node, root); ++ kfree(bfqq->weight_counter); ++ ++reset_entity_pointer: ++ bfqq->weight_counter = NULL; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++ bfq_put_queue(bfqq); ++} ++ ++/* ++ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number ++ * of active groups for each queue's inactive parent entity. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = bfqq->entity.parent; ++ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->my_sched_data; ++ ++ BUG_ON(entity->sched_data == NULL); /* ++ * It would mean ++ * that this is ++ * the root group. ++ */ ++ ++ if (sd->next_in_service || sd->in_service_entity) { ++ BUG_ON(!entity->in_groups_with_pending_reqs); ++ /* ++ * entity is still active, because either ++ * next_in_service or in_service_entity is not ++ * NULL (see the comments on the definition of ++ * next_in_service for details on why ++ * in_service_entity must be checked too). ++ * ++ * As a consequence, its parent entities are ++ * active as well, and thus this loop must ++ * stop here. ++ */ ++ break; ++ } ++ ++ BUG_ON(!bfqd->num_groups_with_pending_reqs && ++ entity->in_groups_with_pending_reqs); ++ /* ++ * The decrement of num_groups_with_pending_reqs is ++ * not performed immediately upon the deactivation of ++ * entity, but it is delayed to when it also happens ++ * that the first leaf descendant bfqq of entity gets ++ * all its pending requests completed. The following ++ * instructions perform this delayed decrement, if ++ * needed. See the comments on ++ * num_groups_with_pending_reqs for details. ++ */ ++ if (entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = false; ++ bfqd->num_groups_with_pending_reqs--; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++ } ++ ++ /* ++ * Next function is invoked last, because it causes bfqq to be ++ * freed if the following holds: bfqq is not in service and ++ * has no dispatched request. DO NOT use bfqq after the next ++ * function invocation. ++ */ ++ __bfq_weights_tree_remove(bfqd, bfqq, ++ &bfqd->queue_weights_tree); ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct request *rq; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (rq == last || ktime_get_ns() < rq->fifo_time) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); ++ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); ++ return rq; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next, *prev = NULL; ++ ++ BUG_ON(list_empty(&bfqq->fifo)); ++ ++ /* Follow expired path, else get first next available. */ ++ next = bfq_check_fifo(bfqq, last); ++ if (next) { ++ BUG_ON(next == last); ++ return next; ++ } ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || ++ !bfq_symmetric_scenario(bfqq->bfqd)) ++ return blk_rq_sectors(rq); ++ ++ return blk_rq_sectors(rq) * bfq_async_charge_factor; ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (!next_rq) ++ return; ++ ++ if (bfqq == bfqd->in_service_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. ++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->in_service_entity); ++ ++ new_budget = max_t(unsigned long, ++ max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)), ++ entity->service); ++ if (entity->budget != new_budget) { ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", ++ new_budget); ++ bfq_requeue_bfqq(bfqd, bfqq, false); ++ } ++} ++ ++static unsigned int bfq_wr_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_wr_max_time > 0) ++ return bfqd->bfq_wr_max_time; ++ ++ dur = bfqd->rate_dur_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ /* ++ * Limit duration between 3 and 25 seconds. The upper limit ++ * has been conservatively set after the following worst case: ++ * on a QEMU/KVM virtual machine ++ * - running in a slow PC ++ * - with a virtual disk stacked on a slow low-end 5400rpm HDD ++ * - serving a heavy I/O workload, such as the sequential reading ++ * of several files ++ * mplayer took 23 seconds to start, if constantly weight-raised. ++ * ++ * As for higher values than that accomodating the above bad ++ * scenario, tests show that higher values would often yield ++ * the opposite of the desired result, i.e., would worsen ++ * responsiveness by allowing non-interactive applications to ++ * preserve weight raising for too long. ++ * ++ * On the other end, lower values than 3 seconds make it ++ * difficult for most interactive tasks to complete their jobs ++ * before weight-raising finishes. ++ */ ++ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); ++} ++ ++/* switch back from soft real-time to interactive weight raising */ ++static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, ++ struct bfq_data *bfqd) ++{ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; ++} ++ ++static void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, bool bfq_already_existing) ++{ ++ unsigned int old_wr_coeff; ++ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); ++ ++ if (bic->saved_has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++ ++ if (bic->saved_IO_bound) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ else ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (unlikely(busy)) ++ old_wr_coeff = bfqq->wr_coeff; ++ ++ bfqq->ttime = bic->saved_ttime; ++ bfqq->wr_coeff = bic->saved_wr_coeff; ++ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; ++ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); ++ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; ++ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", ++ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, ++ bfqq->wr_cur_max_time); ++ ++ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time))) { ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) { ++ switch_back_to_interactive_wr(bfqq, bfqd); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "switching back to interactive"); ++ } else { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "switching off wr (%lu + %lu < %lu)", ++ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, ++ jiffies); ++ } ++ } ++ ++ /* make sure weight will be updated, however we got here */ ++ bfqq->entity.prio_changed = 1; ++ ++ if (likely(!busy)) ++ return; ++ ++ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++} ++ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ lockdep_assert_held(&bfqq->bfqd->lock); ++ ++ io_refs = bfqq->allocated; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - ++ (bfqq->weight_counter != NULL); ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "large burst started"); ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) { ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. ++ * ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. ++ * ++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. ++ * ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. ++ * ++ * . when the very first queue is created, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is created while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is created a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, or finally has just been split, then there is ++ * nothing else to do. ++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq) || ++ time_is_after_eq_jiffies(bfqq->split_time + ++ msecs_to_jiffies(10))) ++ return; ++ ++ /* ++ * If bfqq's creation happens late enough, or bfqq belongs to ++ * a different group than the burst group, then the current ++ * burst is finished, and related data structures must be ++ * reset. ++ * ++ * In this respect, consider the special case where bfqq is ++ * the very first queue created after BFQ is selected for this ++ * device. In this case, last_ins_in_burst and ++ * burst_parent_entity are not yet significant when we get ++ * here. But it is easy to verify that, whether or not the ++ * following condition is true, bfqq will end up being ++ * inserted into the burst list. In particular the list will ++ * happen to contain only bfqq. And this is exactly what has ++ * to happen, as bfqq may be the first queue of the first ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval) || ++ bfqq->entity.parent != bfqd->burst_parent_entity) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "late activation or different group"); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. ++ */ ++ if (bfqd->large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++end: ++ /* ++ * At this point, bfqq either has been added to the current ++ * burst or has caused the current burst to terminate and a ++ * possible new burst to start. In particular, in the second ++ * case, bfqq has become the first queue in the possible new ++ * burst. In both cases last_ins_in_burst needs to be moved ++ * forward. ++ */ ++ bfqd->last_ins_in_burst = jiffies; ++ ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (entity->budget < entity->service) { ++ pr_crit("budget %d service %d\n", ++ entity->budget, entity->service); ++ BUG(); ++ } ++ return entity->budget - entity->service; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/* ++ * The next function, invoked after the input queue bfqq switches from ++ * idle to busy, updates the budget of bfqq. The function also tells ++ * whether the in-service queue should be expired, by returning ++ * true. The purpose of expiring the in-service queue is to give bfqq ++ * the chance to possibly preempt the in-service queue, and the reason ++ * for preempting the in-service queue is to achieve one of the two ++ * goals below. ++ * ++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has ++ * expired because it has remained idle. In particular, bfqq may have ++ * expired for one of the following two reasons: ++ * ++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and ++ * did not make it to issue a new request before its last request ++ * was served; ++ * ++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue ++ * a new request before the expiration of the idling-time. ++ * ++ * Even if bfqq has expired for one of the above reasons, the process ++ * associated with the queue may be however issuing requests greedily, ++ * and thus be sensitive to the bandwidth it receives (bfqq may have ++ * remained idle for other reasons: CPU high load, bfqq not enjoying ++ * idling, I/O throttling somewhere in the path from the process to ++ * the I/O scheduler, ...). But if, after every expiration for one of ++ * the above two reasons, bfqq has to wait for the service of at least ++ * one full budget of another queue before being served again, then ++ * bfqq is likely to get a much lower bandwidth or resource time than ++ * its reserved ones. To address this issue, two countermeasures need ++ * to be taken. ++ * ++ * First, the budget and the timestamps of bfqq need to be updated in ++ * a special way on bfqq reactivation: they need to be updated as if ++ * bfqq did not remain idle and did not expire. In fact, if they are ++ * computed as if bfqq expired and remained idle until reactivation, ++ * then the process associated with bfqq is treated as if, instead of ++ * being greedy, it stopped issuing requests when bfqq remained idle, ++ * and restarts issuing requests only on this reactivation. In other ++ * words, the scheduler does not help the process recover the "service ++ * hole" between bfqq expiration and reactivation. As a consequence, ++ * the process receives a lower bandwidth than its reserved one. In ++ * contrast, to recover this hole, the budget must be updated as if ++ * bfqq was not expired at all before this reactivation, i.e., it must ++ * be set to the value of the remaining budget when bfqq was ++ * expired. Along the same line, timestamps need to be assigned the ++ * value they had the last time bfqq was selected for service, i.e., ++ * before last expiration. Thus timestamps need to be back-shifted ++ * with respect to their normal computation (see [1] for more details ++ * on this tricky aspect). ++ * ++ * Secondly, to allow the process to recover the hole, the in-service ++ * queue must be expired too, to give bfqq the chance to preempt it ++ * immediately. In fact, if bfqq has to wait for a full budget of the ++ * in-service queue to be completed, then it may become impossible to ++ * let the process recover the hole, even if the back-shifted ++ * timestamps of bfqq are lower than those of the in-service queue. If ++ * this happens for most or all of the holes, then the process may not ++ * receive its reserved bandwidth. In this respect, it is worth noting ++ * that, being the service of outstanding requests unpreemptible, a ++ * little fraction of the holes may however be unrecoverable, thereby ++ * causing a little loss of bandwidth. ++ * ++ * The last important point is detecting whether bfqq does need this ++ * bandwidth recovery. In this respect, the next function deems the ++ * process associated with bfqq greedy, and thus allows it to recover ++ * the hole, if: 1) the process is waiting for the arrival of a new ++ * request (which implies that bfqq expired for one of the above two ++ * reasons), and 2) such a request has arrived soon. The first ++ * condition is controlled through the flag non_blocking_wait_rq, ++ * while the second through the flag arrived_in_time. If both ++ * conditions hold, then the function computes the budget in the ++ * above-described special way, and signals that the in-service queue ++ * should be expired. Timestamp back-shifting is done later in ++ * __bfq_activate_entity. ++ * ++ * 2. Reduce latency. Even if timestamps are not backshifted to let ++ * the process associated with bfqq recover a service hole, bfqq may ++ * however happen to have, after being (re)activated, a lower finish ++ * timestamp than the in-service queue. That is, the next budget of ++ * bfqq may have to be completed before the one of the in-service ++ * queue. If this is the case, then preempting the in-service queue ++ * allows this goal to be achieved, apart from the unpreemptible, ++ * outstanding requests mentioned above. ++ * ++ * Unfortunately, regardless of which of the above two goals one wants ++ * to achieve, service trees need first to be updated to know whether ++ * the in-service queue must be preempted. To have service trees ++ * correctly updated, the in-service queue must be expired and ++ * rescheduled, and bfqq must be scheduled too. This is one of the ++ * most costly operations (in future versions, the scheduling ++ * mechanism may be re-designed in such a way to make it possible to ++ * know whether preemption is needed without needing to update service ++ * trees). In addition, queue preemptions almost always cause random ++ * I/O, and thus loss of throughput. Because of these facts, the next ++ * function adopts the following simple scheme to avoid both costly ++ * operations and too frequent preemptions: it requests the expiration ++ * of the in-service queue (unconditionally) only for queues that need ++ * to recover a hole, or that either are weight-raised or deserve to ++ * be weight-raised. ++ */ ++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool arrived_in_time, ++ bool wr_or_deserves_wr) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ /* ++ * In the next compound condition, we check also whether there ++ * is some budget left, because otherwise there is no point in ++ * trying to go on serving bfqq with this same budget: bfqq ++ * would be expired immediately after being selected for ++ * service. This would only cause useless overhead. ++ */ ++ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && ++ bfq_bfqq_budget_left(bfqq) > 0) { ++ /* ++ * We do not clear the flag non_blocking_wait_rq here, as ++ * the latter is used in bfq_activate_bfqq to signal ++ * that timestamps need to be back-shifted (and is ++ * cleared right after). ++ */ ++ ++ /* ++ * In next assignment we rely on that either ++ * entity->service or entity->budget are not updated ++ * on expiration if bfqq is empty (see ++ * __bfq_bfqq_recalc_budget). Thus both quantities ++ * remain unchanged after such an expiration, and the ++ * following statement therefore assigns to ++ * entity->budget the remaining budget on such an ++ * expiration. ++ */ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = min_t(unsigned long, ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->max_budget); ++ ++ BUG_ON(entity->budget < 0); ++ ++ /* ++ * At this point, we have used entity->service to get ++ * the budget left (needed for updating ++ * entity->budget). Thus we finally can, and have to, ++ * reset entity->service. The latter must be reset ++ * because bfqq would otherwise be charged again for ++ * the service it has received during its previous ++ * service slot(s). ++ */ ++ entity->service = 0; ++ ++ return true; ++ } ++ ++ /* ++ * We can finally complete expiration, by setting service to 0. ++ */ ++ entity->service = 0; ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(bfqq->next_rq, bfqq)); ++ BUG_ON(entity->budget < 0); ++ ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++ return wr_or_deserves_wr; ++} ++ ++/* ++ * Return the farthest past time instant according to jiffies ++ * macros. ++ */ ++static unsigned long bfq_smallest_from_now(void) ++{ ++ return jiffies - MAX_JIFFY_OFFSET; ++} ++ ++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ unsigned int old_wr_coeff, ++ bool wr_or_deserves_wr, ++ bool interactive, ++ bool in_burst, ++ bool soft_rt) ++{ ++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { ++ /* start a weight-raising period */ ++ if (interactive) { ++ bfqq->service_from_wr = 0; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else { ++ /* ++ * No interactive weight raising in progress ++ * here: assign minus infinity to ++ * wr_start_at_switch_to_srt, to make sure ++ * that, at the end of the soft-real-time ++ * weight raising periods that is starting ++ * now, no interactive weight-raising period ++ * may be wrongly considered as still in ++ * progress (and thus actually started by ++ * mistake). ++ */ ++ bfqq->wr_start_at_switch_to_srt = ++ bfq_smallest_from_now(); ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. ++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) { /* update wr coeff and duration */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (soft_rt) { ++ /* ++ * The application is now or still meeting the ++ * requirements for being deemed soft rt. We ++ * can then correctly and safely (re)charge ++ * the weight-raising duration for the ++ * application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. ++ */ ++ if (bfqq->wr_cur_max_time != ++ bfqd->bfq_wr_rt_max_time) { ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq->last_wr_start_finish; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr"); ++ } else ++ bfq_log_bfqq(bfqd, bfqq, ++ "moving forward soft_rt wr duration"); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. ++ */ ++ arrived_in_time = ktime_get_ns() <= ++ bfqq->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start) && ++ bfqq->dispatched == 0; ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. ++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ } ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ BUG_ON(!RQ_BFQQ(rq)); ++ BUG_ON(RQ_BFQQ(rq) != bfqq); ++ WARN_ON(blk_rq_sectors(rq) == 0); ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-to-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ BUG_ON(!RQ_BFQQ(next_rq)); ++ BUG_ON(RQ_BFQQ(next_rq) != bfqq); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. ++ */ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio, ++ struct request_queue *q) ++{ ++ struct bfq_queue *bfqq = bfqd->bio_bfqq; ++ ++ BUG_ON(!bfqd->bio_bfqq_set); ++ ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static sector_t get_sdist(sector_t last_pos, struct request *rq) ++{ ++ sector_t sdist = 0; ++ ++ if (last_pos) { ++ if (last_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - last_pos; ++ else ++ sdist = last_pos - blk_rq_pos(rq); ++ } ++ ++ return sdist; ++} ++ ++#if 0 /* Still not clear if we can do without next two functions */ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bfqd->rq_in_driver++; ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++#endif ++ ++static void bfq_remove_request(struct request_queue *q, ++ struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ BUG_ON(bfqq->entity.service > bfqq->entity.budget); ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ if (bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)) { ++ pr_crit("no bfqq! for next rq %p bfqq %p\n", ++ bfqq->next_rq, bfqq); ++ } ++ ++ BUG_ON(bfqq->next_rq && !RQ_BFQQ(bfqq->next_rq)); ++ if (bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq) { ++ pr_crit( ++ "wrong bfqq! for next rq %p, rq_bfqq %p bfqq %p\n", ++ bfqq->next_rq, RQ_BFQQ(bfqq->next_rq), bfqq); ++ } ++ BUG_ON(bfqq->next_rq && RQ_BFQQ(bfqq->next_rq) != bfqq); ++ ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ elv_rqhash_del(q, rq); ++ if (q->last_merge == rq) ++ q->last_merge = NULL; ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfqq->next_rq = NULL; ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { ++ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ ++ bfq_del_bfqq_busy(bfqd, bfqq, false); ++ /* ++ * bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget, if bfqq has still a ++ * process that may issue I/O requests to it. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } else { ++ BUG_ON(!bfqq->next_rq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++} ++ ++static bool bfq_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio) ++{ ++ struct request_queue *q = hctx->queue; ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *free = NULL; ++ /* ++ * bfq_bic_lookup grabs the queue_lock: invoke it now and ++ * store its return value for later use, to avoid nesting ++ * queue_lock inside the bfqd->lock. We assume that the bic ++ * returned by bfq_bic_lookup does not go away before ++ * bfqd->lock is taken. ++ */ ++ struct bfq_io_cq *bic = bfq_bic_lookup(bfqd, current->io_context, q); ++ bool ret; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ if (bic) ++ bfqd->bio_bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ else ++ bfqd->bio_bfqq = NULL; ++ bfqd->bio_bic = bic; ++ /* Set next flag just for testing purposes */ ++ bfqd->bio_bfqq_set = true; ++ ++ ret = blk_mq_sched_try_merge(q, bio, &free); ++ ++ /* ++ * XXX Not yet freeing without lock held, to avoid an ++ * inconsistency with respect to the lock-protected invocation ++ * of blk_mq_sched_try_insert_merge in bfq_bio_merge. Waiting ++ * for clarifications from Jens. ++ */ ++ if (free) ++ blk_mq_free_request(free); ++ bfqd->bio_bfqq_set = false; ++ spin_unlock_irq(&bfqd->lock); ++ ++ return ret; ++} ++ ++static int bfq_request_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio, q); ++ if (__rq && elv_bio_merge_ok(__rq, bio)) { ++ *req = __rq; ++ bfq_log(bfqd, "req %p", __rq); ++ ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static struct bfq_queue *bfq_init_rq(struct request *rq); ++ ++static void bfq_request_merged(struct request_queue *q, struct request *req, ++ enum elv_merge type) ++{ ++ BUG_ON(req->rq_flags & RQF_DISP_LIST); ++ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = bfq_init_rq(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ BUG_ON(!RQ_BFQQ(req)); ++ BUG_ON(RQ_BFQQ(req) != bfqq); ++ elv_rb_add(&bfqq->sort_list, req); ++ ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ ++ bfqq->next_rq = next_rq; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "req %p prev %p next_rq %p bfqq %p", ++ req, prev, next_rq, bfqq); ++ ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. ++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ } ++} ++ ++/* ++ * This function is called to notify the scheduler that the requests ++ * rq and 'next' have been merged, with 'next' going away. BFQ ++ * exploits this hook to address the following issue: if 'next' has a ++ * fifo_time lower that rq, then the fifo_time of rq must be set to ++ * the value of 'next', to not forget the greater age of 'next'. ++ * ++ * NOTE: in this function we assume that rq is in a bfq_queue, basing ++ * on that rq is picked from the hash table q->elevator->hash, which, ++ * in its turn, is filled only with I/O requests present in ++ * bfq_queues, while BFQ is in use for the request queue q. In fact, ++ * the function that fills this hash table (elv_rqhash_add) is called ++ * only by bfq_insert_request. ++ */ ++static void bfq_requests_merged(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = bfq_init_rq(rq), ++ *next_bfqq = bfq_init_rq(next); ++ ++ BUG_ON(!RQ_BFQQ(rq)); ++ BUG_ON(!RQ_BFQQ(next)); /* this does not imply next is in a bfqq */ ++ BUG_ON(rq->rq_flags & RQF_DISP_LIST); ++ BUG_ON(next->rq_flags & RQF_DISP_LIST); ++ ++ lockdep_assert_held(&bfqq->bfqd->lock); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "rq %p next %p bfqq %p next_bfqq %p", ++ rq, next, bfqq, next_bfqq); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ next->fifo_time < rq->fifo_time) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqq->bfqd->wr_busy_queues--; ++ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); ++ } ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ bfqq->last_wr_start_finish = jiffies; ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio. ++ */ ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(&bfqd->lock); ++} ++ ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_CLOSE_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because ++ * we are in the context of the process owning bfqq, thus we ++ * have the io_cq of this process. So we can immediately ++ * configure this io_cq to redirect the requests of the ++ * process to new_bfqq. In contrast, the io_cq of new_bfqq is ++ * not available any more (new_bfqq->bic == NULL). ++ * ++ * Anyway, even in case new_bfqq coincides with the in-service ++ * queue, redirecting requests the in-service queue is the ++ * best option, as we feed the in-service queue with new ++ * requests close to the last request served and, by doing so, ++ * are likely to increase the throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ new_bfqq->ref += process_refs; ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_too_late_for_merging(new_bfqq)) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "too late for bfq%d to be merged", ++ new_bfqq->pid); ++ return false; ++ } ++ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. ++ * ++ * The OOM queue is not allowed to participate to cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in such a critical condition as an out of memory, ++ * the benefits of queue merging may be little relevant, or even negligible. ++ * ++ * WARNING: queue merging may impair fairness among non-weight raised ++ * queues, for at least two reasons: 1) the original weight of a ++ * merged queue may change during the merged state, 2) even being the ++ * weight the same, a merged queue may be bloated with many more ++ * requests than the ones produced by its originally-associated ++ * process. ++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *in_service_bfqq, *new_bfqq; ++ ++ /* ++ * Prevent bfqq from being merged if it has been created too ++ * long ago. The idea is that true cooperating processes, and ++ * thus their associated bfq_queues, are supposed to be ++ * created shortly after each other. This is the case, e.g., ++ * for KVM/QEMU and dump I/O threads. Basing on this ++ * assumption, the following filtering greatly reduces the ++ * probability that two non-cooperating processes, which just ++ * happen to do close I/O for some short time interval, have ++ * their queues merged by mistake. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but too late"); ++ return NULL; ++ } ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) ++ return NULL; ++ ++ /* If there is only one backlogged queue, don't search. */ ++ if (bfq_tot_busy_queues(bfqd) == 1) ++ return NULL; ++ ++ in_service_bfqq = bfqd->in_service_queue; ++ ++ if (in_service_bfqq && in_service_bfqq != bfqq && ++ likely(in_service_bfqq != &bfqd->oom_bfqq) && ++ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && ++ bfqq->entity.parent == in_service_bfqq->entity.parent && ++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq) ++ return new_bfqq; ++ } ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bic) ++ return; ++ ++ bic->saved_ttime = bfqq->ttime; ++ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); ++ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ if (unlikely(bfq_bfqq_just_created(bfqq) && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bfqq->bfqd->low_latency)) { ++ /* ++ * bfqq being merged ritgh after being created: bfqq ++ * would have deserved interactive weight raising, but ++ * did not make it to be set in a weight-raised state, ++ * because of this early merge. Store directly the ++ * weight-raising state that would have been assigned ++ * to bfqq, so that to avoid that bfqq unjustly fails ++ * to enjoy weight raising if split soon. ++ */ ++ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); ++ bic->saved_last_wr_start_finish = jiffies; ++ } else { ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ } ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", ++ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, ++ bfqq->wr_cur_max_time); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (unsigned long) new_bfqq->pid); ++ BUG_ON(bfqq->bic && bfqq->bic == new_bfqq->bic); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). Handling this case would however be very ++ * easy, thanks to the flag just_created. ++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ new_bfqq->wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ if (bfq_bfqq_busy(new_bfqq)) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > ++ bfq_tot_busy_queues(bfqd)); ++ } ++ ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr start after merge with %d, rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", ++ bfqd->wr_busy_queues); ++ ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ /* release process reference to bfqq */ ++ bfq_put_queue(bfqq); ++} ++ ++static bool bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bool is_sync = op_is_sync(bio->bi_opf); ++ struct bfq_queue *bfqq = bfqd->bio_bfqq, *new_bfqq; ++ ++ assert_spin_locked(&bfqd->lock); ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (is_sync && !rq_is_sync(rq)) ++ return false; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ */ ++ BUG_ON(!bfqd->bio_bfqq_set); ++ if (!bfqq) ++ return false; ++ ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ BUG_ON(new_bfqq == bfqq); ++ if (new_bfqq) { ++ /* ++ * bic still points to bfqq, then it has not yet been ++ * redirected to some other bfq_queue, and a queue ++ * merge beween bfqq and new_bfqq can be safely ++ * fulfillled, i.e., bic can be redirected to new_bfqq ++ * and bfqq can be put. ++ */ ++ bfq_merge_bfqqs(bfqd, bfqd->bio_bic, bfqq, ++ new_bfqq); ++ /* ++ * If we get here, bio will be queued into new_queue, ++ * so use new_bfqq to decide whether bio and rq can be ++ * merged. ++ */ ++ bfqq = new_bfqq; ++ ++ /* ++ * Change also bqfd->bio_bfqq, as ++ * bfqd->bio_bic now points to new_bfqq, and ++ * this function may be invoked again (and then may ++ * use again bqfd->bio_bfqq). ++ */ ++ bfqd->bio_bfqq = bfqq; ++ } ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && ++ bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy. ++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ if (time_after(bfqq->budget_timeout, ++ bfqq->last_wr_start_finish)) ++ bfqq->last_wr_start_finish += ++ jiffies - bfqq->budget_timeout; ++ else ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { ++ pr_crit( ++ "BFQ WARNING:last %lu budget %lu jiffies %lu", ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout, ++ jiffies); ++ pr_crit("diff %lu", jiffies - ++ max_t(unsigned long, ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout)); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "cur-budget = %d prio_class %d", ++ bfqq->entity.budget, bfqq->ioprio_class); ++ } else ++ bfq_log(bfqd, "NULL"); ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. ++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ u32 sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ /* ++ * Unless the queue is being weight-raised or the scenario is ++ * asymmetric, grant only minimum idle time if the queue ++ * is seeky. A long idling is preserved for a weight-raised ++ * queue, or, more in general, in an asymemtric scenario, ++ * because a long idling is needed for guaranteeing to a queue ++ * its reserved share of the throughput (in particular, it is ++ * needed if the queue has a higher weight than some other ++ * queue). ++ */ ++ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ bfq_symmetric_scenario(bfqd)) ++ sl = min_t(u32, sl, BFQ_MIN_TT); ++ ++ bfqd->last_idling_start = ktime_get(); ++ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), ++ HRTIMER_MODE_REL); ++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); ++ bfq_log(bfqd, "arm idle: %ld/%ld ms", ++ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); ++} ++ ++/* ++ * In autotuning mode, max_budget is dynamically recomputed as the ++ * amount of sectors transferred in timeout at the estimated peak ++ * rate. This enables BFQ to utilize a full timeslice with a full ++ * budget, even if the in-service queue is served at peak rate. And ++ * this maximises throughput with sequential workloads. ++ */ ++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) ++{ ++ return (u64)bfqd->peak_rate * USEC_PER_MSEC * ++ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; ++} ++ ++/* ++ * Update parameters related to throughput and responsiveness, as a ++ * function of the estimated peak rate. See comments on ++ * bfq_calc_max_budget(), and on the ref_wr_duration array. ++ */ ++static void update_thr_responsiveness_params(struct bfq_data *bfqd) ++{ ++ if (bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd); ++ BUG_ON(bfqd->bfq_max_budget < 0); ++ bfq_log(bfqd, "new max_budget = %d", ++ bfqd->bfq_max_budget); ++ } ++} ++ ++static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) ++{ ++ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ ++ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; ++ bfqd->peak_rate_samples = 1; ++ bfqd->sequential_samples = 0; ++ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = ++ blk_rq_sectors(rq); ++ } else /* no new rq dispatched, just reset the number of samples */ ++ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ ++ ++ bfq_log(bfqd, ++ "at end, sample %u/%u tot_sects %llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched); ++} ++ ++static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) ++{ ++ u32 rate, weight, divisor; ++ ++ /* ++ * For the convergence property to hold (see comments on ++ * bfq_update_peak_rate()) and for the assessment to be ++ * reliable, a minimum number of samples must be present, and ++ * a minimum amount of time must have elapsed. If not so, do ++ * not compute new rate. Just reset parameters, to get ready ++ * for a new evaluation attempt. ++ */ ++ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || ++ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { ++ bfq_log(bfqd, ++ "only resetting, delta_first %lluus samples %d", ++ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); ++ goto reset_computation; ++ } ++ ++ /* ++ * If a new request completion has occurred after last ++ * dispatch, then, to approximate the rate at which requests ++ * have been served by the device, it is more precise to ++ * extend the observation interval to the last completion. ++ */ ++ bfqd->delta_from_first = ++ max_t(u64, bfqd->delta_from_first, ++ bfqd->last_completion - bfqd->first_dispatch); ++ ++ BUG_ON(bfqd->delta_from_first == 0); ++ /* ++ * Rate computed in sects/usec, and not sects/nsec, for ++ * precision issues. ++ */ ++ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, ++ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); ++ ++ bfq_log(bfqd, ++"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", ++ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ rate > 20<<BFQ_RATE_SHIFT); ++ ++ /* ++ * Peak rate not updated if: ++ * - the percentage of sequential dispatches is below 3/4 of the ++ * total, and rate is below the current estimated peak rate ++ * - rate is unreasonably high (> 20M sectors/sec) ++ */ ++ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && ++ rate <= bfqd->peak_rate) || ++ rate > 20<<BFQ_RATE_SHIFT) { ++ bfq_log(bfqd, ++ "goto reset, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ goto reset_computation; ++ } else { ++ bfq_log(bfqd, ++ "do update, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ } ++ ++ /* ++ * We have to update the peak rate, at last! To this purpose, ++ * we use a low-pass filter. We compute the smoothing constant ++ * of the filter as a function of the 'weight' of the new ++ * measured rate. ++ * ++ * As can be seen in next formulas, we define this weight as a ++ * quantity proportional to how sequential the workload is, ++ * and to how long the observation time interval is. ++ * ++ * The weight runs from 0 to 8. The maximum value of the ++ * weight, 8, yields the minimum value for the smoothing ++ * constant. At this minimum value for the smoothing constant, ++ * the measured rate contributes for half of the next value of ++ * the estimated peak rate. ++ * ++ * So, the first step is to compute the weight as a function ++ * of how sequential the workload is. Note that the weight ++ * cannot reach 9, because bfqd->sequential_samples cannot ++ * become equal to bfqd->peak_rate_samples, which, in its ++ * turn, holds true because bfqd->sequential_samples is not ++ * incremented for the first sample. ++ */ ++ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; ++ ++ /* ++ * Second step: further refine the weight as a function of the ++ * duration of the observation interval. ++ */ ++ weight = min_t(u32, 8, ++ div_u64(weight * bfqd->delta_from_first, ++ BFQ_RATE_REF_INTERVAL)); ++ ++ /* ++ * Divisor ranging from 10, for minimum weight, to 2, for ++ * maximum weight. ++ */ ++ divisor = 10 - weight; ++ BUG_ON(divisor == 0); ++ ++ /* ++ * Finally, update peak rate: ++ * ++ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor ++ */ ++ bfqd->peak_rate *= divisor-1; ++ bfqd->peak_rate /= divisor; ++ rate /= divisor; /* smoothing constant alpha = 1/divisor */ ++ ++ bfq_log(bfqd, ++ "divisor %d tmp_peak_rate %llu tmp_rate %u", ++ divisor, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), ++ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); ++ ++ BUG_ON(bfqd->peak_rate == 0); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++ bfqd->peak_rate += rate; ++ ++ /* ++ * For a very slow device, bfqd->peak_rate can reach 0 (see ++ * the minimum representable values reported in the comments ++ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid ++ * divisions by zero where bfqd->peak_rate is used as a ++ * divisor. ++ */ ++ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); ++ ++ update_thr_responsiveness_params(bfqd); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++reset_computation: ++ bfq_reset_rate_computation(bfqd, rq); ++} ++ ++/* ++ * Update the read/write peak rate (the main quantity used for ++ * auto-tuning, see update_thr_responsiveness_params()). ++ * ++ * It is not trivial to estimate the peak rate (correctly): because of ++ * the presence of sw and hw queues between the scheduler and the ++ * device components that finally serve I/O requests, it is hard to ++ * say exactly when a given dispatched request is served inside the ++ * device, and for how long. As a consequence, it is hard to know ++ * precisely at what rate a given set of requests is actually served ++ * by the device. ++ * ++ * On the opposite end, the dispatch time of any request is trivially ++ * available, and, from this piece of information, the "dispatch rate" ++ * of requests can be immediately computed. So, the idea in the next ++ * function is to use what is known, namely request dispatch times ++ * (plus, when useful, request completion times), to estimate what is ++ * unknown, namely in-device request service rate. ++ * ++ * The main issue is that, because of the above facts, the rate at ++ * which a certain set of requests is dispatched over a certain time ++ * interval can vary greatly with respect to the rate at which the ++ * same requests are then served. But, since the size of any ++ * intermediate queue is limited, and the service scheme is lossless ++ * (no request is silently dropped), the following obvious convergence ++ * property holds: the number of requests dispatched MUST become ++ * closer and closer to the number of requests completed as the ++ * observation interval grows. This is the key property used in ++ * the next function to estimate the peak service rate as a function ++ * of the observed dispatch rate. The function assumes to be invoked ++ * on every request dispatch. ++ */ ++static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) ++{ ++ u64 now_ns = ktime_get_ns(); ++ ++ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ ++ bfq_log(bfqd, ++ "goto reset, samples %d", ++ bfqd->peak_rate_samples) ; ++ bfq_reset_rate_computation(bfqd, rq); ++ goto update_last_values; /* will add one sample */ ++ } ++ ++ /* ++ * Device idle for very long: the observation interval lasting ++ * up to this dispatch cannot be a valid observation interval ++ * for computing a new peak rate (similarly to the late- ++ * completion event in bfq_completed_request()). Go to ++ * update_rate_and_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - start a new observation interval with this dispatch ++ */ ++ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && ++ bfqd->rq_in_driver == 0) { ++ bfq_log(bfqd, ++"jumping to updating&resetting delta_last %lluus samples %d", ++ (now_ns - bfqd->last_dispatch)>>10, ++ bfqd->peak_rate_samples) ; ++ goto update_rate_and_reset; ++ } ++ ++ /* Update sampling information */ ++ bfqd->peak_rate_samples++; ++ ++ if ((bfqd->rq_in_driver > 0 || ++ now_ns - bfqd->last_completion < BFQ_MIN_TT) ++ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) ++ bfqd->sequential_samples++; ++ ++ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); ++ ++ /* Reset max observed rq size every 32 dispatches */ ++ if (likely(bfqd->peak_rate_samples % 32)) ++ bfqd->last_rq_max_size = ++ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); ++ else ++ bfqd->last_rq_max_size = blk_rq_sectors(rq); ++ ++ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; ++ ++ bfq_log(bfqd, ++ "added samples %u/%u tot_sects %llu delta_first %lluus", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched, ++ bfqd->delta_from_first>>10); ++ ++ /* Target observation interval not yet reached, go on sampling */ ++ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) ++ goto update_last_values; ++ ++update_rate_and_reset: ++ bfq_update_rate_reset(bfqd, rq); ++update_last_values: ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ if (RQ_BFQQ(rq) == bfqd->in_service_queue) ++ bfqd->in_serv_last_pos = bfqd->last_position; ++ bfqd->last_dispatch = now_ns; ++ ++ bfq_log(bfqd, ++ "delta_first %lluus last_pos %llu peak_rate %llu", ++ (now_ns - bfqd->first_dispatch)>>10, ++ (unsigned long long) bfqd->last_position, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ bfq_log(bfqd, ++ "samples at end %d", bfqd->peak_rate_samples); ++} ++ ++/* ++ * Remove request from internal lists. ++ */ ++static void bfq_dispatch_remove(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * For consistency, the next instruction should have been ++ * executed after removing the request from the queue and ++ * dispatching it. We execute instead this instruction before ++ * bfq_remove_request() (and hence introduce a temporary ++ * inconsistency), for efficiency. In fact, should this ++ * dispatch occur for a non in-service bfqq, this anticipated ++ * increment prevents two counters related to bfqq->dispatched ++ * from risking to be, first, uselessly decremented, and then ++ * incremented again when the (new) value of bfqq->dispatched ++ * happens to be taken into account. ++ */ ++ bfqq->dispatched++; ++ bfq_update_peak_rate(q->elevator->elevator_data, rq); ++ ++ bfq_remove_request(q, rq); ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfqq->dispatched == 0) ++ /* ++ * Overloading budget_timeout field to store ++ * the time at which the queue remains with no ++ * backlog and no outstanding request; used by ++ * the weight-raising mechanism. ++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_del_bfqq_busy(bfqd, bfqq, true); ++ } else { ++ bfq_requeue_bfqq(bfqd, bfqq, true); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ /* ++ * All in-service entities must have been properly deactivated ++ * or requeued before executing the next function, which ++ * resets all in-service entites as no more in service. ++ */ ++ __bfq_bfqd_reset_in_service(bfqd); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget at queue expiration. ++ * See the body for detailed comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ int budget, min_budget; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ min_budget = bfq_min_budget(bfqd); ++ ++ if (bfqq->wr_coeff == 1) ++ budget = bfqq->max_budget; ++ else /* ++ * Use a constant, low budget for weight-raised queues, ++ * to help achieve a low latency. Keep it slightly higher ++ * than the minimum possible budget, to cause a little ++ * bit fewer expirations. ++ */ ++ budget = 2 * min_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); ++ ++ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no request of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still outstanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. ++ */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because it gives ++ * the chance to boost the throughput if this ++ * is not a seeky process (and has bumped into ++ * this timeout because of, e.g., ZBR). ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * For queues that expire for this reason, it ++ * is particularly important to keep the ++ * budget close to the actual service they ++ * need. Doing so reduces the timestamp ++ * misalignment problem described in the ++ * comments in the body of ++ * __bfq_activate_entity. In fact, suppose ++ * that a queue systematically expires for ++ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a ++ * new request in time to enjoy timestamp ++ * back-shifting. The larger the budget of the ++ * queue is with respect to the service the ++ * queue actually requests in each service ++ * slot, the more times the queue can be ++ * reactivated with the same virtual finish ++ * time. It follows that, even if this finish ++ * time is pushed to the system virtual time ++ * to reduce the consequent timestamp ++ * misalignment, the queue unjustly enjoys for ++ * many re-activations a lower finish time ++ * than all newly activated queues. ++ * ++ * The service needed by bfqq is measured ++ * quite precisely by bfqq->entity.service. ++ * Since bfqq does not enjoy device idling, ++ * bfqq->entity.service is equal to the number ++ * of sectors that the process associated with ++ * bfqq requested to read/write before waiting ++ * for request completions, or blocking for ++ * other reasons. ++ */ ++ budget = max_t(int, bfqq->entity.service, min_budget); ++ break; ++ default: ++ return; ++ } ++ } else if (!bfq_bfqq_sync(bfqq)) ++ /* ++ * Async queues get always the maximum possible ++ * budget, as for them we do not care about latency ++ * (in addition, their ability to dispatch is limited ++ * by the charging factor). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && ++ !bfqd->bfq_user_max_budget) ++ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); ++ ++ /* ++ * If there is still backlog, then assign a new budget, making ++ * sure that it is large enough for the next request. Since ++ * the finish time of bfqq must be kept in sync with the ++ * budget, be sure to call __bfq_bfqq_expire() *after* this ++ * update. ++ * ++ * If there is no backlog, then no need to update the budget; ++ * it will be updated on the arrival of a new request. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq) { ++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || ++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", ++ next_rq ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++/* ++ * Return true if the process associated with bfqq is "slow". The slow ++ * flag is used, in addition to the budget timeout, to reduce the ++ * amount of service provided to seeky processes, and thus reduce ++ * their chances to lower the throughput. More details in the comments ++ * on the function bfq_bfqq_expire(). ++ * ++ * An important observation is in order: as discussed in the comments ++ * on the function bfq_update_peak_rate(), with devices with internal ++ * queues, it is hard if ever possible to know when and for how long ++ * an I/O request is processed by the device (apart from the trivial ++ * I/O pattern where a new request is dispatched only after the ++ * previous one has been completed). This makes it hard to evaluate ++ * the real rate at which the I/O requests of each bfq_queue are ++ * served. In fact, for an I/O scheduler like BFQ, serving a ++ * bfq_queue means just dispatching its requests during its service ++ * slot (i.e., until the budget of the queue is exhausted, or the ++ * queue remains idle, or, finally, a timeout fires). But, during the ++ * service slot of a bfq_queue, around 100 ms at most, the device may ++ * be even still processing requests of bfq_queues served in previous ++ * service slots. On the opposite end, the requests of the in-service ++ * bfq_queue may be completed after the service slot of the queue ++ * finishes. ++ * ++ * Anyway, unless more sophisticated solutions are used ++ * (where possible), the sum of the sizes of the requests dispatched ++ * during the service slot of a bfq_queue is probably the only ++ * approximation available for the service received by the bfq_queue ++ * during its service slot. And this sum is the quantity used in this ++ * function to evaluate the I/O speed of a process. ++ */ ++static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool compensate, enum bfqq_expiration reason, ++ unsigned long *delta_ms) ++{ ++ ktime_t delta_ktime; ++ u32 delta_usecs; ++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ return false; ++ ++ if (compensate) ++ delta_ktime = bfqd->last_idling_start; ++ else ++ delta_ktime = ktime_get(); ++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); ++ delta_usecs = ktime_to_us(delta_ktime); ++ ++ /* don't use too short time intervals */ ++ if (delta_usecs < 1000) { ++ if (blk_queue_nonrot(bfqd->queue)) ++ /* ++ * give same worst-case guarantees as idling ++ * for seeky ++ */ ++ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; ++ else /* charge at least one seek */ ++ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; ++ ++ bfq_log(bfqd, "too short %u", delta_usecs); ++ ++ return slow; ++ } ++ ++ *delta_ms = delta_usecs / USEC_PER_MSEC; ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out excessive ++ * spikes in service rate estimation. ++ */ ++ if (delta_usecs > 20000) { ++ /* ++ * Caveat for rotational devices: processes doing I/O ++ * in the slower disk zones tend to be slow(er) even ++ * if not seeky. In this respect, the estimated peak ++ * rate is likely to be an average over the disk ++ * surface. Accordingly, to not be too harsh with ++ * unlucky processes, a process is deemed slow only if ++ * its rate has been lower than half of the estimated ++ * peak rate. ++ */ ++ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; ++ bfq_log(bfqd, "relative rate %d/%d", ++ bfqq->entity.service, bfqd->bfq_max_budget); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); ++ ++ return slow; ++} ++ ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy (i.e., I/O-bound) application may ++ * happen to meet, occasionally or systematically, both the above ++ * bandwidth and isochrony requirements. This may happen at least in ++ * the following circumstances. First, if the CPU load is high. The ++ * application may stop issuing requests while the CPUs are busy ++ * serving other processes, then restart, then stop again for a while, ++ * and so on. The other circumstances are related to the storage ++ * device: the storage device is highly loaded or reaches a low-enough ++ * throughput with the I/O of the application (e.g., because the I/O ++ * is random and/or the device is slow). In all these cases, the ++ * I/O of the application may be simply slowed down enough to meet ++ * the bandwidth and isochrony requirements. To reduce the probability ++ * that greedy applications are deemed as soft real-time in these ++ * corner cases, a further rule is used in the computation of ++ * soft_rt_next_start: the return value of this function is forced to ++ * be higher than the maximum between the following two quantities. ++ * ++ * (a) Current time plus: (1) the maximum time for which the arrival ++ * of a request is waited for when a sync queue becomes idle, ++ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We ++ * postpone for a moment the reason for adding a few extra ++ * jiffies; we get back to it after next item (b). Lower-bounding ++ * the return value of this function with the current time plus ++ * bfqd->bfq_slice_idle tends to filter out greedy applications, ++ * because the latter issue their next request as soon as possible ++ * after the last one has been completed. In contrast, a soft ++ * real-time application spends some time processing data, after a ++ * batch of its requests has been completed. ++ * ++ * (b) Current value of bfqq->soft_rt_next_start. As pointed out ++ * above, greedy applications may happen to meet both the ++ * bandwidth and isochrony requirements under heavy CPU or ++ * storage-device load. In more detail, in these scenarios, these ++ * applications happen, only for limited time periods, to do I/O ++ * slowly enough to meet all the requirements described so far, ++ * including the filtering in above item (a). These slow-speed ++ * time intervals are usually interspersed between other time ++ * intervals during which these applications do I/O at a very high ++ * speed. Fortunately, exactly because of the high speed of the ++ * I/O in the high-speed intervals, the values returned by this ++ * function happen to be so high, near the end of any such ++ * high-speed interval, to be likely to fall *after* the end of ++ * the low-speed time interval that follows. These high values are ++ * stored in bfqq->soft_rt_next_start after each invocation of ++ * this function. As a consequence, if the last value of ++ * bfqq->soft_rt_next_start is constantly used to lower-bound the ++ * next value that this function may return, then, from the very ++ * beginning of a low-speed interval, bfqq->soft_rt_next_start is ++ * likely to be constantly kept so high that any I/O request ++ * issued during the low-speed interval is considered as arriving ++ * to soon for the application to be deemed as soft ++ * real-time. Then, in the high-speed interval that follows, the ++ * application will not be deemed as soft real-time, just because ++ * it will do I/O at a high speed. And so on. ++ * ++ * Getting back to the filtering in item (a), in the following two ++ * cases this filtering might be easily passed by a greedy ++ * application, if the reference quantity was just ++ * bfqd->bfq_slice_idle: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or ++ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow ++ * devices with HZ=100. The time granularity may be so coarse ++ * that the approximation, in jiffies, of bfqd->bfq_slice_idle ++ * is rather lower than the exact value. ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. ++ * To address this issue, in the filtering in (a) we do not use as a ++ * reference time interval just bfqd->bfq_slice_idle, but ++ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the ++ * minimum number of jiffies for which the filter seems to be quite ++ * precise also in embedded systems and KVM/QEMU virtual machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, ++"service_blkg %lu soft_rate %u sects/sec interval %u", ++ bfqq->service_from_backlogged, ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate)); ++ ++ return max3(bfqq->soft_rt_next_start, ++ bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++} ++ ++static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) ++{ ++ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ blk_queue_nonrot(bfqq->bfqd->queue) && ++ bfqq->bfqd->hw_tag; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * If the process associated with bfqq does slow I/O (e.g., because it ++ * issues random requests), we charge bfqq with the time it has been ++ * in service instead of the service it has received (see ++ * bfq_bfqq_charge_time for details on how this goal is achieved). As ++ * a consequence, bfqq will typically get higher timestamps upon ++ * reactivation, and hence it will be rescheduled as if it had ++ * received more service than what it has actually received. In the ++ * end, bfqq receives less service in proportion to how slowly its ++ * associated process consumes its budgets (and hence how seriously it ++ * tends to lower the throughput). In addition, this time-charging ++ * strategy guarantees time fairness among slow processes. In ++ * contrast, if the process associated with bfqq is not slow, we ++ * charge bfqq exactly with the service it has received. ++ * ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ int ref; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Check whether the process is slow (see bfq_bfqq_is_slow). ++ */ ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ entity->service <= 2 * entity->budget / 10) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. And we do it, unless bfqq is in ++ * interactive weight raising. We do not do it in the ++ * latter subcase, for the following reason. bfqq may ++ * be conveying the I/O needed to load a soft ++ * real-time application. Such an application will ++ * actually exhibit a soft real-time I/O pattern after ++ * it finally starts doing its job. But, if ++ * soft_rt_next_start is computed here for an ++ * interactive bfqq, and bfqq had received a lot of ++ * service before remaining with no outstanding ++ * request (likely to happen on a fast device), then ++ * soft_rt_next_start would be assigned such a high ++ * value that, for a very long time, bfqq would be ++ * prevented from being possibly considered as soft ++ * real time. ++ * ++ * If, instead, the queue still has outstanding ++ * requests, then we have to wait for the completion ++ * of all the outstanding requests to discover whether ++ * the request pattern is actually isochronous. ++ */ ++ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); ++ if (bfqq->dispatched == 0 && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else if (bfqq->dispatched > 0) { ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", ++ reason_name[reason], slow, bfqq->dispatched, ++ bfq_bfqq_has_short_ttime(bfqq), entity->weight, ++ entity->service, entity->budget); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. ++ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ ref = bfqq->ref; ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ if (ref == 1) /* bfqq is gone, no more actions on it */ ++ return; ++ ++ BUG_ON(ref > 1 && ++ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ bfqq->injected_service = 0; ++ ++ /* mark bfqq as waiting a request only if a bic still points to it */ ++ if (!bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(bfqq->next_rq); ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); ++ /* ++ * Not setting service to 0, because, if the next rq ++ * arrives in time, the queue will go on receiving ++ * service with this same budget (as if it never expired) ++ */ ++ } else { ++ entity->service = 0; ++ bfq_log_bfqq(bfqd, bfqq, "resetting service"); ++ } ++ ++ /* ++ * Reset the received-service counter for every parent entity. ++ * Differently from what happens with bfqq->entity.service, ++ * the resetting of this counter never needs to be postponed ++ * for parent entities. In fact, in case bfqq may have a ++ * chance to go on being served using the last, partially ++ * consumed budget, bfqq->entity.service needs to be kept, ++ * because if bfqq then actually goes on being served using ++ * the same budget, the last value of bfqq->entity.service is ++ * needed to properly decrement bfqq->entity.budget by the ++ * portion already consumed. In contrast, it is not necessary ++ * to keep entity->service for parent entities too, because ++ * the bubble up of the new value of bfqq->entity.budget will ++ * make sure that the budgets of parent entities are correct, ++ * even in case bfqq and thus parent entities go on receiving ++ * service with the same budget. ++ */ ++ entity = entity->parent; ++ for_each_entity(entity) ++ entity->service = 0; ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); ++} ++ ++/* ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. ++ */ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool rot_without_queueing = ++ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, ++ bfqq_sequential_and_IO_bound, ++ idling_boosts_thr; ++ ++ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && ++ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. ++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable and rotational, or ++ * (b) regardless of the presence of NCQ, the device is rotational and ++ * the request pattern for bfqq is I/O-bound and sequential, or ++ * (c) regardless of whether it is rotational, the device is ++ * not NCQ-capable and the request pattern for bfqq is ++ * I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a), (b) or (c) is true, and, in ++ * particular, happens to be false if bfqd is an NCQ-capable ++ * flash-based device. ++ */ ++ idling_boosts_thr = rot_without_queueing || ++ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && ++ bfqq_sequential_and_IO_bound); ++ ++ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); ++ ++ /* ++ * The return value of this function is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the return value if ++ * there are weight-raised busy queues. In this case, and if ++ * bfqq is not weight-raised, this guarantees that the device ++ * is not idled for bfqq (if, instead, bfqq is weight-raised, ++ * then idling will be guaranteed by another variable, see ++ * below). Combined with the timestamping rules of BFQ (see ++ * [1] for details), this behavior causes bfqq, and hence any ++ * sync non-weight-raised queue, to get a lower number of ++ * requests served, and thus to ask for a lower number of ++ * requests from the request pool, before the busy ++ * weight-raised queues get served again. This often mitigates ++ * starvation problems in the presence of heavy write ++ * workloads and NCQ, thereby guaranteeing a higher ++ * application and system responsiveness in these hostile ++ * scenarios. ++ */ ++ return idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++} ++ ++/* ++ * There is a case where idling must be performed not for ++ * throughput concerns, but to preserve service guarantees. ++ * ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) the I/O of each process has the same properties, in ++ * terms of locality (sequential or random), direction ++ * (reads or writes), request sizes, greediness ++ * (from I/O-bound to sporadic), and so on. ++ * In fact, in such a scenario, the drive tends to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * The problem is that idling may significantly reduce ++ * throughput with certain combinations of types of I/O and ++ * devices. An important example is sync random I/O, on flash ++ * storage with command queueing. So, unless bfqq falls in the ++ * above cases where idling also boosts throughput, it would ++ * be important to check conditions (i) and (ii) accurately, ++ * so as to avoid idling when not strictly needed for service ++ * guarantees. ++ * ++ * Unfortunately, it is extremely difficult to thoroughly ++ * check condition (ii). And, in case there are active groups, ++ * it becomes very difficult to check condition (i) too. In ++ * fact, if there are active groups, then, for condition (i) ++ * to become false, it is enough that an active group contains ++ * more active processes or sub-groups than some other active ++ * group. More precisely, for condition (i) to hold because of ++ * such a group, it is not even necessary that the group is ++ * (still) active: it is sufficient that, even if the group ++ * has become inactive, some of its descendant processes still ++ * have some request already dispatched but still waiting for ++ * completion. In fact, requests have still to be guaranteed ++ * their share of the throughput even after being ++ * dispatched. In this respect, it is easy to show that, if a ++ * group frequently becomes inactive while still having ++ * in-flight requests, and if, when this happens, the group is ++ * not considered in the calculation of whether the scenario ++ * is asymmetric, then the group may fail to be guaranteed its ++ * fair share of the throughput (basically because idling may ++ * not be performed for the descendant processes of the group, ++ * but it had to be). We address this issue with the ++ * following bi-modal behavior, implemented in the function ++ * bfq_symmetric_scenario(). ++ * ++ * If there are groups with requests waiting for completion ++ * (as commented above, some of these groups may even be ++ * already inactive), then the scenario is tagged as ++ * asymmetric, conservatively, without checking any of the ++ * conditions (i) and (ii). So the device is idled for bfqq. ++ * This behavior matches also the fact that groups are created ++ * exactly if controlling I/O is a primary concern (to ++ * preserve bandwidth and latency guarantees). ++ * ++ * On the opposite end, if there are no groups with requests ++ * waiting for completion, then only condition (i) is actually ++ * controlled, i.e., provided that condition (i) holds, idling ++ * is not performed, regardless of whether condition (ii) ++ * holds. In other words, only if condition (i) does not hold, ++ * then idling is allowed, and the device tends to be ++ * prevented from queueing many requests, possibly of several ++ * processes. Since there are no groups with requests waiting ++ * for completion, then, to control condition (i) it is enough ++ * to check just whether all the queues with requests waiting ++ * for completion also have the same weight. ++ * ++ * Not checking condition (ii) evidently exposes bfqq to the ++ * risk of getting less throughput than its fair share. ++ * However, for queues with the same weight, a further ++ * mechanism, preemption, mitigates or even eliminates this ++ * problem. And it does so without consequences on overall ++ * throughput. This mechanism and its benefits are explained ++ * in the next three paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). If all queues and ++ * groups have the same weight, this form of preemption, ++ * combined with the hole-recovery heuristic described in the ++ * comments on function bfq_bfqq_update_budg_for_activation, ++ * are enough to preserve a correct bandwidth distribution in ++ * the mid term, even without idling. In fact, even if not ++ * idling allows the internal queues of the device to contain ++ * many requests, and thus to reorder requests, we can rather ++ * safely assume that the internal scheduler still preserves a ++ * minimum of mid-term fairness. ++ * ++ * More precisely, this preemption-based, idleless approach ++ * provides fairness in terms of IOPS, and not sectors per ++ * second. This can be seen with a simple example. Suppose ++ * that there are two queues with the same weight, but that ++ * the first queue receives requests of 8 sectors, while the ++ * second queue receives requests of 1024 sectors. In ++ * addition, suppose that each of the two queues contains at ++ * most one request at a time, which implies that each queue ++ * always remains idle after it is served. Finally, after ++ * remaining idle, each queue receives very quickly a new ++ * request. It follows that the two queues are served ++ * alternatively, preempting each other if needed. This ++ * implies that, although both queues have the same weight, ++ * the queue with large requests receives a service that is ++ * 1024/8 times as high as the service received by the other ++ * queue. ++ * ++ * The motivation for using preemption instead of idling (for ++ * queues with the same weight) is that, by not idling, ++ * service guarantees are preserved (completely or at least in ++ * part) without minimally sacrificing throughput. And, if ++ * there is no active group, then the primary expectation for ++ * this device is probably a high throughput. ++ * ++ * We are now left only with explaining the additional ++ * compound condition that is checked below for deciding ++ * whether the scenario is asymmetric. To explain this ++ * compound condition, we need to add that the function ++ * bfq_symmetric_scenario checks the weights of only ++ * non-weight-raised queues, for efficiency reasons (see ++ * comments on bfq_weights_tree_add()). Then the fact that ++ * bfqq is weight-raised is checked explicitly here. More ++ * precisely, the compound condition below takes into account ++ * also the fact that, even if bfqq is being weight-raised, ++ * the scenario is still symmetric if all queues with requests ++ * waiting for completion happen to be ++ * weight-raised. Actually, we should be even more precise ++ * here, and differentiate between interactive weight raising ++ * and soft real-time weight raising. ++ * ++ * As a side note, it is worth considering that the above ++ * device-idling countermeasures may however fail in the ++ * following unlucky scenario: if idling is (correctly) ++ * disabled in a time period during which all symmetry ++ * sub-conditions hold, and hence the device is allowed to ++ * enqueue many requests, but at some later point in time some ++ * sub-condition stops to hold, then it may become impossible ++ * to let requests be served in the desired order until all ++ * the requests already queued in the device have been served. ++ */ ++static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && ++ bfqd->wr_busy_queues < ++ bfq_tot_busy_queues(bfqd)) || ++ !bfq_symmetric_scenario(bfqd); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_coeff %d wr_busy %d busy %d asymmetric %d", ++ bfqq->wr_coeff, ++ bfqd->wr_busy_queues, ++ bfq_tot_busy_queues(bfqd), ++ asymmetric_scenario); ++ ++ return asymmetric_scenario; ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. ++ * ++ * In a nutshell, this function returns true only if idling is ++ * beneficial for throughput or, even if detrimental for throughput, ++ * idling is however necessary to preserve service guarantees (low ++ * latency, desired throughput distribution, ...). In particular, on ++ * NCQ-capable devices, this function tries to return false, so as to ++ * help keep the drives' internal queues full, whenever this helps the ++ * device boost the throughput without causing any service-guarantee ++ * issue. ++ * ++ * Most of the issues taken into account to get the return value of ++ * this function are not trivial. We discuss these issues in the two ++ * functions providing the main pieces of information needed by this ++ * function. ++ */ ++static bool bfq_better_to_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; ++ ++ if (unlikely(bfqd->strict_guarantees)) ++ return true; ++ ++ /* ++ * Idling is performed only if slice_idle > 0. In addition, we ++ * do not idle if ++ * (a) bfqq is async ++ * (b) bfqq is in the idle io prio class: in this case we do ++ * not idle because we want to minimize the bandwidth that ++ * queues in this class can steal to higher-priority queues ++ */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || ++ bfq_class_idle(bfqq)) ++ return false; ++ ++ idling_boosts_thr_with_no_issue = ++ idling_boosts_thr_without_issues(bfqd, bfqq); ++ ++ idling_needed_for_service_guar = ++ idling_needed_for_service_guarantees(bfqd, bfqq); ++ ++ /* ++ * We have now the two components we need to compute the ++ * return value of the function, which is true only if idling ++ * either boosts the throughput (without issues), or is ++ * necessary to preserve service guarantees. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_busy %d boosts %d IO-bound %d guar %d", ++ bfqd->wr_busy_queues, ++ idling_boosts_thr_with_no_issue, ++ bfq_bfqq_IO_bound(bfqq), ++ idling_needed_for_service_guar); ++ ++ return idling_boosts_thr_with_no_issue || ++ idling_needed_for_service_guar; ++} ++ ++/* ++ * If the in-service queue is empty but the function bfq_better_to_idle ++ * returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the device must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments on the function bfq_better_to_idle for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_better_to_idle itself ++ * returns true. ++ */ ++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) ++{ ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); ++} ++ ++static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * A linear search; but, with a high probability, very few ++ * steps are needed to find a candidate queue, i.e., a queue ++ * with enough budget left for its next request. In fact: ++ * - BFQ dynamically updates the budget of every queue so as ++ * to accomodate the expected backlog of the queue; ++ * - if a queue gets all its requests dispatched as injected ++ * service, then the queue is removed from the active list ++ * (and re-added only if it gets new requests, but with ++ * enough budget for its new backlog). ++ */ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); ++ return bfqq; ++ } ++ ++ bfq_log(bfqd, "no queue found"); ++ return NULL; ++} ++ ++/* ++ * Select a queue for service. If we have a current queue in service, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->in_service_queue; ++ if (!bfqq) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); ++ ++ /* ++ * Do not expire bfqq for budget timeout if bfqq may be about ++ * to enjoy device idling. The reason why, in this case, we ++ * prevent bfqq from expiring is the same as in the comments ++ * on the case where bfq_bfqq_must_idle() returns true, in ++ * bfq_completed_request(). ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !bfq_bfqq_must_idle(bfqq)) ++ goto expire; ++ ++check_queue: ++ /* ++ * This loop is rarely executed more than once. Even when it ++ * happens, it is much more convenient to re-execute this loop ++ * than to return NULL and trigger a new dispatch to get a ++ * request served. ++ */ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq) { ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * Expire the queue for budget exhaustion, ++ * which makes sure that the next budget is ++ * enough to serve the next request, even if ++ * it comes from the fifo expired path. ++ */ ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. ++ */ ++ if (bfq_bfqq_wait_request(bfqq)) { ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. However, if the in-service queue is idling ++ * for a new request, or has requests waiting for a completion and ++ * may idle after their completion, then keep it anyway. ++ * ++ * Yet, to boost throughput, inject service from other queues if ++ * possible. ++ */ ++ if (bfq_bfqq_wait_request(bfqq) || ++ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { ++ if (bfq_bfqq_injectable(bfqq) && ++ bfqq->injected_service * bfqq->inject_coeff < ++ bfqq->entity.service * 10) { ++ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); ++ bfqq = bfq_choose_bfqq_for_injection(bfqd); ++ } else { ++ if (BFQQ_SEEKY(bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "injection saturated %d * %d >= %d * 10", ++ bfqq->injected_service, bfqq->inject_coeff, ++ bfqq->entity.service); ++ bfqq = NULL; ++ } ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, reason); ++new_queue: ++ bfqq = bfq_set_in_service_queue(bfqd); ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); ++ goto check_queue; ++ } ++keep_queue: ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); ++ else ++ bfq_log(bfqd, "no queue returned"); ++ ++ return bfqq; ++} ++ ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ ++ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->prio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ ++ /* ++ * If the queue was activated in a burst, or too much ++ * time has elapsed from the beginning of this ++ * weight-raising period, then end weight raising. ++ */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bfq_bfqq_end_wr(bfqq); ++ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || ++ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) ++ bfq_bfqq_end_wr(bfqq); ++ else { ++ switch_back_to_interactive_wr(bfqq, bfqd); ++ BUG_ON(time_is_after_jiffies( ++ bfqq->last_wr_start_finish)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "back to interactive wr"); ++ } ++ } ++ if (bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && ++ bfqq->service_from_wr > max_service_from_wr) { ++ /* see comments on max_service_from_wr */ ++ bfq_bfqq_end_wr(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "too much service"); ++ } ++ } ++ /* ++ * To improve latency (for this or other queues), immediately ++ * update weight both if it must be raised and if it must be ++ * lowered. Since, entity may be on some active tree here, and ++ * might have a pending change of its ioprio class, invoke ++ * next function with the last parameter unset (see the ++ * comments on the function). ++ */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) ++ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), ++ entity, false); ++} ++ ++/* ++ * Dispatch next request from bfqq. ++ */ ++static struct request *bfq_dispatch_rq_from_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct request *rq = bfqq->next_rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!rq); ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ bfq_dispatch_remove(bfqd->queue, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", ++ blk_rq_sectors(rq), ++ (unsigned long long) blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->dispatched); ++ ++ if (bfqq != bfqd->in_service_queue) { ++ if (likely(bfqd->in_service_queue)) { ++ bfqd->in_service_queue->injected_service += ++ bfq_serv_to_charge(rq, bfqq); ++ bfq_log_bfqq(bfqd, bfqd->in_service_queue, ++ "injected_service increased to %d", ++ bfqd->in_service_queue->injected_service); ++ } ++ goto return_rq; ++ } ++ ++ /* ++ * If weight raising has to terminate for bfqq, then next ++ * function causes an immediate update of bfqq's weight, ++ * without waiting for next activation. As a consequence, on ++ * expiration, bfqq will be timestamped as if has never been ++ * weight-raised during this service slot, even if it has ++ * received part or even most of the service as a ++ * weight-raised queue. This inflates bfqq's timestamps, which ++ * is beneficial, as bfqq is then more willing to leave the ++ * device immediately to possible other weight-raised queues. ++ */ ++ bfq_update_wr_data(bfqd, bfqq); ++ ++ /* ++ * Expire bfqq, pretending that its budget expired, if bfqq ++ * belongs to CLASS_IDLE and other queues are waiting for ++ * service. ++ */ ++ if (!(bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq))) ++ goto return_rq; ++ ++ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ ++return_rq: ++ return rq; ++} ++ ++static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ ++ bfq_log(bfqd, "dispatch_non_empty %d busy_queues %d", ++ !list_empty_careful(&bfqd->dispatch), bfq_tot_busy_queues(bfqd) > 0); ++ ++ /* ++ * Avoiding lock: a race on bfqd->busy_queues should cause at ++ * most a call to dispatch for nothing ++ */ ++ return !list_empty_careful(&bfqd->dispatch) || ++ bfq_tot_busy_queues(bfqd) > 0; ++} ++ ++static struct request *__bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct request *rq = NULL; ++ struct bfq_queue *bfqq = NULL; ++ ++ if (!list_empty(&bfqd->dispatch)) { ++ rq = list_first_entry(&bfqd->dispatch, struct request, ++ queuelist); ++ list_del_init(&rq->queuelist); ++ rq->rq_flags &= ~RQF_DISP_LIST; ++ ++ bfq_log(bfqd, ++ "picked %p from dispatch list", rq); ++ bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ /* ++ * Increment counters here, because this ++ * dispatch does not follow the standard ++ * dispatch flow (where counters are ++ * incremented) ++ */ ++ bfqq->dispatched++; ++ ++ /* ++ * TESTING: reset DISP_LIST flag, because: 1) ++ * this rq this request has passed through ++ * bfq_prepare_request, 2) then it will have ++ * bfq_finish_requeue_request invoked on it, and 3) in ++ * bfq_finish_requeue_request we use this flag to check ++ * that bfq_finish_requeue_request is not invoked on ++ * requests for which bfq_prepare_request has ++ * been invoked. ++ */ ++ rq->rq_flags &= ~RQF_DISP_LIST; ++ goto inc_in_driver_start_rq; ++ } ++ ++ /* ++ * We exploit the bfq_finish_requeue_request hook to decrement ++ * rq_in_driver, but bfq_finish_requeue_request will not be ++ * invoked on this request. So, to avoid unbalance, ++ * just start this request, without incrementing ++ * rq_in_driver. As a negative consequence, ++ * rq_in_driver is deceptively lower than it should be ++ * while this request is in service. This may cause ++ * bfq_schedule_dispatch to be invoked uselessly. ++ * ++ * As for implementing an exact solution, the ++ * bfq_finish_requeue_request hook, if defined, is probably ++ * invoked also on this request. So, by exploiting ++ * this hook, we could 1) increment rq_in_driver here, ++ * and 2) decrement it in bfq_finish_requeue_request. Such a ++ * solution would let the value of the counter be ++ * always accurate, but it would entail using an extra ++ * interface function. This cost seems higher than the ++ * benefit, being the frequency of non-elevator-private ++ * requests very low. ++ */ ++ goto start_rq; ++ } ++ ++ bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd)); ++ ++ if (bfq_tot_busy_queues(bfqd) == 0) ++ goto exit; ++ ++ /* ++ * Force device to serve one request at a time if ++ * strict_guarantees is true. Forcing this service scheme is ++ * currently the ONLY way to guarantee that the request ++ * service order enforced by the scheduler is respected by a ++ * queueing device. Otherwise the device is free even to make ++ * some unlucky request wait for as long as the device ++ * wishes. ++ * ++ * Of course, serving one request at at time may cause loss of ++ * throughput. ++ */ ++ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) ++ goto exit; ++ ++ bfqq = bfq_select_queue(bfqd); ++ if (!bfqq) ++ goto exit; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue && ++ bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue && ++ bfq_bfqq_wait_request(bfqq)); ++ ++ rq = bfq_dispatch_rq_from_bfqq(bfqd, bfqq); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (rq) { ++ inc_in_driver_start_rq: ++ bfqd->rq_in_driver++; ++ start_rq: ++ rq->rq_flags |= RQF_STARTED; ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "%s request %p, rq_in_driver %d", ++ bfq_bfqq_sync(bfqq) ? "sync" : "async", ++ rq, ++ bfqd->rq_in_driver); ++ else ++ bfq_log(bfqd, ++ "request %p from dispatch list, rq_in_driver %d", ++ rq, bfqd->rq_in_driver); ++ } else ++ bfq_log(bfqd, ++ "returned NULL request, rq_in_driver %d", ++ bfqd->rq_in_driver); ++ ++exit: ++ return rq; ++} ++ ++ ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++static void bfq_update_dispatch_stats(struct request_queue *q, ++ struct request *rq, ++ struct bfq_queue *in_serv_queue, ++ bool idle_timer_disabled) ++{ ++ struct bfq_queue *bfqq = rq ? RQ_BFQQ(rq) : NULL; ++ ++ if (!idle_timer_disabled && !bfqq) ++ return; ++ ++ /* ++ * rq and bfqq are guaranteed to exist until this function ++ * ends, for the following reasons. First, rq can be ++ * dispatched to the device, and then can be completed and ++ * freed, only after this function ends. Second, rq cannot be ++ * merged (and thus freed because of a merge) any longer, ++ * because it has already started. Thus rq cannot be freed ++ * before this function ends, and, since rq has a reference to ++ * bfqq, the same guarantee holds for bfqq too. ++ * ++ * In addition, the following queue lock guarantees that ++ * bfqq_group(bfqq) exists as well. ++ */ ++ spin_lock_irq(q->queue_lock); ++ if (idle_timer_disabled) ++ /* ++ * Since the idle timer has been disabled, ++ * in_serv_queue contained some request when ++ * __bfq_dispatch_request was invoked above, which ++ * implies that rq was picked exactly from ++ * in_serv_queue. Thus in_serv_queue == bfqq, and is ++ * therefore guaranteed to exist because of the above ++ * arguments. ++ */ ++ bfqg_stats_update_idle_time(bfqq_group(in_serv_queue)); ++ if (bfqq) { ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++ ++ bfqg_stats_update_avg_queue_size(bfqg); ++ bfqg_stats_set_start_empty_time(bfqg); ++ bfqg_stats_update_io_remove(bfqg, rq->cmd_flags); ++ } ++ spin_unlock_irq(q->queue_lock); ++} ++#else ++static inline void bfq_update_dispatch_stats(struct request_queue *q, ++ struct request *rq, ++ struct bfq_queue *in_serv_queue, ++ bool idle_timer_disabled) {} ++#endif ++static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct request *rq; ++ struct bfq_queue *in_serv_queue; ++ bool waiting_rq, idle_timer_disabled; ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ in_serv_queue = bfqd->in_service_queue; ++ waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); ++ ++ rq = __bfq_dispatch_request(hctx); ++ ++ idle_timer_disabled = ++ waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); ++ ++ spin_unlock_irq(&bfqd->lock); ++ ++ bfq_update_dispatch_stats(hctx->queue, rq, in_serv_queue, ++ idle_timer_disabled); ++ ++ return rq; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Scheduler lock must be held here. Recall not to use bfqq after calling ++ * this function on it. ++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++#endif ++ ++ assert_spin_locked(&bfqq->bfqd->lock); ++ ++ BUG_ON(bfqq->ref <= 0); ++ ++ if (bfqq->bfqd) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); ++ ++ bfqq->ref--; ++ if (bfqq->ref) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list)); ++ BUG_ON(bfqq->allocated != 0); ++ BUG_ON(bfqq->entity.tree); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ ++ if (!hlist_unhashed(&bfqq->burst_list_node)) { ++ hlist_del_init(&bfqq->burst_list_node); ++ /* ++ * Decrement also burst size after the removal, if the ++ * process associated with bfqq is exiting, and thus ++ * does not contribute to the burst any longer. This ++ * decrement helps filter out false positives of large ++ * bursts, when some short-lived process (often due to ++ * the execution of commands by some service) happens ++ * to start and exit while a complex application is ++ * starting, and thus spawning several processes that ++ * do I/O (and that *must not* be treated as a large ++ * burst, see comments on bfq_handle_burst). ++ * ++ * In particular, the decrement is performed only if: ++ * 1) bfqq is not a merged queue, because, if it is, ++ * then this free of bfqq is not triggered by the exit ++ * of the process bfqq is associated with, but exactly ++ * by the fact that bfqq has just been merged. ++ * 2) burst_size is greater than 0, to handle ++ * unbalanced decrements. Unbalanced decrements may ++ * happen in te following case: bfqq is inserted into ++ * the current burst list--without incrementing ++ * bust_size--because of a split, but the current ++ * burst list is not the burst list bfqq belonged to ++ * (see comments on the case of a split in ++ * bfq_set_request). ++ */ ++ if (bfqq->bic && bfqq->bfqd->burst_size > 0) ++ bfqq->bfqd->burst_size--; ++ } ++ ++ if (bfqq->bfqd) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "putting blkg and bfqg %p\n", bfqg); ++ bfqg_and_blkg_put(bfqg); ++#endif ++ kmem_cache_free(bfq_pool, bfqq); ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) ++ break; ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->in_service_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); /* release process reference */ ++} ++ ++static void bfq_exit_icq_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ struct bfq_data *bfqd; ++ ++ if (bfqq) ++ bfqd = bfqq->bfqd; /* NULL if scheduler already exited */ ++ ++ if (bfqq && bfqd) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bfqd->lock, flags); ++ bfq_exit_bfqq(bfqd, bfqq); ++ bic_set_bfqq(bic, NULL, is_sync); ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ } ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ ++ BUG_ON(!bic); ++ bfq_exit_icq_bfqq(bic, true); ++ bfq_exit_icq_bfqq(bic, false); ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ struct bfq_data *bfqd = bfqq->bfqd; ++ ++ WARN_ON(!bfqd); ++ if (!bfqd) ++ return; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, ++ "bfq: bad prio class %d\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ break; ++ } ++ ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. ++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ return; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic_to_bfqq(bic, false); ++ if (bfqq) { ++ /* release process reference on this queue */ ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfqq %p %d", ++ bfqq, bfqq->ref); ++ } ++ ++ bfqq = bic_to_bfqq(bic, true); ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bfqq->ref = 0; ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ /* ++ * No need to mark as has_short_ttime if in ++ * idle_class, because no device idling is performed ++ * for queues in idle class ++ */ ++ if (!bfq_class_idle(bfqq)) ++ /* tentatively mark as has_short_ttime */ ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); ++ /* ++ * Aggressively inject a lot of service: up to 90%. ++ * This coefficient remains constant during bfqq life, ++ * but this behavior might be changed, after enough ++ * testing and tuning. ++ */ ++ bfqq->inject_coeff = 1; ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ ++ bfqq->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); ++ ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); ++ ++ /* ++ * To not forget the possibly high bandwidth consumed by a ++ * process/queue in the recent past, ++ * bfq_bfqq_softrt_next_start() returns a value at least equal ++ * to the current value of bfqq->soft_rt_next_start (see ++ * comments on bfq_bfqq_softrt_next_start). Set ++ * soft_rt_next_start to now, to mean that bfqq has consumed ++ * no bandwidth so far. ++ */ ++ bfqq->soft_rt_next_start = jiffies; ++ ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; ++ } ++ ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. ++ */ ++ if (async_bfqq) { ++ bfqq->ref++; /* ++ * Extra group reference, w.r.t. sync ++ * queue. This extra reference is removed ++ * only if bfqq->bfqg disappears, to ++ * guarantee that this queue is not freed ++ * until its group goes away. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", ++ bfqq, bfqq->ref); ++ *async_bfqq = bfqq; ++ } ++ ++out: ++ bfqq->ref++; /* get a process reference to this queue */ ++ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_ttime *ttime = &bfqq->ttime; ++ u64 elapsed = ktime_get_ns() - bfqq->ttime.last_end_request; ++ ++ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ++ ++ ttime->ttime_samples = (7*bfqq->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ++ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ++ ttime->ttime_samples); ++} ++ ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); ++} ++ ++static void bfq_update_has_short_ttime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ bool has_short_ttime = true; ++ ++ /* ++ * No need to update has_short_ttime if bfqq is async or in ++ * idle io prio class, or if bfq_slice_idle is zero, because ++ * no device idling is performed for bfqq in this case. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || ++ bfqd->bfq_slice_idle == 0) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. */ ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) ++ return; ++ ++ /* Think time is infinite if no process is linked to ++ * bfqq. Otherwise check average think time to ++ * decide whether to mark as has_short_ttime ++ */ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ (bfq_sample_valid(bfqq->ttime.ttime_samples) && ++ bfqq->ttime.ttime_mean > bfqd->bfq_slice_idle)) ++ has_short_ttime = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", ++ has_short_ttime); ++ ++ if (has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bfqq); ++ bfq_update_has_short_ttime(bfqd, bfqq, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "has_short_ttime=%d (seeky %d)", ++ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if ++ * - the request is small, and ++ * - we are idling to boost throughput, and ++ * - the queue is not to be expired, ++ * then just exit. ++ * ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. In contrast ++ * we wait for the block layer to decide when to ++ * unplug the device: hopefully, new requests will be ++ * merged to this one quickly, then the device will be ++ * unplugged and larger requests will be dispatched. ++ */ ++ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && ++ !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or idling is being ++ * performed to preserve service guarantees, or ++ * finally the queue is to be expired: in all these ++ * cases disk idling is to be stopped, so clear ++ * wait_request flag and reset timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ } ++} ++ ++/* returns true if it causes the idle timer to be disabled */ ++static bool __bfq_insert_request(struct bfq_data *bfqd, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ bool waiting, idle_timer_disabled = false; ++ BUG_ON(!bfqq); ++ ++ assert_spin_locked(&bfqd->lock); ++ ++ bfq_log_bfqq(bfqd, bfqq, "rq %p bfqq %p", rq, bfqq); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq) { ++ BUG_ON(bic_to_bfqq(RQ_BIC(rq), 1) != bfqq); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated++; ++ bfqq->allocated--; ++ bfq_log_bfqq(bfqd, bfqq, ++ "new allocated %d", bfqq->allocated); ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "new_bfqq new allocated %d", ++ bfqq->allocated); ++ ++ new_bfqq->ref++; ++ /* ++ * If the bic associated with the process ++ * issuing this request still points to bfqq ++ * (and thus has not been already redirected ++ * to new_bfqq or even some other bfq_queue), ++ * then complete the merge and redirect it to ++ * new_bfqq. ++ */ ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ /* ++ * rq is about to be enqueued into new_bfqq, ++ * release rq reference on bfqq ++ */ ++ bfq_put_queue(bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ } ++ ++ waiting = bfqq && bfq_bfqq_wait_request(bfqq); ++ bfq_add_request(rq); ++ idle_timer_disabled = waiting && !bfq_bfqq_wait_request(bfqq); ++ ++ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++ ++ return idle_timer_disabled; ++} ++ ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++static void bfq_update_insert_stats(struct request_queue *q, ++ struct bfq_queue *bfqq, ++ bool idle_timer_disabled, ++ unsigned int cmd_flags) ++{ ++ if (!bfqq) ++ return; ++ ++ /* ++ * bfqq still exists, because it can disappear only after ++ * either it is merged with another queue, or the process it ++ * is associated with exits. But both actions must be taken by ++ * the same process currently executing this flow of ++ * instructions. ++ * ++ * In addition, the following queue lock guarantees that ++ * bfqq_group(bfqq) exists as well. ++ */ ++ spin_lock_irq(q->queue_lock); ++ bfqg_stats_update_io_add(bfqq_group(bfqq), bfqq, cmd_flags); ++ if (idle_timer_disabled) ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ spin_unlock_irq(q->queue_lock); ++} ++#else ++static inline void bfq_update_insert_stats(struct request_queue *q, ++ struct bfq_queue *bfqq, ++ bool idle_timer_disabled, ++ unsigned int cmd_flags) {} ++#endif ++ ++static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++ bool at_head) ++{ ++ struct request_queue *q = hctx->queue; ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ bool idle_timer_disabled = false; ++ unsigned int cmd_flags; ++ ++ spin_lock_irq(&bfqd->lock); ++ if (blk_mq_sched_try_insert_merge(q, rq)) { ++ spin_unlock_irq(&bfqd->lock); ++ return; ++ } ++ ++ spin_unlock_irq(&bfqd->lock); ++ ++ blk_mq_sched_request_inserted(rq); ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ bfqq = bfq_init_rq(rq); ++ BUG_ON(!bfqq && !(at_head || blk_rq_is_passthrough(rq))); ++ BUG_ON(bfqq && bic_to_bfqq(RQ_BIC(rq), rq_is_sync(rq)) != bfqq); ++ ++ if (at_head || blk_rq_is_passthrough(rq)) { ++ if (at_head) ++ list_add(&rq->queuelist, &bfqd->dispatch); ++ else ++ list_add_tail(&rq->queuelist, &bfqd->dispatch); ++ ++ rq->rq_flags |= RQF_DISP_LIST; ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "%p in disp: at_head %d", ++ rq, at_head); ++ else ++ bfq_log(bfqd, ++ "%p in disp: at_head %d", ++ rq, at_head); ++ } else { /* bfqq is assumed to be non null here */ ++ BUG_ON(!bfqq); ++ BUG_ON(!(rq->rq_flags & RQF_GOT)); ++ rq->rq_flags &= ~RQF_GOT; ++ ++ idle_timer_disabled = __bfq_insert_request(bfqd, rq); ++ /* ++ * Update bfqq, because, if a queue merge has occurred ++ * in __bfq_insert_request, then rq has been ++ * redirected into a new queue. ++ */ ++ bfqq = RQ_BFQQ(rq); ++ ++ if (rq_mergeable(rq)) { ++ elv_rqhash_add(q, rq); ++ if (!q->last_merge) ++ q->last_merge = rq; ++ } ++ } ++ ++ /* ++ * Cache cmd_flags before releasing scheduler lock, because rq ++ * may disappear afterwards (for example, because of a request ++ * merge). ++ */ ++ cmd_flags = rq->cmd_flags; ++ ++ spin_unlock_irq(&bfqd->lock); ++ bfq_update_insert_stats(q, bfqq, idle_timer_disabled, ++ cmd_flags); ++} ++ ++static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, ++ struct list_head *list, bool at_head) ++{ ++ while (!list_empty(list)) { ++ struct request *rq; ++ ++ rq = list_first_entry(list, struct request, queuelist); ++ list_del_init(&rq->queuelist); ++ bfq_insert_request(hctx, rq, at_head); ++ } ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ ++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ /* ++ * If active queue hasn't enough requests and can idle, bfq might not ++ * dispatch sufficient requests to hardware. Don't zero hw_tag in this ++ * case ++ */ ++ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && ++ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < ++ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct bfq_queue *bfqq, struct bfq_data *bfqd) ++{ ++ u64 now_ns; ++ u32 delta_us; ++ ++ bfq_update_hw_tag(bfqd); ++ ++ BUG_ON(!bfqd->rq_in_driver); ++ BUG_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "new disp %d, new rq_in_driver %d", ++ bfqq->dispatched, bfqd->rq_in_driver); ++ ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ /* ++ * Set budget_timeout (which we overload to store the ++ * time at which the queue remains with no backlog and ++ * no outstanding request; used by the weight-raising ++ * mechanism). ++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_weights_tree_remove(bfqd, bfqq); ++ } ++ ++ now_ns = ktime_get_ns(); ++ ++ bfqq->ttime.last_end_request = now_ns; ++ ++ /* ++ * Using us instead of ns, to get a reasonable precision in ++ * computing rate in next check. ++ */ ++ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "delta %uus/%luus max_size %u rate %llu/%llu", ++ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, ++ delta_us > 0 ? ++ (USEC_PER_SEC* ++ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) ++ >>BFQ_RATE_SHIFT : ++ (USEC_PER_SEC* ++ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, ++ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); ++ ++ /* ++ * If the request took rather long to complete, and, according ++ * to the maximum request size recorded, this completion latency ++ * implies that the request was certainly served at a very low ++ * rate (less than 1M sectors/sec), then the whole observation ++ * interval that lasts up to this time instant cannot be a ++ * valid time interval for computing a new peak rate. Invoke ++ * bfq_update_rate_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - reset to zero samples, which will trigger a proper ++ * re-initialization of the observation interval on next ++ * dispatch ++ */ ++ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && ++ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < ++ 1UL<<(BFQ_RATE_SHIFT - 10)) ++ bfq_update_rate_reset(bfqd, NULL); ++ bfqd->last_completion = now_ns; ++ ++ /* ++ * If we are waiting to discover whether the request pattern ++ * of the task associated with the queue is actually ++ * isochronous, and both requisites for this condition to hold ++ * are now satisfied, then compute soft_rt_next_start (see the ++ * comments on the function bfq_bfqq_softrt_next_start()). We ++ * do not compute soft_rt_next_start if bfqq is in interactive ++ * weight raising (see the comments in bfq_bfqq_expire() for ++ * an explanation). We schedule this delayed update when bfqq ++ * expires, if it still has in-flight requests. ++ */ ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ ++ /* ++ * If this is the in-service queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. ++ */ ++ if (bfqd->in_service_queue == bfqq) { ++ if (bfq_bfqq_must_idle(bfqq)) { ++ if (bfqq->dispatched == 0) ++ bfq_arm_slice_timer(bfqd); ++ /* ++ * If we get here, we do not expire bfqq, even ++ * if bfqq was in budget timeout or had no ++ * more requests (as controlled in the next ++ * conditional instructions). The reason for ++ * not expiring bfqq is as follows. ++ * ++ * Here bfqq->dispatched > 0 holds, but ++ * bfq_bfqq_must_idle() returned true. This ++ * implies that, even if no request arrives ++ * for bfqq before bfqq->dispatched reaches 0, ++ * bfqq will, however, not be expired on the ++ * completion event that causes bfqq->dispatch ++ * to reach zero. In contrast, on this event, ++ * bfqq will start enjoying device idling ++ * (I/O-dispatch plugging). ++ * ++ * But, if we expired bfqq here, bfqq would ++ * not have the chance to enjoy device idling ++ * when bfqq->dispatched finally reaches ++ * zero. This would expose bfqq to violation ++ * of its reserved service guarantees. ++ */ ++ return; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_better_to_idle(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_NO_MORE_REQUESTS); ++ } ++} ++ ++static void bfq_finish_requeue_request_body(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "allocated %d", bfqq->allocated); ++ BUG_ON(!bfqq->allocated); ++ bfqq->allocated--; ++ ++ bfq_put_queue(bfqq); ++} ++ ++/* ++ * Handle either a requeue or a finish for rq. The things to do are ++ * the same in both cases: all references to rq are to be dropped. In ++ * particular, rq is considered completed from the point of view of ++ * the scheduler. ++ */ ++static void bfq_finish_requeue_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd; ++ struct bfq_io_cq *bic; ++ ++ BUG_ON(!rq); ++ ++ bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * Requeue and finish hooks are invoked in blk-mq without ++ * checking whether the involved request is actually still ++ * referenced in the scheduler. To handle this fact, the ++ * following two checks make this function exit in case of ++ * spurious invocations, for which there is nothing to do. ++ * ++ * First, check whether rq has nothing to do with an elevator. ++ */ ++ if (unlikely(!(rq->rq_flags & RQF_ELVPRIV))) ++ return; ++ ++ /* ++ * rq either is not associated with any icq, or is an already ++ * requeued request that has not (yet) been re-inserted into ++ * a bfq_queue. ++ */ ++ if (!rq->elv.icq || !bfqq) ++ return; ++ ++ bic = RQ_BIC(rq); ++ BUG_ON(!bic); ++ ++ bfqd = bfqq->bfqd; ++ BUG_ON(!bfqd); ++ ++ if (rq->rq_flags & RQF_DISP_LIST) { ++ pr_crit("putting disp rq %p for %d", rq, bfqq->pid); ++ BUG(); ++ } ++ BUG_ON(rq->rq_flags & RQF_QUEUED); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "putting rq %p with %u sects left, STARTED %d", ++ rq, blk_rq_sectors(rq), ++ rq->rq_flags & RQF_STARTED); ++ ++ if (rq->rq_flags & RQF_STARTED) ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq->start_time_ns, ++ rq->io_start_time_ns, ++ rq->cmd_flags); ++ ++ WARN_ON(blk_rq_sectors(rq) == 0 && !(rq->rq_flags & RQF_STARTED)); ++ ++ if (likely(rq->rq_flags & RQF_STARTED)) { ++ unsigned long flags; ++ ++ spin_lock_irqsave(&bfqd->lock, flags); ++ ++ bfq_completed_request(bfqq, bfqd); ++ bfq_finish_requeue_request_body(bfqq); ++ ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ } else { ++ /* ++ * Request rq may be still/already in the scheduler, ++ * in which case we need to remove it (this should ++ * never happen in case of requeue). And we cannot ++ * defer such a check and removal, to avoid ++ * inconsistencies in the time interval from the end ++ * of this function to the start of the deferred work. ++ * This situation seems to occur only in process ++ * context, as a consequence of a merge. In the ++ * current version of the code, this implies that the ++ * lock is held. ++ */ ++ BUG_ON(in_interrupt()); ++ ++ assert_spin_locked(&bfqd->lock); ++ if (!RB_EMPTY_NODE(&rq->rb_node)) { ++ bfq_remove_request(rq->q, rq); ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), ++ rq->cmd_flags); ++ } ++ bfq_finish_requeue_request_body(bfqq); ++ } ++ ++ /* ++ * Reset private fields. In case of a requeue, this allows ++ * this function to correctly do nothing if it is spuriously ++ * invoked again on this same request (see the check at the ++ * beginning of the function). Probably, a better general ++ * design would be to prevent blk-mq from invoking the requeue ++ * or finish hooks of an elevator, for a request that is not ++ * referred by that elevator. ++ * ++ * Resetting the following fields would break the ++ * request-insertion logic if rq is re-inserted into a bfq ++ * internal queue, without a re-preparation. Here we assume ++ * that re-insertions of requeued requests, without ++ * re-preparation, can happen only for pass_through or at_head ++ * requests (which are not re-inserted into bfq internal ++ * queues). ++ */ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to that bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_get_bfqq_handle_split(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, ++ struct bio *bio, ++ bool split, bool is_sync, ++ bool *new_queue) ++{ ++ struct bfq_queue *bfqq = bic_to_bfqq(bic, is_sync); ++ ++ if (likely(bfqq && bfqq != &bfqd->oom_bfqq)) ++ return bfqq; ++ ++ if (new_queue) ++ *new_queue = true; ++ ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: marking in " ++ "large burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "get_request: clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ /* ++ * If bfqq was in the current ++ * burst list before being ++ * merged, then we have to add ++ * it back. And we do not need ++ * to increase burst_size, as ++ * we did not decrement ++ * burst_size when we removed ++ * bfqq from the burst list as ++ * a consequence of a merge ++ * (see comments in ++ * bfq_put_queue). In this ++ * respect, it would be rather ++ * costly to know whether the ++ * current burst list is still ++ * the same burst list from ++ * which bfqq was removed on ++ * the merge. To avoid this ++ * cost, if bfqq was in a ++ * burst list, then we add ++ * bfqq to the current burst ++ * list without any further ++ * check. This can cause ++ * inappropriate insertions, ++ * but rarely enough to not ++ * harm the detection of large ++ * bursts significantly. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ ++ return bfqq; ++} ++ ++/* ++ * Only reset private fields. The actual request preparation will be ++ * performed by bfq_init_rq, when rq is either inserted or merged. See ++ * comments on bfq_init_rq for the reason behind this delayed ++ * preparation. ++*/ ++static void bfq_prepare_request(struct request *rq, struct bio *bio) ++{ ++ /* ++ * Regardless of whether we have an icq attached, we have to ++ * clear the scheduler pointers, as they might point to ++ * previously allocated bic/bfqq structs. ++ */ ++ rq->elv.priv[0] = rq->elv.priv[1] = NULL; ++} ++ ++/* ++ * If needed, init rq, allocate bfq data structures associated with ++ * rq, and increment reference counters in the destination bfq_queue ++ * for rq. Return the destination bfq_queue for rq, or NULL is rq is ++ * not associated with any bfq_queue. ++ * ++ * This function is invoked by the functions that perform rq insertion ++ * or merging. One may have expected the above preparation operations ++ * to be performed in bfq_prepare_request, and not delayed to when rq ++ * is inserted or merged. The rationale behind this delayed ++ * preparation is that, after the prepare_request hook is invoked for ++ * rq, rq may still be transformed into a request with no icq, i.e., a ++ * request not associated with any queue. No bfq hook is invoked to ++ * signal this tranformation. As a consequence, should these ++ * preparation operations be performed when the prepare_request hook ++ * is invoked, and should rq be transformed one moment later, bfq ++ * would end up in an inconsistent state, because it would have ++ * incremented some queue counters for an rq destined to ++ * transformation, without any chance to correctly lower these ++ * counters back. In contrast, no transformation can still happen for ++ * rq after rq has been inserted or merged. So, it is safe to execute ++ * these preparation operations when rq is finally inserted or merged. ++ */ ++static struct bfq_queue *bfq_init_rq(struct request *rq) ++{ ++ struct request_queue *q = rq->q; ++ struct bio *bio = rq->bio; ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic; ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ bool bfqq_already_existing = false, split = false; ++ bool new_queue = false; ++ ++ if (unlikely(!rq->elv.icq)) ++ return NULL; ++ ++ /* ++ * Assuming that elv.priv[1] is set only if everything is set ++ * for this rq. This holds true, because this function is ++ * invoked only for insertion or merging, and, after such ++ * events, a request cannot be manipulated any longer before ++ * being removed from bfq. ++ */ ++ if (rq->elv.priv[1]) { ++ BUG_ON(!(rq->rq_flags & RQF_ELVPRIV)); ++ return rq->elv.priv[1]; ++ } ++ ++ bic = icq_to_bic(rq->elv.icq); ++ ++ bfq_check_ioprio_change(bic, bio); ++ ++ bfq_bic_update_cgroup(bic, bio); ++ ++ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, false, is_sync, ++ &new_queue); ++ ++ if (likely(!new_queue)) { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ BUG_ON(!is_sync); ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ ++ /* Update bic before losing reference to bfqq */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bic->saved_in_large_burst = true; ++ ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ ++ if (!bfqq) ++ bfqq = bfq_get_bfqq_handle_split(bfqd, bic, bio, ++ true, is_sync, ++ NULL); ++ else ++ bfqq_already_existing = true; ++ ++ BUG_ON(!bfqq); ++ BUG_ON(bfqq == &bfqd->oom_bfqq); ++ } ++ } ++ ++ bfqq->allocated++; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "new allocated %d", bfqq->allocated); ++ ++ bfqq->ref++; ++ bfq_log_bfqq(bfqd, bfqq, "%p: bfqq %p, %d", rq, bfqq, bfqq->ref); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ rq->rq_flags &= ~RQF_DISP_LIST; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only this bic: we can then set bfqq->bic = bic. in ++ * addition, if the queue has also just been split, we have to ++ * resume its state. ++ */ ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ /* ++ * The queue has just been split from a shared ++ * queue: restore the idle window and the ++ * possible weight raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bfqd, bic, ++ bfqq_already_existing); ++ } ++ } ++ ++ if (unlikely(bfq_bfqq_just_created(bfqq))) ++ bfq_handle_burst(bfqd, bfqq); ++ ++ rq->rq_flags |= RQF_GOT; ++ ++ return bfqq; ++} ++ ++static void bfq_idle_slice_timer_body(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ enum bfqq_expiration reason; ++ unsigned long flags; ++ ++ BUG_ON(!bfqd); ++ spin_lock_irqsave(&bfqd->lock, flags); ++ ++ bfq_log_bfqq(bfqd, bfqq, "handling slice_timer expiration"); ++ bfq_clear_bfqq_wait_request(bfqq); ++ ++ if (bfqq != bfqd->in_service_queue) { ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ return; ++ } ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ ++schedule_dispatch: ++ spin_unlock_irqrestore(&bfqd->lock, flags); ++ bfq_schedule_dispatch(bfqd); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. ++ */ ++static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) ++{ ++ struct bfq_data *bfqd = container_of(timer, struct bfq_data, ++ idle_slice_timer); ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ ++ bfq_log(bfqd, "expired"); ++ ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if a new request ++ * arrives for the current queue and there is a full dispatch ++ * cycle that changes the in-service queue. This can hardly ++ * happen, but in the worst case we just expire a queue too ++ * early. ++ */ ++ if (bfqq) ++ bfq_idle_slice_timer_body(bfqq); ++ ++ return HRTIMER_NORESTART; ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "%p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++/* ++ * See the comments on bfq_limit_depth for the purpose of ++ * the depths set in the function. Return minimum shallow depth we'll use. ++ */ ++static unsigned int bfq_update_depths(struct bfq_data *bfqd, ++ struct sbitmap_queue *bt) ++{ ++ unsigned int i, j, min_shallow = UINT_MAX; ++ ++ /* ++ * In-word depths if no bfq_queue is being weight-raised: ++ * leaving 25% of tags only for sync reads. ++ * ++ * In next formulas, right-shift the value ++ * (1U<<bt->sb.shift), instead of computing directly ++ * (1U<<(bt->sb.shift - something)), to be robust against ++ * any possible value of bt->sb.shift, without having to ++ * limit 'something'. ++ */ ++ /* no more than 50% of tags for async I/O */ ++ bfqd->word_depths[0][0] = max((1U<<bt->sb.shift)>>1, 1U); ++ /* ++ * no more than 75% of tags for sync writes (25% extra tags ++ * w.r.t. async I/O, to prevent async I/O from starving sync ++ * writes) ++ */ ++ bfqd->word_depths[0][1] = max(((1U<<bt->sb.shift) * 3)>>2, 1U); ++ ++ /* ++ * In-word depths in case some bfq_queue is being weight- ++ * raised: leaving ~63% of tags for sync reads. This is the ++ * highest percentage for which, in our tests, application ++ * start-up times didn't suffer from any regression due to tag ++ * shortage. ++ */ ++ /* no more than ~18% of tags for async I/O */ ++ bfqd->word_depths[1][0] = max(((1U<<bt->sb.shift) * 3)>>4, 1U); ++ /* no more than ~37% of tags for sync writes (~20% extra tags) */ ++ bfqd->word_depths[1][1] = max(((1U<<bt->sb.shift) * 6)>>4, 1U); ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < 2; j++) ++ min_shallow = min(min_shallow, bfqd->word_depths[i][j]); ++ ++ return min_shallow; ++} ++ ++static void bfq_depth_updated(struct blk_mq_hw_ctx *hctx) ++{ ++ struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; ++ struct blk_mq_tags *tags = hctx->sched_tags; ++ unsigned int min_shallow; ++ ++ min_shallow = bfq_update_depths(bfqd, &tags->bitmap_tags); ++ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, min_shallow); ++} ++ ++static int bfq_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int index) ++{ ++ bfq_depth_updated(hctx); ++ return 0; ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_log(bfqd, "starting ..."); ++ ++ hrtimer_cancel(&bfqd->idle_slice_timer); ++ ++ BUG_ON(bfqd->in_service_queue); ++ BUG_ON(!list_empty(&bfqd->active_list)); ++ ++ spin_lock_irq(&bfqd->lock); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ spin_unlock_irq(&bfqd->lock); ++ ++ hrtimer_cancel(&bfqd->idle_slice_timer); ++ ++ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ /* release oom-queue reference to root group */ ++ bfqg_and_blkg_put(bfqd->root_group); ++ ++ blkcg_deactivate_policy(bfqd->queue, &blkcg_policy_bfq); ++#else ++ spin_lock_irq(&bfqd->lock); ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++ kfree(bfqd->root_group); ++ spin_unlock_irq(&bfqd->lock); ++#endif ++ ++ bfq_log(bfqd, "finished ..."); ++ kfree(bfqd); ++} ++ ++static void bfq_init_root_group(struct bfq_group *root_group, ++ struct bfq_data *bfqd) ++{ ++ int i; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ root_group->entity.parent = NULL; ++ root_group->my_entity = NULL; ++ root_group->bfqd = bfqd; ++#endif ++ root_group->rq_pos_tree = RB_ROOT; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ root_group->sched_data.bfq_class_idle_last_service = jiffies; ++} ++ ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ++{ ++ struct bfq_data *bfqd; ++ struct elevator_queue *eq; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) ++ return -ENOMEM; ++ ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); ++ if (!bfqd) { ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++ } ++ eq->elevator_data = bfqd; ++ ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfqd->oom_bfqq.ref++; ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ ++ /* oom_bfqq does not participate to bursts */ ++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. ++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ INIT_LIST_HEAD(&bfqd->dispatch); ++ ++ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->num_groups_with_pending_reqs = 0; ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_timeout = bfq_timeout; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); ++ ++ bfqd->low_latency = true; ++ ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device peak ++ * rate is equal to 2/3 of the highest reference rate. ++ */ ++ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * ++ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; ++ ++ spin_lock_init(&bfqd->lock); ++ ++ /* ++ * The invocation of the next bfq_create_group_hierarchy ++ * function is the head of a chain of function calls ++ * (bfq_create_group_hierarchy->blkcg_activate_policy-> ++ * blk_mq_freeze_queue) that may lead to the invocation of the ++ * has_work hook function. For this reason, ++ * bfq_create_group_hierarchy is invoked only after all ++ * scheduler data has been initialized, apart from the fields ++ * that can be initialized only after invoking ++ * bfq_create_group_hierarchy. This, in particular, enables ++ * has_work to correctly return false. Of course, to avoid ++ * other inconsistencies, the blk-mq stack must then refrain ++ * from invoking further scheduler hooks before this init ++ * function is finished. ++ */ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ ++ wbt_disable_default(q); ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_slab_kill(void) ++{ ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%u\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? ++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(&bfqd->lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, ", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1]); ++ num_char += sprintf(page + num_char, ++ "dur %d/%u\n", ++ jiffies_to_msecs( ++ jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(&bfqd->lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ if (__CONV == 1) \ ++ __data = jiffies_to_msecs(__data); \ ++ else if (__CONV == 2) \ ++ __data = div_u64(__data, NSEC_PER_MSEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ __data = div_u64(__data, NSEC_PER_USEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); ++#undef USEC_SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV == 1) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else if (__CONV == 2) \ ++ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ++static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ ++ return ret; \ ++} ++USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ++ UINT_MAX); ++#undef USEC_STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. ++ */ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (!bfqd->strict_guarantees && __data == 1 ++ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) ++ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; ++ ++ bfqd->strict_guarantees = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(slice_idle_us), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(strict_guarantees), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq_mq = { ++ .ops.mq = { ++ .limit_depth = bfq_limit_depth, ++ .prepare_request = bfq_prepare_request, ++ .requeue_request = bfq_finish_requeue_request, ++ .finish_request = bfq_finish_requeue_request, ++ .exit_icq = bfq_exit_icq, ++ .insert_requests = bfq_insert_requests, ++ .dispatch_request = bfq_dispatch_request, ++ .next_request = elv_rb_latter_request, ++ .former_request = elv_rb_former_request, ++ .allow_merge = bfq_allow_bio_merge, ++ .bio_merge = bfq_bio_merge, ++ .request_merge = bfq_request_merge, ++ .requests_merged = bfq_requests_merged, ++ .request_merged = bfq_request_merged, ++ .has_work = bfq_has_work, ++ .depth_updated = bfq_depth_updated, ++ .init_hctx = bfq_init_hctx, ++ .init_sched = bfq_init_queue, ++ .exit_sched = bfq_exit_queue, ++ }, ++ ++ .uses_mq = true, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq-mq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfq_blkg_files, ++ .legacy_cftypes = bfq_blkcg_legacy_files, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, ++ .cpd_init_fn = bfq_cpd_init, ++ .cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = bfq_pd_reset_stats, ++}; ++#endif ++ ++static int __init bfq_init(void) ++{ ++ int ret; ++ char msg[60] = "BFQ I/O-scheduler: v9"; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ret = blkcg_policy_register(&blkcg_policy_bfq); ++ if (ret) ++ return ret; ++#endif ++ ++ ret = -ENOMEM; ++ if (bfq_slab_setup()) ++ goto err_pol_unreg; ++ ++ /* ++ * Times to load large popular applications for the typical ++ * systems installed on the reference devices (see the ++ * comments before the definition of the next ++ * array). Actually, we use slightly lower values, as the ++ * estimated peak rate tends to be smaller than the actual ++ * peak rate. The reason for this last fact is that estimates ++ * are computed over much shorter time intervals than the long ++ * intervals typically used for benchmarking. Why? First, to ++ * adapt more quickly to variations. Second, because an I/O ++ * scheduler cannot rely on a peak-rate-evaluation workload to ++ * be run for a long time. ++ */ ++ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ ++ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ ++ ++ ret = elv_register(&iosched_bfq_mq); ++ if (ret) ++ goto slab_kill; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ ++ return 0; ++ ++slab_kill: ++ bfq_slab_kill(); ++err_pol_unreg: ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ return ret; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq_mq); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Paolo Valente"); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION("MQ Budget Fair Queueing I/O Scheduler"); +diff --git a/block/bfq-mq.h b/block/bfq-mq.h +new file mode 100644 +index 000000000000..ceb291132a1a +--- /dev/null ++++ b/block/bfq-mq.h +@@ -0,0 +1,1077 @@ ++/* ++ * BFQ v9: data structures and common functions prototypes. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include <linux/hrtimer.h> ++#include <linux/blk-cgroup.h> ++ ++/* see comments on CONFIG_BFQ_GROUP_IOSCHED in bfq.h */ ++#ifdef CONFIG_MQ_BFQ_GROUP_IOSCHED ++#define BFQ_GROUP_IOSCHED_ENABLED ++#endif ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++#define BFQ_WEIGHT_CONVERSION_COEFF 10 ++ ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ ++#define BFQ_WEIGHT_LEGACY_DFL 100 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++/* ++ * Soft real-time applications are extremely more latency sensitive ++ * than interactive ones. Over-raise the weight of the former to ++ * privilege them against the latter. ++ */ ++#define BFQ_SOFTRT_WEIGHT_FACTOR 100 ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ /* tree for active entities (i.e., those backlogged) */ ++ struct rb_root active; ++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ ++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ ++ ++ u64 vtime; /* scheduler virtual time */ ++ /* scheduler weight sum; active and idle entities contribute to it */ ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as an ++ * intermediate queue in a hierarchical setup. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * ++ * The schedule is implemented by the service trees, plus the field ++ * @next_in_service, which points to the entity on the active trees ++ * that will be served next, if 1) no changes in the schedule occurs ++ * before the current in-service entity is expired, 2) the in-service ++ * queue becomes idle when it expires, and 3) if the entity pointed by ++ * in_service_entity is not a queue, then the in-service child entity ++ * of the entity pointed by in_service_entity becomes idle on ++ * expiration. This peculiar definition allows for the following ++ * optimization, not yet exploited: while a given entity is still in ++ * service, we already know which is the best candidate for next ++ * service among the other active entitities in the same parent ++ * entity. We can then quickly compare the timestamps of the ++ * in-service entity with those of such best candidate. ++ * ++ * All the fields are protected by the queue lock of the containing ++ * bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *in_service_entity; /* entity in service */ ++ /* head-of-the-line entity in the scheduler (see comments above) */ ++ struct bfq_entity *next_in_service; ++ /* array of service trees, one per ioprio_class */ ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++ /* last time CLASS_IDLE was served */ ++ unsigned long bfq_class_idle_last_service; ++ ++}; ++ ++/** ++ * struct bfq_weight_counter - counter of the number of all active queues ++ * with a given weight. ++ */ ++struct bfq_weight_counter { ++ unsigned int weight; /* weight of the queues this counter refers to */ ++ unsigned int num_active; /* nr of active queues with this weight */ ++ /* ++ * Weights tree member (see bfq_data's @queue_weights_tree) ++ */ ++ struct rb_node weights_node; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; /* service_tree member */ ++ ++ /* ++ * Flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree) or is in service. ++ */ ++ bool on_st; ++ ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ ++ ++ /* tree the entity is enqueued into; %NULL if not on a tree */ ++ struct rb_root *tree; ++ ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ ++ u64 min_start; ++ ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; ++ ++ /* parent entity, for hierarchical scheduling */ ++ struct bfq_entity *parent; ++ ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ ++ struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ ++ struct bfq_sched_data *sched_data; ++ ++ /* flag, set to request a weight, ioprio or ioprio_class change */ ++ int prio_changed; ++ ++ /* flag, set if the entity is counted in groups_with_pending_reqs */ ++ bool in_groups_with_pending_reqs; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_queue { ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ ++ struct bfq_data *bfqd; ++ ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; ++ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. ++ */ ++ struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ ++ struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ ++ struct rb_root *pos_root; ++ ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* number of sync and async requests queued */ ++ int queued[2]; ++ /* number of requests currently allocated */ ++ int allocated; ++ /* number of pending metadata requests */ ++ int meta_pending; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ /* entity representing this queue in the scheduler */ ++ struct bfq_entity entity; ++ ++ /* pointer to the weight counter associated with this queue */ ++ struct bfq_weight_counter *weight_counter; ++ ++ /* maximum budget allowed from the feedback mechanism */ ++ int max_budget; ++ /* budget expiration (in jiffies) */ ++ unsigned long budget_timeout; ++ ++ /* number of requests on the dispatch list or inside driver */ ++ int dispatched; ++ ++ unsigned int flags; /* status flags.*/ ++ ++ /* node for active/idle bfqq list inside parent bfqd */ ++ struct list_head bfqq_list; ++ ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ ++ /* bit vector: a 1 for each seeky requests in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ ++ struct hlist_node burst_list_node; ++ ++ /* position of the last request enqueued */ ++ sector_t last_request_pos; ++ ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. ++ */ ++ unsigned int requests_within_timer; ++ ++ /* pid of the process owning the queue, used for logging purposes */ ++ pid_t pid; ++ ++ /* ++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL ++ * if the queue is shared. ++ */ ++ struct bfq_io_cq *bic; ++ ++ /* current maximum weight-raising time for this queue */ ++ unsigned long wr_cur_max_time; ++ /* ++ * Minimum time instant such that, only if a new request is ++ * enqueued after this time instant in an idle @bfq_queue with ++ * no outstanding requests, then the task associated with the ++ * queue it is deemed as soft real-time (see the comments on ++ * the function bfq_bfqq_softrt_next_start()) ++ */ ++ unsigned long soft_rt_next_start; ++ /* ++ * Start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period. ++ */ ++ unsigned long last_wr_start_finish; ++ /* factor by which the weight of this queue is multiplied */ ++ unsigned int wr_coeff; ++ /* ++ * Time of the last transition of the @bfq_queue from idle to ++ * backlogged. ++ */ ++ unsigned long last_idle_bklogged; ++ /* ++ * Cumulative service received from the @bfq_queue since the ++ * last transition from idle to backlogged. ++ */ ++ unsigned long service_from_backlogged; ++ /* ++ * Cumulative service received from the @bfq_queue since its ++ * last transition to weight-raised state. ++ */ ++ unsigned long service_from_wr; ++ /* ++ * Value of wr start time when switching to soft rt ++ */ ++ unsigned long wr_start_at_switch_to_srt; ++ ++ unsigned long split_time; /* time of last split */ ++ unsigned long first_IO_time; /* time of first I/O for this queue */ ++ ++ /* max service rate measured so far */ ++ u32 max_service_rate; ++ /* ++ * Ratio between the service received by bfqq while it is in ++ * service, and the cumulative service (of requests of other ++ * queues) that may be injected while bfqq is empty but still ++ * in service. To increase precision, the coefficient is ++ * measured in tenths of unit. Here are some example of (1) ++ * ratios, (2) resulting percentages of service injected ++ * w.r.t. to the total service dispatched while bfqq is in ++ * service, and (3) corresponding values of the coefficient: ++ * 1 (50%) -> 10 ++ * 2 (33%) -> 20 ++ * 10 (9%) -> 100 ++ * 9.9 (9%) -> 99 ++ * 1.5 (40%) -> 15 ++ * 0.5 (66%) -> 5 ++ * 0.1 (90%) -> 1 ++ * ++ * So, if the coefficient is lower than 10, then ++ * injected service is more than bfqq service. ++ */ ++ unsigned int inject_coeff; ++ /* amount of service injected in current service slot */ ++ unsigned int injected_service; ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* array of two process queues, the sync and the async */ ++ struct bfq_queue *bfqq[2]; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Snapshot of the has_short_time flag before merging; taken ++ * to remember its value while the queue is merged, so as to ++ * be able to restore it in case of split. ++ */ ++ bool saved_has_short_ttime; ++ /* ++ * Same purpose as the previous two fields for the I/O bound ++ * classification of a queue. ++ */ ++ bool saved_IO_bound; ++ ++ /* ++ * Same purpose as the previous fields for the value of the ++ * field keeping the queue's belonging to a large burst ++ */ ++ bool saved_in_large_burst; ++ /* ++ * True if the queue belonged to a burst list before its merge ++ * with another cooperating queue. ++ */ ++ bool was_in_burst_list; ++ ++ /* ++ * Similar to previous fields: save wr information. ++ */ ++ unsigned long saved_wr_coeff; ++ unsigned long saved_last_wr_start_finish; ++ unsigned long saved_wr_start_at_switch_to_srt; ++ unsigned int saved_wr_cur_max_time; ++ struct bfq_ttime saved_ttime; ++}; ++ ++/** ++ * struct bfq_data - per-device data structure. ++ * ++ * All the fields are protected by @lock. ++ */ ++struct bfq_data { ++ /* device request queue */ ++ struct request_queue *queue; ++ /* dispatch queue */ ++ struct list_head dispatch; ++ ++ /* root bfq_group for the device */ ++ struct bfq_group *root_group; ++ ++ /* ++ * rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active and not ++ * weight-raised @bfq_queue (see the comments to the functions ++ * bfq_weights_tree_[add|remove] for further details). ++ */ ++ struct rb_root queue_weights_tree; ++ ++ /* ++ * Number of groups with at least one descendant process that ++ * has at least one request waiting for completion. Note that ++ * this accounts for also requests already dispatched, but not ++ * yet completed. Therefore this number of groups may differ ++ * (be larger) than the number of active groups, as a group is ++ * considered active only if its corresponding entity has ++ * descendant queues with at least one request queued. This ++ * number is used to decide whether a scenario is symmetric. ++ * For a detailed explanation see comments on the computation ++ * of the variable asymmetric_scenario in the function ++ * bfq_better_to_idle(). ++ * ++ * However, it is hard to compute this number exactly, for ++ * groups with multiple descendant processes. Consider a group ++ * that is inactive, i.e., that has no descendant process with ++ * pending I/O inside BFQ queues. Then suppose that ++ * num_groups_with_pending_reqs is still accounting for this ++ * group, because the group has descendant processes with some ++ * I/O request still in flight. num_groups_with_pending_reqs ++ * should be decremented when the in-flight request of the ++ * last descendant process is finally completed (assuming that ++ * nothing else has changed for the group in the meantime, in ++ * terms of composition of the group and active/inactive state of child ++ * groups and processes). To accomplish this, an additional ++ * pending-request counter must be added to entities, and must ++ * be updated correctly. To avoid this additional field and operations, ++ * we resort to the following tradeoff between simplicity and ++ * accuracy: for an inactive group that is still counted in ++ * num_groups_with_pending_reqs, we decrement ++ * num_groups_with_pending_reqs when the first descendant ++ * process of the group remains with no request waiting for ++ * completion. ++ * ++ * Even this simpler decrement strategy requires a little ++ * carefulness: to avoid multiple decrements, we flag a group, ++ * more precisely an entity representing a group, as still ++ * counted in num_groups_with_pending_reqs when it becomes ++ * inactive. Then, when the first descendant queue of the ++ * entity remains with no request waiting for completion, ++ * num_groups_with_pending_reqs is decremented, and this flag ++ * is reset. After this flag is reset for the entity, ++ * num_groups_with_pending_reqs won't be decremented any ++ * longer in case a new descendant queue of the entity remains ++ * with no request waiting for completion. ++ */ ++ unsigned int num_groups_with_pending_reqs; ++ ++ /* ++ * Per-class (RT, BE, IDLE) number of bfq_queues containing ++ * requests (including the queue in service, even if it is ++ * idling). ++ */ ++ unsigned int busy_queues[3]; ++ /* number of weight-raised busy @bfq_queues */ ++ int wr_busy_queues; ++ /* number of queued requests */ ++ int queued; ++ /* number of requests dispatched and waiting for completion */ ++ int rq_in_driver; ++ ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. ++ */ ++ int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ ++ int hw_tag_samples; ++ /* flag set to one if the driver is showing a queueing behavior */ ++ int hw_tag; ++ ++ /* number of budgets assigned */ ++ int budgets_assigned; ++ ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service. ++ */ ++ struct hrtimer idle_slice_timer; ++ ++ /* bfq_queue in service */ ++ struct bfq_queue *in_service_queue; ++ ++ /* on-disk position of the last served request */ ++ sector_t last_position; ++ ++ /* position of the last served request for the in-service queue */ ++ sector_t in_serv_last_pos; ++ ++ /* time of last request completion (ns) */ ++ u64 last_completion; ++ ++ /* time of first rq dispatch in current observation interval (ns) */ ++ u64 first_dispatch; ++ /* time of last rq dispatch in current observation interval (ns) */ ++ u64 last_dispatch; ++ ++ /* beginning of the last budget */ ++ ktime_t last_budget_start; ++ /* beginning of the last idle slice */ ++ ktime_t last_idling_start; ++ ++ /* number of samples in current observation interval */ ++ int peak_rate_samples; ++ /* num of samples of seq dispatches in current observation interval */ ++ u32 sequential_samples; ++ /* total num of sectors transferred in current observation interval */ ++ u64 tot_sectors_dispatched; ++ /* max rq size seen during current observation interval (sectors) */ ++ u32 last_rq_max_size; ++ /* time elapsed from first dispatch in current observ. interval (us) */ ++ u64 delta_from_first; ++ /* ++ * Current estimate of the device peak rate, measured in ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by ++ * BFQ_RATE_SHIFT is performed to increase precision in ++ * fixed-point calculations. ++ */ ++ u32 peak_rate; ++ ++ /* maximum budget allotted to a bfq_queue before rescheduling */ ++ int bfq_max_budget; ++ ++ /* list of all the bfq_queues active on the device */ ++ struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ ++ struct list_head idle_list; ++ ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ ++ u64 bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ ++ unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ ++ unsigned int bfq_back_max; ++ /* maximum idling time */ ++ u32 bfq_slice_idle; ++ ++ /* user-configured max budget value (0 for auto-tuning) */ ++ int bfq_user_max_budget; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ */ ++ unsigned int bfq_requests_within_timer; ++ ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. ++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. ++ */ ++ unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. ++ */ ++ unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ ++ int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ ++ unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ ++ bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ ++ struct hlist_head burst_list; ++ ++ /* if set to true, low-latency heuristics are enabled */ ++ bool low_latency; ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ ++ unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ ++ unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ ++ unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ ++ unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ ++ unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ ++ unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product ref_rate*ref_wr_duration, used ++ * for computing the maximum duration of weight raising ++ * automatically. ++ */ ++ u64 rate_dur_prod; ++ ++ /* fallback dummy bfqq for extreme OOM conditions */ ++ struct bfq_queue oom_bfqq; ++ ++ spinlock_t lock; ++ ++ /* ++ * bic associated with the task issuing current bio for ++ * merging. This and the next field are used as a support to ++ * be able to perform the bic lookup, needed by bio-merge ++ * functions, before the scheduler lock is taken, and thus ++ * avoid taking the request-queue lock while the scheduler ++ * lock is being held. ++ */ ++ struct bfq_io_cq *bio_bic; ++ /* bfqq associated with the task issuing current bio for merging */ ++ struct bfq_queue *bio_bfqq; ++ /* Extra flag used only for TESTING */ ++ bool bio_bfqq_set; ++ ++ /* ++ * Depth limits used in bfq_limit_depth (see comments on the ++ * function) ++ */ ++ unsigned int word_depths[2][2]; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. ++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(just_created); ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(non_blocking_wait_rq); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(has_short_ttime); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(softrt_update); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE ++ ++static const char *checked_dev_name(const struct device *dev) ++{ ++ static const char nodev[] = "nodev"; ++ ++ if (dev) ++ return dev_name(dev); ++ ++ return nodev; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ pr_crit("%s %s [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ bfqg->blkg_path, __func__, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ pr_crit("%s bfq [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ __func__, ##args) ++ ++#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++#if !defined(CONFIG_BLK_DEV_IO_TRACE) ++ ++/* Avoid possible "unused-variable" warning. See commit message. */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) ++ ++#define bfq_log(bfqd, fmt, args...) do {} while (0) ++ ++#else /* CONFIG_BLK_DEV_IO_TRACE */ ++ ++#include <linux/blktrace_api.h> ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ bfqq_group(bfqq)->blkg_path, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, bfqg->blkg_path, \ ++ __func__, ##args);\ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) ++ ++#endif /* CONFIG_BLK_DEV_IO_TRACE */ ++#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ ++}; ++ ++ ++struct bfqg_stats { ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ u64 start_group_wait_time; ++ u64 start_idle_time; ++ u64 start_empty_time; ++ uint16_t flags; ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned int weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_may_idle()). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ ++ char blkg_path[128]; ++ ++ /* reference counter (see comments in bfq_bic_update_cgroup) */ ++ int ref; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct rb_root rq_pos_tree; ++ ++ struct bfqg_stats stats; ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static unsigned int bfq_class_idx(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ return bfqq ? bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS - 1; ++} ++ ++static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) ++{ ++ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + ++ bfqd->busy_queues[2]; ++} ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfq_class_idx(entity); ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++ } ++#endif ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* _BFQ_H */ +diff --git a/block/bfq-sched.c b/block/bfq-sched.c +new file mode 100644 +index 000000000000..7a4923231106 +--- /dev/null ++++ b/block/bfq-sched.c +@@ -0,0 +1,2077 @@ ++/* ++ * BFQ: Hierarchical B-WF2Q+ scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2016 Paolo Valente <paolo.valente@linaro.org> ++ */ ++ ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++ ++/** ++ * bfq_gt - compare two timestamps. ++ * @a: first ts. ++ * @b: second ts. ++ * ++ * Return @a > @b, dealing with wrapping correctly. ++ */ ++static int bfq_gt(u64 a, u64 b) ++{ ++ return (s64)(a - b) > 0; ++} ++ ++static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) ++{ ++ struct rb_node *node = tree->rb_node; ++ ++ return rb_entry(node, struct bfq_entity, rb_node); ++} ++ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ bool expiration); ++ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); ++ ++/** ++ * bfq_update_next_in_service - update sd->next_in_service ++ * @sd: sched_data for which to perform the update. ++ * @new_entity: if not NULL, pointer to the entity whose activation, ++ * requeueing or repositionig triggered the invocation of ++ * this function. ++ * @expiration: id true, this function is being invoked after the ++ * expiration of the in-service entity ++ * ++ * This function is called to update sd->next_in_service, which, in ++ * its turn, may change as a consequence of the insertion or ++ * extraction of an entity into/from one of the active trees of ++ * sd. These insertions/extractions occur as a consequence of ++ * activations/deactivations of entities, with some activations being ++ * 'true' activations, and other activations being requeueings (i.e., ++ * implementing the second, requeueing phase of the mechanism used to ++ * reposition an entity in its active tree; see comments on ++ * __bfq_activate_entity and __bfq_requeue_entity for details). In ++ * both the last two activation sub-cases, new_entity points to the ++ * just activated or requeued entity. ++ * ++ * Returns true if sd->next_in_service changes in such a way that ++ * entity->parent may become the next_in_service for its parent ++ * entity. ++ */ ++static bool bfq_update_next_in_service(struct bfq_sched_data *sd, ++ struct bfq_entity *new_entity, ++ bool expiration) ++{ ++ struct bfq_entity *next_in_service = sd->next_in_service; ++ struct bfq_queue *bfqq; ++ bool parent_sched_may_change = false; ++ bool change_without_lookup = false; ++ ++ /* ++ * If this update is triggered by the activation, requeueing ++ * or repositiong of an entity that does not coincide with ++ * sd->next_in_service, then a full lookup in the active tree ++ * can be avoided. In fact, it is enough to check whether the ++ * just-modified entity has the same priority as ++ * sd->next_in_service, is eligible and has a lower virtual ++ * finish time than sd->next_in_service. If this compound ++ * condition holds, then the new entity becomes the new ++ * next_in_service. Otherwise no change is needed. ++ */ ++ if (new_entity && new_entity != sd->next_in_service) { ++ /* ++ * Flag used to decide whether to replace ++ * sd->next_in_service with new_entity. Tentatively ++ * set to true, and left as true if ++ * sd->next_in_service is NULL. ++ */ ++ change_without_lookup = true; ++ ++ /* ++ * If there is already a next_in_service candidate ++ * entity, then compare timestamps to decide whether ++ * to replace sd->service_tree with new_entity. ++ */ ++ if (next_in_service) { ++ unsigned int new_entity_class_idx = ++ bfq_class_idx(new_entity); ++ struct bfq_service_tree *st = ++ sd->service_tree + new_entity_class_idx; ++ ++ change_without_lookup = ++ (new_entity_class_idx == ++ bfq_class_idx(next_in_service) ++ && ++ !bfq_gt(new_entity->start, st->vtime) ++ && ++ bfq_gt(next_in_service->finish, ++ new_entity->finish)); ++ } ++ ++ if (change_without_lookup) { ++ next_in_service = new_entity; ++ bfqq = bfq_entity_to_bfqq(next_in_service); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "chose without lookup"); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(next_in_service, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data*)bfqg->bfqd, bfqg, ++ "chose without lookup"); ++ } ++#endif ++ } ++ } ++ ++ if (!change_without_lookup) /* lookup needed */ ++ next_in_service = bfq_lookup_next_entity(sd, expiration); ++ ++ if (next_in_service) { ++ bool new_budget_triggers_change = ++ bfq_update_parent_budget(next_in_service); ++ ++ parent_sched_may_change = !sd->next_in_service || ++ new_budget_triggers_change; ++ } ++ ++ sd->next_in_service = next_in_service; ++ ++ if (!next_in_service) ++ return parent_sched_may_change; ++ ++ bfqq = bfq_entity_to_bfqq(next_in_service); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "chosen this queue"); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(next_in_service, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "chosen this entity"); ++ } ++#endif ++ return parent_sched_may_change; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* both next loops stop at one of the child entities of the root group */ ++#define for_each_entity(entity) \ ++ for (; entity ; entity = entity->parent) ++ ++/* ++ * For each iteration, compute parent in advance, so as to be safe if ++ * entity is deallocated during the iteration. Such a deallocation may ++ * happen as a consequence of a bfq_put_queue that frees the bfq_queue ++ * containing entity. ++ */ ++#define for_each_entity_safe(entity, parent) \ ++ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) ++ ++/* ++ * Returns true if this budget changes may let next_in_service->parent ++ * become the next_in_service entity for its parent entity. ++ */ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) ++{ ++ struct bfq_entity *bfqg_entity; ++ struct bfq_group *bfqg; ++ struct bfq_sched_data *group_sd; ++ bool ret = false; ++ ++ BUG_ON(!next_in_service); ++ ++ group_sd = next_in_service->sched_data; ++ ++ bfqg = container_of(group_sd, struct bfq_group, sched_data); ++ /* ++ * bfq_group's my_entity field is not NULL only if the group ++ * is not the root group. We must not touch the root entity ++ * as it must never become an in-service entity. ++ */ ++ bfqg_entity = bfqg->my_entity; ++ if (bfqg_entity) { ++ if (bfqg_entity->budget > next_in_service->budget) ++ ret = true; ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "old budg: %d, new budg: %d", ++ bfqg_entity->budget, next_in_service->budget); ++ bfqg_entity->budget = next_in_service->budget; ++ } ++ ++ return ret; ++} ++ ++/* ++ * This function tells whether entity stops being a candidate for next ++ * service, according to the restrictive definition of the field ++ * next_in_service. In particular, this function is invoked for an ++ * entity that is about to be set in service. ++ * ++ * If entity is a queue, then the entity is no longer a candidate for ++ * next service according to the that definition, because entity is ++ * about to become the in-service queue. This function then returns ++ * true if entity is a queue. ++ * ++ * In contrast, entity could still be a candidate for next service if ++ * it is not a queue, and has more than one active child. In fact, ++ * even if one of its children is about to be set in service, other ++ * active children may still be the next to serve, for the parent ++ * entity, even according to the above definition. As a consequence, a ++ * non-queue entity is not a candidate for next-service only if it has ++ * only one active child. And only if this condition holds, then this ++ * function returns true for a non-queue entity. ++ */ ++static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) ++{ ++ struct bfq_group *bfqg; ++ ++ if (bfq_entity_to_bfqq(entity)) ++ return true; ++ ++ bfqg = container_of(entity, struct bfq_group, entity); ++ ++ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); ++ BUG_ON(bfqg->active_entities == 0); ++ /* ++ * The field active_entities does not always contain the ++ * actual number of active children entities: it happens to ++ * not account for the in-service entity in case the latter is ++ * removed from its active tree (which may get done after ++ * invoking the function bfq_no_longer_next_in_service in ++ * bfq_get_next_queue). Fortunately, here, i.e., while ++ * bfq_no_longer_next_in_service is not yet completed in ++ * bfq_get_next_queue, bfq_active_extract has not yet been ++ * invoked, and thus active_entities still coincides with the ++ * actual number of active entities. ++ */ ++ if (bfqg->active_entities == 1) ++ return true; ++ ++ return false; ++} ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++#define for_each_entity(entity) \ ++ for (; entity ; entity = NULL) ++ ++#define for_each_entity_safe(entity, parent) \ ++ for (parent = NULL; entity ; entity = parent) ++ ++static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) ++{ ++ return false; ++} ++ ++static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) ++{ ++ return true; ++} ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++/* ++ * Shift for timestamp calculations. This actually limits the maximum ++ * service allowed in one timestamp delta (small shift values increase it), ++ * the maximum total weight that can be used for the queues in the system ++ * (big shift values increase it), and the period of virtual time ++ * wraparounds. ++ */ ++#define WFQ_SERVICE_SHIFT 22 ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = NULL; ++ ++ BUG_ON(!entity); ++ ++ if (!entity->my_sched_data) ++ bfqq = container_of(entity, struct bfq_queue, entity); ++ ++ return bfqq; ++} ++ ++ ++/** ++ * bfq_delta - map service into the virtual time domain. ++ * @service: amount of service. ++ * @weight: scale factor (weight of an entity or weight sum). ++ */ ++static u64 bfq_delta(unsigned long service, unsigned long weight) ++{ ++ u64 d = (u64)service << WFQ_SERVICE_SHIFT; ++ ++ do_div(d, weight); ++ return d; ++} ++ ++/** ++ * bfq_calc_finish - assign the finish time to an entity. ++ * @entity: the entity to act upon. ++ * @service: the service to be charged to the entity. ++ */ ++static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned long long start, finish, delta; ++ ++ BUG_ON(entity->weight == 0); ++ ++ entity->finish = entity->start + ++ bfq_delta(service, entity->weight); ++ ++ start = ((entity->start>>10)*1000)>>12; ++ finish = ((entity->finish>>10)*1000)>>12; ++ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "group: serv %lu, w %d", ++ service, entity->weight); ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "group: start %llu, finish %llu, delta %llu", ++ start, finish, delta); ++#endif ++ } ++} ++ ++/** ++ * bfq_entity_of - get an entity from a node. ++ * @node: the node field of the entity. ++ * ++ * Convert a node pointer to the relative entity. This is used only ++ * to simplify the logic of some functions and not as the generic ++ * conversion mechanism because, e.g., in the tree walking functions, ++ * the check for a %NULL value would be redundant. ++ */ ++static struct bfq_entity *bfq_entity_of(struct rb_node *node) ++{ ++ struct bfq_entity *entity = NULL; ++ ++ if (node) ++ entity = rb_entry(node, struct bfq_entity, rb_node); ++ ++ return entity; ++} ++ ++/** ++ * bfq_extract - remove an entity from a tree. ++ * @root: the tree root. ++ * @entity: the entity to remove. ++ */ ++static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) ++{ ++ BUG_ON(entity->tree != root); ++ ++ entity->tree = NULL; ++ rb_erase(&entity->rb_node, root); ++} ++ ++/** ++ * bfq_idle_extract - extract an entity from the idle tree. ++ * @st: the service tree of the owning @entity. ++ * @entity: the entity being removed. ++ */ ++static void bfq_idle_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *next; ++ ++ BUG_ON(entity->tree != &st->idle); ++ ++ if (entity == st->first_idle) { ++ next = rb_next(&entity->rb_node); ++ st->first_idle = bfq_entity_of(next); ++ } ++ ++ if (entity == st->last_idle) { ++ next = rb_prev(&entity->rb_node); ++ st->last_idle = bfq_entity_of(next); ++ } ++ ++ bfq_extract(&st->idle, entity); ++ ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++} ++ ++/** ++ * bfq_insert - generic tree insertion. ++ * @root: tree root. ++ * @entity: entity to insert. ++ * ++ * This is used for the idle and the active tree, since they are both ++ * ordered by finish time. ++ */ ++static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) ++{ ++ struct bfq_entity *entry; ++ struct rb_node **node = &root->rb_node; ++ struct rb_node *parent = NULL; ++ ++ BUG_ON(entity->tree); ++ ++ while (*node) { ++ parent = *node; ++ entry = rb_entry(parent, struct bfq_entity, rb_node); ++ ++ if (bfq_gt(entry->finish, entity->finish)) ++ node = &parent->rb_left; ++ else ++ node = &parent->rb_right; ++ } ++ ++ rb_link_node(&entity->rb_node, parent, node); ++ rb_insert_color(&entity->rb_node, root); ++ ++ entity->tree = root; ++} ++ ++/** ++ * bfq_update_min - update the min_start field of a entity. ++ * @entity: the entity to update. ++ * @node: one of its children. ++ * ++ * This function is called when @entity may store an invalid value for ++ * min_start due to updates to the active tree. The function assumes ++ * that the subtree rooted at @node (which may be its left or its right ++ * child) has a valid min_start value. ++ */ ++static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) ++{ ++ struct bfq_entity *child; ++ ++ if (node) { ++ child = rb_entry(node, struct bfq_entity, rb_node); ++ if (bfq_gt(entity->min_start, child->min_start)) ++ entity->min_start = child->min_start; ++ } ++} ++ ++/** ++ * bfq_update_active_node - recalculate min_start. ++ * @node: the node to update. ++ * ++ * @node may have changed position or one of its children may have moved, ++ * this function updates its min_start value. The left and right subtrees ++ * are assumed to hold a correct min_start value. ++ */ ++static void bfq_update_active_node(struct rb_node *node) ++{ ++ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ entity->min_start = entity->start; ++ bfq_update_min(entity, node->rb_right); ++ bfq_update_min(entity, node->rb_left); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "new min_start %llu", ++ ((entity->min_start>>10)*1000)>>12); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "new min_start %llu", ++ ((entity->min_start>>10)*1000)>>12); ++#endif ++ } ++} ++ ++/** ++ * bfq_update_active_tree - update min_start for the whole active tree. ++ * @node: the starting node. ++ * ++ * @node must be the deepest modified node after an update. This function ++ * updates its min_start using the values held by its children, assuming ++ * that they did not change, and then updates all the nodes that may have ++ * changed in the path to the root. The only nodes that may have changed ++ * are the ones in the path or their siblings. ++ */ ++static void bfq_update_active_tree(struct rb_node *node) ++{ ++ struct rb_node *parent; ++ ++up: ++ bfq_update_active_node(node); ++ ++ parent = rb_parent(node); ++ if (!parent) ++ return; ++ ++ if (node == parent->rb_left && parent->rb_right) ++ bfq_update_active_node(parent->rb_right); ++ else if (parent->rb_left) ++ bfq_update_active_node(parent->rb_left); ++ ++ node = parent; ++ goto up; ++} ++ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root); ++ ++static void __bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root); ++ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq); ++ ++ ++/** ++ * bfq_active_insert - insert an entity in the active tree of its ++ * group/device. ++ * @st: the service tree of the entity. ++ * @entity: the entity being inserted. ++ * ++ * The active tree is ordered by finish time, but an extra key is kept ++ * per each node, containing the minimum value for the start times of ++ * its children (and the node itself), so it's possible to search for ++ * the eligible node with the lowest finish time in logarithmic time. ++ */ ++static void bfq_active_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node = &entity->rb_node; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ bfq_insert(&st->active, entity); ++ ++ if (node->rb_left) ++ node = node->rb_left; ++ else if (node->rb_right) ++ node = node->rb_right; ++ ++ bfq_update_active_tree(node); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ bfqg->active_entities++; ++ } ++#endif ++} ++ ++/** ++ * bfq_ioprio_to_weight - calc a weight from an ioprio. ++ * @ioprio: the ioprio value to convert. ++ */ ++static unsigned short bfq_ioprio_to_weight(int ioprio) ++{ ++ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); ++ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; ++} ++ ++/** ++ * bfq_weight_to_ioprio - calc an ioprio from a weight. ++ * @weight: the weight value to convert. ++ * ++ * To preserve as much as possible the old only-ioprio user interface, ++ * 0 is used as an escape ioprio value for weights (numerically) equal or ++ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. ++ */ ++static unsigned short bfq_weight_to_ioprio(int weight) ++{ ++ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); ++ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? ++ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; ++} ++ ++static void bfq_get_entity(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ if (bfqq) { ++ bfqq->ref++; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", ++ bfqq, bfqq->ref); ++ } ++} ++ ++/** ++ * bfq_find_deepest - find the deepest node that an extraction can modify. ++ * @node: the node being removed. ++ * ++ * Do the first step of an extraction in an rb tree, looking for the ++ * node that will replace @node, and returning the deepest node that ++ * the following modifications to the tree can touch. If @node is the ++ * last node in the tree return %NULL. ++ */ ++static struct rb_node *bfq_find_deepest(struct rb_node *node) ++{ ++ struct rb_node *deepest; ++ ++ if (!node->rb_right && !node->rb_left) ++ deepest = rb_parent(node); ++ else if (!node->rb_right) ++ deepest = node->rb_left; ++ else if (!node->rb_left) ++ deepest = node->rb_right; ++ else { ++ deepest = rb_next(node); ++ if (deepest->rb_right) ++ deepest = deepest->rb_right; ++ else if (rb_parent(deepest) != node) ++ deepest = rb_parent(deepest); ++ } ++ ++ return deepest; ++} ++ ++/** ++ * bfq_active_extract - remove an entity from the active tree. ++ * @st: the service_tree containing the tree. ++ * @entity: the entity being removed. ++ */ ++static void bfq_active_extract(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct rb_node *node; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_sched_data *sd = NULL; ++ struct bfq_group *bfqg = NULL; ++ struct bfq_data *bfqd = NULL; ++#endif ++ ++ node = bfq_find_deepest(&entity->rb_node); ++ bfq_extract(&st->active, entity); ++ ++ if (node) ++ bfq_update_active_tree(node); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ sd = entity->sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++#endif ++ if (bfqq) ++ list_del(&bfqq->bfqq_list); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (bfqg != bfqd->root_group) { ++ BUG_ON(!bfqg); ++ BUG_ON(!bfqd); ++ BUG_ON(!bfqg->active_entities); ++ bfqg->active_entities--; ++ } ++#endif ++} ++ ++/** ++ * bfq_idle_insert - insert an entity into the idle tree. ++ * @st: the service tree containing the tree. ++ * @entity: the entity to insert. ++ */ ++static void bfq_idle_insert(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) ++ st->first_idle = entity; ++ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) ++ st->last_idle = entity; ++ ++ bfq_insert(&st->idle, entity); ++ ++ if (bfqq) ++ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); ++} ++ ++/** ++ * bfq_forget_entity - do not consider entity any longer for scheduling ++ * @st: the service tree. ++ * @entity: the entity being removed. ++ * @is_in_service: true if entity is currently the in-service entity. ++ * ++ * Forget everything about @entity. In addition, if entity represents ++ * a queue, and the latter is not in service, then release the service ++ * reference to the queue (the one taken through bfq_get_entity). In ++ * fact, in this case, there is really no more service reference to ++ * the queue, as the latter is also outside any service tree. If, ++ * instead, the queue is in service, then __bfq_bfqd_reset_in_service ++ * will take care of putting the reference when the queue finally ++ * stops being served. ++ */ ++static void bfq_forget_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity, ++ bool is_in_service) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!entity->on_st); ++ ++ entity->on_st = false; ++ st->wsum -= entity->weight; ++ if (bfqq && !is_in_service) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "(before): %p %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/** ++ * bfq_put_idle_entity - release the idle tree ref of an entity. ++ * @st: service tree for the entity. ++ * @entity: the entity being released. ++ */ ++static void bfq_put_idle_entity(struct bfq_service_tree *st, ++ struct bfq_entity *entity) ++{ ++ bfq_idle_extract(st, entity); ++ bfq_forget_entity(st, entity, ++ entity == entity->sched_data->in_service_entity); ++} ++ ++/** ++ * bfq_forget_idle - update the idle tree if necessary. ++ * @st: the service tree to act upon. ++ * ++ * To preserve the global O(log N) complexity we only remove one entry here; ++ * as the idle tree will not grow indefinitely this can be done safely. ++ */ ++static void bfq_forget_idle(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *first_idle = st->first_idle; ++ struct bfq_entity *last_idle = st->last_idle; ++ ++ if (RB_EMPTY_ROOT(&st->active) && last_idle && ++ !bfq_gt(last_idle->finish, st->vtime)) { ++ /* ++ * Forget the whole idle tree, increasing the vtime past ++ * the last finish time of idle entities. ++ */ ++ st->vtime = last_idle->finish; ++ } ++ ++ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) ++ bfq_put_idle_entity(st, first_idle); ++} ++ ++/* ++ * Update weight and priority of entity. If update_class_too is true, ++ * then update the ioprio_class of entity too. ++ * ++ * The reason why the update of ioprio_class is controlled through the ++ * last parameter is as follows. Changing the ioprio class of an ++ * entity implies changing the destination service trees for that ++ * entity. If such a change occurred when the entity is already on one ++ * of the service trees for its previous class, then the state of the ++ * entity would become more complex: none of the new possible service ++ * trees for the entity, according to bfq_entity_service_tree(), would ++ * match any of the possible service trees on which the entity ++ * is. Complex operations involving these trees, such as entity ++ * activations and deactivations, should take into account this ++ * additional complexity. To avoid this issue, this function is ++ * invoked with update_class_too unset in the points in the code where ++ * entity may happen to be on some tree. ++ */ ++static struct bfq_service_tree * ++__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, ++ struct bfq_entity *entity, ++ bool update_class_too) ++{ ++ struct bfq_service_tree *new_st = old_st; ++ ++ if (entity->prio_changed) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int prev_weight, new_weight; ++ struct bfq_data *bfqd = NULL; ++ struct rb_root *root; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_sched_data *sd; ++ struct bfq_group *bfqg; ++#endif ++ ++ if (bfqq) ++ bfqd = bfqq->bfqd; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ sd = entity->my_sched_data; ++ bfqg = container_of(sd, struct bfq_group, sched_data); ++ BUG_ON(!bfqg); ++ bfqd = (struct bfq_data *)bfqg->bfqd; ++ BUG_ON(!bfqd); ++ } ++#endif ++ ++ BUG_ON(entity->tree && update_class_too); ++ BUG_ON(old_st->wsum < entity->weight); ++ old_st->wsum -= entity->weight; ++ ++ if (entity->new_weight != entity->orig_weight) { ++ if (entity->new_weight < BFQ_MIN_WEIGHT || ++ entity->new_weight > BFQ_MAX_WEIGHT) { ++ pr_crit("update_weight_prio: new_weight %d\n", ++ entity->new_weight); ++ if (entity->new_weight < BFQ_MIN_WEIGHT) ++ entity->new_weight = BFQ_MIN_WEIGHT; ++ else ++ entity->new_weight = BFQ_MAX_WEIGHT; ++ } ++ entity->orig_weight = entity->new_weight; ++ if (bfqq) ++ bfqq->ioprio = ++ bfq_weight_to_ioprio(entity->orig_weight); ++ } ++ ++ if (bfqq && update_class_too) ++ bfqq->ioprio_class = bfqq->new_ioprio_class; ++ ++ /* ++ * Reset prio_changed only if the ioprio_class change ++ * is not pending any longer. ++ */ ++ if (!bfqq || bfqq->ioprio_class == bfqq->new_ioprio_class) ++ entity->prio_changed = 0; ++ ++ /* ++ * NOTE: here we may be changing the weight too early, ++ * this will cause unfairness. The correct approach ++ * would have required additional complexity to defer ++ * weight changes to the proper time instants (i.e., ++ * when entity->finish <= old_st->vtime). ++ */ ++ new_st = bfq_entity_service_tree(entity); ++ ++ prev_weight = entity->weight; ++ new_weight = entity->orig_weight * ++ (bfqq ? bfqq->wr_coeff : 1); ++ /* ++ * If the weight of the entity changes and the entity is a ++ * queue, remove the entity from its old weight counter (if ++ * there is a counter associated with the entity). ++ */ ++ if (prev_weight != new_weight && bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "weight changed %d %d(%d %d)", ++ prev_weight, new_weight, ++ entity->orig_weight, ++ bfqq->wr_coeff); ++ ++ root = &bfqd->queue_weights_tree; ++ __bfq_weights_tree_remove(bfqd, bfqq, root); ++ } ++ entity->weight = new_weight; ++ /* ++ * Add the entity, if it is not a weight-raised queue, to the ++ * counter associated with its new weight. ++ */ ++ if (prev_weight != new_weight && bfqq && bfqq->wr_coeff == 1) { ++ /* If we get here, root has been initialized. */ ++ bfq_weights_tree_add(bfqd, bfqq, root); ++ } ++ ++ new_st->wsum += entity->weight; ++ ++ if (new_st != old_st) { ++ BUG_ON(!update_class_too); ++ entity->start = new_st->vtime; ++ } ++ } ++ ++ return new_st; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); ++#endif ++ ++/** ++ * bfq_bfqq_served - update the scheduler status after selection for ++ * service. ++ * @bfqq: the queue being served. ++ * @served: bytes to transfer. ++ * ++ * NOTE: this can be optimized, as the timestamps of upper level entities ++ * are synchronized every time a new bfqq is selected for service. By now, ++ * we keep it to better check consistency. ++ */ ++static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st; ++ ++ if (!bfqq->service_from_backlogged) ++ bfqq->first_IO_time = jiffies; ++ ++ if (bfqq->wr_coeff > 1) ++ bfqq->service_from_wr += served; ++ ++ bfqq->service_from_backlogged += served; ++ for_each_entity(entity) { ++ st = bfq_entity_service_tree(entity); ++ ++ entity->service += served; ++ ++ BUG_ON(st->wsum == 0); ++ ++ st->vtime += bfq_delta(served, st->wsum); ++ bfq_forget_idle(st); ++ } ++#ifndef BFQ_MQ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); ++#endif ++#endif ++ st = bfq_entity_service_tree(&bfqq->entity); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", ++ served, ((st->vtime>>10)*1000)>>12, st); ++} ++ ++/** ++ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length ++ * of the time interval during which bfqq has been in ++ * service. ++ * @bfqd: the device ++ * @bfqq: the queue that needs a service update. ++ * @time_ms: the amount of time during which the queue has received service ++ * ++ * If a queue does not consume its budget fast enough, then providing ++ * the queue with service fairness may impair throughput, more or less ++ * severely. For this reason, queues that consume their budget slowly ++ * are provided with time fairness instead of service fairness. This ++ * goal is achieved through the BFQ scheduling engine, even if such an ++ * engine works in the service, and not in the time domain. The trick ++ * is charging these queues with an inflated amount of service, equal ++ * to the amount of service that they would have received during their ++ * service slot if they had been fast, i.e., if their requests had ++ * been dispatched at a rate equal to the estimated peak rate. ++ * ++ * It is worth noting that time fairness can cause important ++ * distortions in terms of bandwidth distribution, on devices with ++ * internal queueing. The reason is that I/O requests dispatched ++ * during the service slot of a queue may be served after that service ++ * slot is finished, and may have a total processing time loosely ++ * correlated with the duration of the service slot. This is ++ * especially true for short service slots. ++ */ ++static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ unsigned long time_ms) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ unsigned long timeout_ms = jiffies_to_msecs(bfq_timeout); ++ unsigned long bounded_time_ms = min(time_ms, timeout_ms); ++ int serv_to_charge_for_time = ++ (bfqd->bfq_max_budget * bounded_time_ms) / timeout_ms; ++ int tot_serv_to_charge = max(serv_to_charge_for_time, entity->service); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "%lu/%lu ms, %d/%d/%d/%d sectors", ++ time_ms, timeout_ms, ++ entity->service, ++ tot_serv_to_charge, ++ bfqd->bfq_max_budget, ++ entity->budget); ++ ++ /* Increase budget to avoid inconsistencies */ ++ if (tot_serv_to_charge > entity->budget) ++ entity->budget = tot_serv_to_charge; ++ ++ bfq_bfqq_served(bfqq, ++ max_t(int, 0, tot_serv_to_charge - entity->service)); ++} ++ ++static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, ++ struct bfq_service_tree *st, ++ bool backshifted) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ struct bfq_sched_data *sd = entity->sched_data; ++ ++ /* ++ * When this function is invoked, entity is not in any service ++ * tree, then it is safe to invoke next function with the last ++ * parameter set (see the comments on the function). ++ */ ++ BUG_ON(entity->tree); ++ st = __bfq_entity_update_weight_prio(st, entity, true); ++ bfq_calc_finish(entity, entity->budget); ++ ++ /* ++ * If some queues enjoy backshifting for a while, then their ++ * (virtual) finish timestamps may happen to become lower and ++ * lower than the system virtual time. In particular, if ++ * these queues often happen to be idle for short time ++ * periods, and during such time periods other queues with ++ * higher timestamps happen to be busy, then the backshifted ++ * timestamps of the former queues can become much lower than ++ * the system virtual time. In fact, to serve the queues with ++ * higher timestamps while the ones with lower timestamps are ++ * idle, the system virtual time may be pushed-up to much ++ * higher values than the finish timestamps of the idle ++ * queues. As a consequence, the finish timestamps of all new ++ * or newly activated queues may end up being much larger than ++ * those of lucky queues with backshifted timestamps. The ++ * latter queues may then monopolize the device for a lot of ++ * time. This would simply break service guarantees. ++ * ++ * To reduce this problem, push up a little bit the ++ * backshifted timestamps of the queue associated with this ++ * entity (only a queue can happen to have the backshifted ++ * flag set): just enough to let the finish timestamp of the ++ * queue be equal to the current value of the system virtual ++ * time. This may introduce a little unfairness among queues ++ * with backshifted timestamps, but it does not break ++ * worst-case fairness guarantees. ++ * ++ * As a special case, if bfqq is weight-raised, push up ++ * timestamps much less, to keep very low the probability that ++ * this push up causes the backshifted finish timestamps of ++ * weight-raised queues to become higher than the backshifted ++ * finish timestamps of non weight-raised queues. ++ */ ++ if (backshifted && bfq_gt(st->vtime, entity->finish)) { ++ unsigned long delta = st->vtime - entity->finish; ++ ++ if (bfqq) ++ delta /= bfqq->wr_coeff; ++ ++ entity->start += delta; ++ entity->finish += delta; ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "new queue finish %llu", ++ ((entity->finish>>10)*1000)>>12); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "new group finish %llu", ++ ((entity->finish>>10)*1000)>>12); ++#endif ++ } ++ } ++ ++ bfq_active_insert(st, entity); ++ ++ if (bfqq) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "queue %seligible in st %p", ++ entity->start <= st->vtime ? "" : "non ", st); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ } else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "group %seligible in st %p", ++ entity->start <= st->vtime ? "" : "non ", st); ++#endif ++ } ++ BUG_ON(RB_EMPTY_ROOT(&st->active)); ++ BUG_ON(&st->active != &sd->service_tree->active && ++ &st->active != &(sd->service_tree+1)->active && ++ &st->active != &(sd->service_tree+2)->active); ++} ++ ++/** ++ * __bfq_activate_entity - handle activation of entity. ++ * @entity: the entity being activated. ++ * @non_blocking_wait_rq: true if entity was waiting for a request ++ * ++ * Called for a 'true' activation, i.e., if entity is not active and ++ * one of its children receives a new request. ++ * ++ * Basically, this function updates the timestamps of entity and ++ * inserts entity into its active tree, after possibly extracting it ++ * from its idle tree. ++ */ ++static void __bfq_activate_entity(struct bfq_entity *entity, ++ bool non_blocking_wait_rq) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ bool backshifted = false; ++ unsigned long long min_vstart; ++ ++ BUG_ON(!sd); ++ BUG_ON(!st); ++ ++ /* See comments on bfq_fqq_update_budg_for_activation */ ++ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { ++ backshifted = true; ++ min_vstart = entity->finish; ++ } else ++ min_vstart = st->vtime; ++ ++ if (entity->tree == &st->idle) { ++ /* ++ * Must be on the idle tree, bfq_idle_extract() will ++ * check for that. ++ */ ++ bfq_idle_extract(st, entity); ++ BUG_ON(entity->tree); ++ entity->start = bfq_gt(min_vstart, entity->finish) ? ++ min_vstart : entity->finish; ++ } else { ++ BUG_ON(entity->tree); ++ /* ++ * The finish time of the entity may be invalid, and ++ * it is in the past for sure, otherwise the queue ++ * would have been on the idle tree. ++ */ ++ entity->start = min_vstart; ++ st->wsum += entity->weight; ++ /* ++ * entity is about to be inserted into a service tree, ++ * and then set in service: get a reference to make ++ * sure entity does not disappear until it is no ++ * longer in service or scheduled for service. ++ */ ++ bfq_get_entity(entity); ++ ++ BUG_ON(entity->on_st && bfqq); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (entity->on_st && !bfqq) { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, ++ entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, ++ bfqg, ++ "activate bug, class %d in_service %p", ++ bfq_class_idx(entity), sd->in_service_entity); ++ } ++#endif ++ BUG_ON(entity->on_st && !bfqq); ++ entity->on_st = true; ++ } ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (!bfq_entity_to_bfqq(entity)) { /* bfq_group */ ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ struct bfq_data *bfqd = bfqg->bfqd; ++ ++ BUG_ON(!bfqd); ++ if (!entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = true; ++ bfqd->num_groups_with_pending_reqs++; ++ } ++ bfq_log_bfqg(bfqd, bfqg, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++ } ++#endif ++ ++ bfq_update_fin_time_enqueue(entity, st, backshifted); ++} ++ ++/** ++ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. ++ * @entity: the entity being requeued or repositioned. ++ * ++ * Requeueing is needed if this entity stops being served, which ++ * happens if a leaf descendant entity has expired. On the other hand, ++ * repositioning is needed if the next_inservice_entity for the child ++ * entity has changed. See the comments inside the function for ++ * details. ++ * ++ * Basically, this function: 1) removes entity from its active tree if ++ * present there, 2) updates the timestamps of entity and 3) inserts ++ * entity back into its active tree (in the new, right position for ++ * the new values of the timestamps). ++ */ ++static void __bfq_requeue_entity(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ BUG_ON(!sd); ++ BUG_ON(!st); ++ ++ BUG_ON(entity != sd->in_service_entity && ++ entity->tree != &st->active); ++ ++ if (entity == sd->in_service_entity) { ++ /* ++ * We are requeueing the current in-service entity, ++ * which may have to be done for one of the following ++ * reasons: ++ * - entity represents the in-service queue, and the ++ * in-service queue is being requeued after an ++ * expiration; ++ * - entity represents a group, and its budget has ++ * changed because one of its child entities has ++ * just been either activated or requeued for some ++ * reason; the timestamps of the entity need then to ++ * be updated, and the entity needs to be enqueued ++ * or repositioned accordingly. ++ * ++ * In particular, before requeueing, the start time of ++ * the entity must be moved forward to account for the ++ * service that the entity has received while in ++ * service. This is done by the next instructions. The ++ * finish time will then be updated according to this ++ * new value of the start time, and to the budget of ++ * the entity. ++ */ ++ bfq_calc_finish(entity, entity->service); ++ entity->start = entity->finish; ++ BUG_ON(entity->tree && entity->tree == &st->idle); ++ BUG_ON(entity->tree && entity->tree != &st->active); ++ /* ++ * In addition, if the entity had more than one child ++ * when set in service, then it was not extracted from ++ * the active tree. This implies that the position of ++ * the entity in the active tree may need to be ++ * changed now, because we have just updated the start ++ * time of the entity, and we will update its finish ++ * time in a moment (the requeueing is then, more ++ * precisely, a repositioning in this case). To ++ * implement this repositioning, we: 1) dequeue the ++ * entity here, 2) update the finish time and requeue ++ * the entity according to the new timestamps below. ++ */ ++ if (entity->tree) ++ bfq_active_extract(st, entity); ++ } else { /* The entity is already active, and not in service */ ++ /* ++ * In this case, this function gets called only if the ++ * next_in_service entity below this entity has ++ * changed, and this change has caused the budget of ++ * this entity to change, which, finally implies that ++ * the finish time of this entity must be ++ * updated. Such an update may cause the scheduling, ++ * i.e., the position in the active tree, of this ++ * entity to change. We handle this change by: 1) ++ * dequeueing the entity here, 2) updating the finish ++ * time and requeueing the entity according to the new ++ * timestamps below. This is the same approach as the ++ * non-extracted-entity sub-case above. ++ */ ++ bfq_active_extract(st, entity); ++ } ++ ++ bfq_update_fin_time_enqueue(entity, st, false); ++} ++ ++static void __bfq_activate_requeue_entity(struct bfq_entity *entity, ++ struct bfq_sched_data *sd, ++ bool non_blocking_wait_rq) ++{ ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ if (sd->in_service_entity == entity || entity->tree == &st->active) ++ /* ++ * in service or already queued on the active tree, ++ * requeue or reposition ++ */ ++ __bfq_requeue_entity(entity); ++ else ++ /* ++ * Not in service and not queued on its active tree: ++ * the activity is idle and this is a true activation. ++ */ ++ __bfq_activate_entity(entity, non_blocking_wait_rq); ++} ++ ++ ++/** ++ * bfq_activate_requeue_entity - activate or requeue an entity representing a bfq_queue, ++ * and activate, requeue or reposition all ancestors ++ * for which such an update becomes necessary. ++ * @entity: the entity to activate. ++ * @non_blocking_wait_rq: true if this entity was waiting for a request ++ * @requeue: true if this is a requeue, which implies that bfqq is ++ * being expired; thus ALL its ancestors stop being served and must ++ * therefore be requeued ++ * @expiration: true if this function is being invoked in the expiration path ++ * of the in-service queue ++ */ ++static void bfq_activate_requeue_entity(struct bfq_entity *entity, ++ bool non_blocking_wait_rq, ++ bool requeue, bool expiration) ++{ ++ struct bfq_sched_data *sd; ++ ++ for_each_entity(entity) { ++ BUG_ON(!entity); ++ sd = entity->sched_data; ++ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); ++ ++ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && ++ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && ++ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); ++ ++ if (!bfq_update_next_in_service(sd, entity, expiration) && ++ !requeue) { ++ BUG_ON(!sd->next_in_service); ++ break; ++ } ++ BUG_ON(!sd->next_in_service); ++ } ++} ++ ++/** ++ * __bfq_deactivate_entity - update sched_data and service trees for ++ * entity, so as to represent entity as inactive ++ * @entity: the entity being deactivated. ++ * @ins_into_idle_tree: if false, the entity will not be put into the ++ * idle tree. ++ * ++ * If necessary and allowed, puts entity into the idle tree. NOTE: ++ * entity may be on no tree if in service. ++ */ ++static bool __bfq_deactivate_entity(struct bfq_entity *entity, ++ bool ins_into_idle_tree) ++{ ++ struct bfq_sched_data *sd = entity->sched_data; ++ struct bfq_service_tree *st; ++ bool is_in_service; ++ ++ if (!entity->on_st) { /* entity never activated, or already inactive */ ++ BUG_ON(sd && entity == sd->in_service_entity); ++ return false; ++ } ++ ++ /* ++ * If we get here, then entity is active, which implies that ++ * bfq_group_set_parent has already been invoked for the group ++ * represented by entity. Therefore, the field ++ * entity->sched_data has been set, and we can safely use it. ++ */ ++ st = bfq_entity_service_tree(entity); ++ is_in_service = entity == sd->in_service_entity; ++ ++ BUG_ON(is_in_service && entity->tree && entity->tree != &st->active); ++ ++ bfq_calc_finish(entity, entity->service); ++ ++ if (is_in_service) { ++ sd->in_service_entity = NULL; ++ } else ++ /* ++ * Non in-service entity: nobody will take care of ++ * resetting its service counter on expiration. Do it ++ * now. ++ */ ++ entity->service = 0; ++ ++ if (entity->tree == &st->active) ++ bfq_active_extract(st, entity); ++ else if (!is_in_service && entity->tree == &st->idle) ++ bfq_idle_extract(st, entity); ++ else if (entity->tree) ++ BUG(); ++ ++ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) ++ bfq_forget_entity(st, entity, is_in_service); ++ else ++ bfq_idle_insert(st, entity); ++ ++ return true; ++} ++ ++/** ++ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. ++ * @entity: the entity to deactivate. ++ * @ins_into_idle_tree: true if the entity can be put into the idle tree ++ * @expiration: true if this function is being invoked in the expiration path ++ * of the in-service queue ++ */ ++static void bfq_deactivate_entity(struct bfq_entity *entity, ++ bool ins_into_idle_tree, ++ bool expiration) ++{ ++ struct bfq_sched_data *sd; ++ struct bfq_entity *parent = NULL; ++ ++ for_each_entity_safe(entity, parent) { ++ sd = entity->sched_data; ++ ++ BUG_ON(sd == NULL); /* ++ * It would mean that this is the ++ * root group. ++ */ ++ ++ BUG_ON(expiration && entity != sd->in_service_entity); ++ ++ BUG_ON(entity != sd->in_service_entity && ++ entity->tree == ++ &bfq_entity_service_tree(entity)->active && ++ !sd->next_in_service); ++ ++ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { ++ /* ++ * entity is not in any tree any more, so ++ * this deactivation is a no-op, and there is ++ * nothing to change for upper-level entities ++ * (in case of expiration, this can never ++ * happen). ++ */ ++ BUG_ON(expiration); /* ++ * entity cannot be already out of ++ * any tree ++ */ ++ return; ++ } ++ ++ if (sd->next_in_service == entity) ++ /* ++ * entity was the next_in_service entity, ++ * then, since entity has just been ++ * deactivated, a new one must be found. ++ */ ++ bfq_update_next_in_service(sd, NULL, expiration); ++ ++ if (sd->next_in_service || sd->in_service_entity) { ++ /* ++ * The parent entity is still active, because ++ * either next_in_service or in_service_entity ++ * is not NULL. So, no further upwards ++ * deactivation must be performed. Yet, ++ * next_in_service has changed. Then the ++ * schedule does need to be updated upwards. ++ * ++ * NOTE If in_service_entity is not NULL, then ++ * next_in_service may happen to be NULL, ++ * although the parent entity is evidently ++ * active. This happens if 1) the entity ++ * pointed by in_service_entity is the only ++ * active entity in the parent entity, and 2) ++ * according to the definition of ++ * next_in_service, the in_service_entity ++ * cannot be considered as ++ * next_in_service. See the comments on the ++ * definition of next_in_service for details. ++ */ ++ BUG_ON(sd->next_in_service == entity); ++ BUG_ON(sd->in_service_entity == entity); ++ break; ++ } ++ ++ /* ++ * If we get here, then the parent is no more ++ * backlogged and we need to propagate the ++ * deactivation upwards. Thus let the loop go on. ++ */ ++ ++ /* ++ * Also let parent be queued into the idle tree on ++ * deactivation, to preserve service guarantees, and ++ * assuming that who invoked this function does not ++ * need parent entities too to be removed completely. ++ */ ++ ins_into_idle_tree = true; ++ } ++ ++ /* ++ * If the deactivation loop is fully executed, then there are ++ * no more entities to touch and next loop is not executed at ++ * all. Otherwise, requeue remaining entities if they are ++ * about to stop receiving service, or reposition them if this ++ * is not the case. ++ */ ++ entity = parent; ++ for_each_entity(entity) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ /* ++ * Invoke __bfq_requeue_entity on entity, even if ++ * already active, to requeue/reposition it in the ++ * active tree (because sd->next_in_service has ++ * changed) ++ */ ++ __bfq_requeue_entity(entity); ++ ++ sd = entity->sched_data; ++ BUG_ON(expiration && sd->in_service_entity != entity); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "invoking udpdate_next for this queue"); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, ++ struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "invoking udpdate_next for this entity"); ++ } ++#endif ++ if (!bfq_update_next_in_service(sd, entity, expiration) && ++ !expiration) ++ /* ++ * next_in_service unchanged or not causing ++ * any change in entity->parent->sd, and no ++ * requeueing needed for expiration: stop ++ * here. ++ */ ++ break; ++ } ++} ++ ++/** ++ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, ++ * if needed, to have at least one entity eligible. ++ * @st: the service tree to act upon. ++ * ++ * Assumes that st is not empty. ++ */ ++static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) ++{ ++ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); ++ ++ if (bfq_gt(root_entity->min_start, st->vtime)) { ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "new value %llu", ++ ((root_entity->min_start>>10)*1000)>>12); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(root_entity, struct bfq_group, ++ entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "new value %llu", ++ ((root_entity->min_start>>10)*1000)>>12); ++ } ++#endif ++ return root_entity->min_start; ++ } ++ return st->vtime; ++} ++ ++static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) ++{ ++ if (new_value > st->vtime) { ++ st->vtime = new_value; ++ bfq_forget_idle(st); ++ } ++} ++ ++/** ++ * bfq_first_active_entity - find the eligible entity with ++ * the smallest finish time ++ * @st: the service tree to select from. ++ * @vtime: the system virtual to use as a reference for eligibility ++ * ++ * This function searches the first schedulable entity, starting from the ++ * root of the tree and going on the left every time on this side there is ++ * a subtree with at least one eligible (start >= vtime) entity. The path on ++ * the right is followed only if a) the left subtree contains no eligible ++ * entities and b) no eligible entity has been found yet. ++ */ ++static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, ++ u64 vtime) ++{ ++ struct bfq_entity *entry, *first = NULL; ++ struct rb_node *node = st->active.rb_node; ++ ++ while (node) { ++ entry = rb_entry(node, struct bfq_entity, rb_node); ++left: ++ if (!bfq_gt(entry->start, vtime)) ++ first = entry; ++ ++ BUG_ON(bfq_gt(entry->min_start, vtime)); ++ ++ if (node->rb_left) { ++ entry = rb_entry(node->rb_left, ++ struct bfq_entity, rb_node); ++ if (!bfq_gt(entry->min_start, vtime)) { ++ node = node->rb_left; ++ goto left; ++ } ++ } ++ if (first) ++ break; ++ node = node->rb_right; ++ } ++ ++ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); ++ return first; ++} ++ ++/** ++ * __bfq_lookup_next_entity - return the first eligible entity in @st. ++ * @st: the service tree. ++ * ++ * If there is no in-service entity for the sched_data st belongs to, ++ * then return the entity that will be set in service if: ++ * 1) the parent entity this st belongs to is set in service; ++ * 2) no entity belonging to such parent entity undergoes a state change ++ * that would influence the timestamps of the entity (e.g., becomes idle, ++ * becomes backlogged, changes its budget, ...). ++ * ++ * In this first case, update the virtual time in @st too (see the ++ * comments on this update inside the function). ++ * ++ * In constrast, if there is an in-service entity, then return the ++ * entity that would be set in service if not only the above ++ * conditions, but also the next one held true: the currently ++ * in-service entity, on expiration, ++ * 1) gets a finish time equal to the current one, or ++ * 2) is not eligible any more, or ++ * 3) is idle. ++ */ ++static struct bfq_entity * ++__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service) ++{ ++ struct bfq_entity *entity; ++ u64 new_vtime; ++ struct bfq_queue *bfqq; ++ ++ if (RB_EMPTY_ROOT(&st->active)) ++ return NULL; ++ ++ /* ++ * Get the value of the system virtual time for which at ++ * least one entity is eligible. ++ */ ++ new_vtime = bfq_calc_vtime_jump(st); ++ ++ /* ++ * If there is no in-service entity for the sched_data this ++ * active tree belongs to, then push the system virtual time ++ * up to the value that guarantees that at least one entity is ++ * eligible. If, instead, there is an in-service entity, then ++ * do not make any such update, because there is already an ++ * eligible entity, namely the in-service one (even if the ++ * entity is not on st, because it was extracted when set in ++ * service). ++ */ ++ if (!in_service) ++ bfq_update_vtime(st, new_vtime); ++ ++ entity = bfq_first_active_entity(st, new_vtime); ++ BUG_ON(bfq_gt(entity->start, new_vtime)); ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "start %llu vtime %llu st %p", ++ ((entity->start>>10)*1000)>>12, ++ ((new_vtime>>10)*1000)>>12, st); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "start %llu vtime %llu (%llu) st %p", ++ ((entity->start>>10)*1000)>>12, ++ ((st->vtime>>10)*1000)>>12, ++ ((new_vtime>>10)*1000)>>12, st); ++ } ++#endif ++ ++ BUG_ON(!entity); ++ ++ return entity; ++} ++ ++/** ++ * bfq_lookup_next_entity - return the first eligible entity in @sd. ++ * @sd: the sched_data. ++ * @expiration: true if we are on the expiration path of the in-service queue ++ * ++ * This function is invoked when there has been a change in the trees ++ * for sd, and we need to know what is the new next entity to serve ++ * after this change. ++ */ ++static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, ++ bool expiration) ++{ ++ struct bfq_service_tree *st = sd->service_tree; ++ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); ++ struct bfq_entity *entity = NULL; ++ struct bfq_queue *bfqq; ++ int class_idx = 0; ++ ++ BUG_ON(!sd); ++ BUG_ON(!st); ++ /* ++ * Choose from idle class, if needed to guarantee a minimum ++ * bandwidth to this class (and if there is some active entity ++ * in idle class). This should also mitigate ++ * priority-inversion problems in case a low priority task is ++ * holding file system resources. ++ */ ++ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + ++ BFQ_CL_IDLE_TIMEOUT)) { ++ if (!RB_EMPTY_ROOT(&idle_class_st->active)) ++ class_idx = BFQ_IOPRIO_CLASSES - 1; ++ /* About to be served if backlogged, or not yet backlogged */ ++ sd->bfq_class_idle_last_service = jiffies; ++ } ++ ++ /* ++ * Find the next entity to serve for the highest-priority ++ * class, unless the idle class needs to be served. ++ */ ++ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { ++ /* ++ * If expiration is true, then bfq_lookup_next_entity ++ * is being invoked as a part of the expiration path ++ * of the in-service queue. In this case, even if ++ * sd->in_service_entity is not NULL, ++ * sd->in_service_entiy at this point is actually not ++ * in service any more, and, if needed, has already ++ * been properly queued or requeued into the right ++ * tree. The reason why sd->in_service_entity is still ++ * not NULL here, even if expiration is true, is that ++ * sd->in_service_entiy is reset as a last step in the ++ * expiration path. So, if expiration is true, tell ++ * __bfq_lookup_next_entity that there is no ++ * sd->in_service_entity. ++ */ ++ entity = __bfq_lookup_next_entity(st + class_idx, ++ sd->in_service_entity && ++ !expiration); ++ ++ if (entity) ++ break; ++ } ++ ++ BUG_ON(!entity && ++ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || ++ !RB_EMPTY_ROOT(&(st+2)->active))); ++ ++ if (!entity) ++ return NULL; ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", ++ st + class_idx, class_idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "chosen from st %p %d", ++ st + class_idx, class_idx); ++ } ++#endif ++ ++ return entity; ++} ++ ++static bool next_queue_may_preempt(struct bfq_data *bfqd) ++{ ++ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; ++ ++ return sd->next_in_service != sd->in_service_entity; ++} ++ ++/* ++ * Get next queue for service. ++ */ ++static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_entity *entity = NULL; ++ struct bfq_sched_data *sd; ++ struct bfq_queue *bfqq; ++ ++ BUG_ON(bfqd->in_service_queue); ++ ++ if (bfq_tot_busy_queues(bfqd) == 0) ++ return NULL; ++ ++ /* ++ * Traverse the path from the root to the leaf entity to ++ * serve. Set in service all the entities visited along the ++ * way. ++ */ ++ sd = &bfqd->root_group->sched_data; ++ for (; sd ; sd = entity->my_sched_data) { ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ if (entity) { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "lookup in this group"); ++ if (!sd->next_in_service) ++ pr_crit("lookup in this group"); ++ } else { ++ bfq_log_bfqg(bfqd, bfqd->root_group, ++ "lookup in root group"); ++ if (!sd->next_in_service) ++ pr_crit("lookup in root group"); ++ } ++#endif ++ ++ BUG_ON(!sd->next_in_service); ++ ++ /* ++ * WARNING. We are about to set the in-service entity ++ * to sd->next_in_service, i.e., to the (cached) value ++ * returned by bfq_lookup_next_entity(sd) the last ++ * time it was invoked, i.e., the last time when the ++ * service order in sd changed as a consequence of the ++ * activation or deactivation of an entity. In this ++ * respect, if we execute bfq_lookup_next_entity(sd) ++ * in this very moment, it may, although with low ++ * probability, yield a different entity than that ++ * pointed to by sd->next_in_service. This rare event ++ * happens in case there was no CLASS_IDLE entity to ++ * serve for sd when bfq_lookup_next_entity(sd) was ++ * invoked for the last time, while there is now one ++ * such entity. ++ * ++ * If the above event happens, then the scheduling of ++ * such entity in CLASS_IDLE is postponed until the ++ * service of the sd->next_in_service entity ++ * finishes. In fact, when the latter is expired, ++ * bfq_lookup_next_entity(sd) gets called again, ++ * exactly to update sd->next_in_service. ++ */ ++ ++ /* Make next_in_service entity become in_service_entity */ ++ entity = sd->next_in_service; ++ sd->in_service_entity = entity; ++ ++ /* ++ * If entity is no longer a candidate for next ++ * service, then it must be extracted from its active ++ * tree, so as to make sure that it won't be ++ * considered when computing next_in_service. See the ++ * comments on the function ++ * bfq_no_longer_next_in_service() for details. ++ */ ++ if (bfq_no_longer_next_in_service(entity)) ++ bfq_active_extract(bfq_entity_service_tree(entity), ++ entity); ++ ++ /* ++ * Even if entity is not to be extracted according to ++ * the above check, a descendant entity may get ++ * extracted in one of the next iterations of this ++ * loop. Such an event could cause a change in ++ * next_in_service for the level of the descendant ++ * entity, and thus possibly back to this level. ++ * ++ * However, we cannot perform the resulting needed ++ * update of next_in_service for this level before the ++ * end of the whole loop, because, to know which is ++ * the correct next-to-serve candidate entity for each ++ * level, we need first to find the leaf entity to set ++ * in service. In fact, only after we know which is ++ * the next-to-serve leaf entity, we can discover ++ * whether the parent entity of the leaf entity ++ * becomes the next-to-serve, and so on. ++ */ ++ ++ /* Log some information */ ++ bfqq = bfq_entity_to_bfqq(entity); ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, ++ "this queue, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg(bfqd, bfqg, ++ "this entity, finish %llu", ++ (((entity->finish>>10)*1000)>>10)>>2); ++ } ++#endif ++ ++ } ++ ++ BUG_ON(!entity); ++ bfqq = bfq_entity_to_bfqq(entity); ++ BUG_ON(!bfqq); ++ ++ /* ++ * We can finally update all next-to-serve entities along the ++ * path from the leaf entity just set in service to the root. ++ */ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->sched_data; ++ ++ if (!bfq_update_next_in_service(sd, NULL, false)) ++ break; ++ } ++ ++ return bfqq; ++} ++ ++static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *in_serv_bfqq = bfqd->in_service_queue; ++ struct bfq_entity *in_serv_entity = &in_serv_bfqq->entity; ++ struct bfq_entity *entity = in_serv_entity; ++ ++#ifndef BFQ_MQ ++ if (bfqd->in_service_bic) { ++ put_io_context(bfqd->in_service_bic->icq.ioc); ++ bfqd->in_service_bic = NULL; ++ } ++#endif ++ ++ bfq_clear_bfqq_wait_request(in_serv_bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqd->in_service_queue = NULL; ++ ++ /* ++ * When this function is called, all in-service entities have ++ * been properly deactivated or requeued, so we can safely ++ * execute the final step: reset in_service_entity along the ++ * path from entity to the root. ++ */ ++ for_each_entity(entity) ++ entity->sched_data->in_service_entity = NULL; ++ ++ /* ++ * in_serv_entity is no longer in service, so, if it is in no ++ * service tree either, then release the service reference to ++ * the queue it represents (taken with bfq_get_entity). ++ */ ++ if (!in_serv_entity->on_st) ++ bfq_put_queue(in_serv_bfqq); ++} ++ ++static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool ins_into_idle_tree, bool expiration) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); ++} ++ ++static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && ++ entity->on_st); ++ ++ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), ++ false, false); ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++} ++ ++static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool expiration) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ bfq_activate_requeue_entity(entity, false, ++ bfqq == bfqd->in_service_queue, expiration); ++} ++ ++static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); ++ ++/* ++ * Called when the bfqq no longer has requests pending, remove it from ++ * the service tree. As a special case, it can be invoked during an ++ * expiration. ++ */ ++static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool expiration) ++{ ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ bfq_log_bfqq(bfqd, bfqq, "del from busy"); ++ ++ bfq_clear_bfqq_busy(bfqq); ++ ++ BUG_ON(bfq_tot_busy_queues(bfqd) == 0); ++ bfqd->busy_queues[bfqq->ioprio_class - 1]--; ++ ++ if (bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ bfqg_stats_update_dequeue(bfqq_group(bfqq)); ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); ++ if (!bfqq->dispatched) ++ bfq_weights_tree_remove(bfqd, bfqq); ++} ++ ++/* ++ * Called when an inactive queue receives a new request. ++ */ ++static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ ++ bfq_log_bfqq(bfqd, bfqq, "add to busy"); ++ ++ bfq_activate_bfqq(bfqd, bfqq); ++ ++ bfq_mark_bfqq_busy(bfqq); ++ bfqd->busy_queues[bfqq->ioprio_class - 1]++; ++ ++ if (!bfqq->dispatched) ++ if (bfqq->wr_coeff == 1) ++ bfq_weights_tree_add(bfqd, bfqq, ++ &bfqd->queue_weights_tree); ++ ++ if (bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ } ++ ++} +diff --git a/block/bfq-sq-iosched.c b/block/bfq-sq-iosched.c +new file mode 100644 +index 000000000000..6da94eef0cf1 +--- /dev/null ++++ b/block/bfq-sq-iosched.c +@@ -0,0 +1,5957 @@ ++/* ++ * Budget Fair Queueing (BFQ) I/O scheduler. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> ++ * ++ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ ++ * file. ++ * ++ * BFQ is a proportional-share I/O scheduler, with some extra ++ * low-latency capabilities. BFQ also supports full hierarchical ++ * scheduling through cgroups. Next paragraphs provide an introduction ++ * on BFQ inner workings. Details on BFQ benefits and usage can be ++ * found in Documentation/block/bfq-iosched.txt. ++ * ++ * BFQ is a proportional-share storage-I/O scheduling algorithm based ++ * on the slice-by-slice service scheme of CFQ. But BFQ assigns ++ * budgets, measured in number of sectors, to processes instead of ++ * time slices. The device is not granted to the in-service process ++ * for a given time slice, but until it has exhausted its assigned ++ * budget. This change from the time to the service domain enables BFQ ++ * to distribute the device throughput among processes as desired, ++ * without any distortion due to throughput fluctuations, or to device ++ * internal queueing. BFQ uses an ad hoc internal scheduler, called ++ * B-WF2Q+, to schedule processes according to their budgets. More ++ * precisely, BFQ schedules queues associated with processes. Thanks to ++ * the accurate policy of B-WF2Q+, BFQ can afford to assign high ++ * budgets to I/O-bound processes issuing sequential requests (to ++ * boost the throughput), and yet guarantee a low latency to ++ * interactive and soft real-time applications. ++ * ++ * In particular, BFQ schedules I/O so as to achieve the latter goal-- ++ * low latency for interactive and soft real-time applications--if the ++ * low_latency parameter is set (default configuration). To this ++ * purpose, BFQ constantly tries to detect whether the I/O requests in ++ * a bfq_queue come from an interactive or a soft real-time ++ * application. For brevity, in these cases, the queue is said to be ++ * interactive or soft real-time. In both cases, BFQ privileges the ++ * service of the queue, over that of non-interactive and ++ * non-soft-real-time queues. This privileging is performed, mainly, ++ * by raising the weight of the queue. So, for brevity, we call just ++ * weight-raising periods the time periods during which a queue is ++ * privileged, because deemed interactive or soft real-time. ++ * ++ * The detection of soft real-time queues/applications is described in ++ * detail in the comments on the function ++ * bfq_bfqq_softrt_next_start. On the other hand, the detection of an ++ * interactive queue works as follows: a queue is deemed interactive ++ * if it is constantly non empty only for a limited time interval, ++ * after which it does become empty. The queue may be deemed ++ * interactive again (for a limited time), if it restarts being ++ * constantly non empty, provided that this happens only after the ++ * queue has remained empty for a given minimum idle time. ++ * ++ * By default, BFQ computes automatically the above maximum time ++ * interval, i.e., the time interval after which a constantly ++ * non-empty queue stops being deemed interactive. Since a queue is ++ * weight-raised while it is deemed interactive, this maximum time ++ * interval happens to coincide with the (maximum) duration of the ++ * weight-raising for interactive queues. ++ * ++ * NOTE: if the main or only goal, with a given device, is to achieve ++ * the maximum-possible throughput at all times, then do switch off ++ * all low-latency heuristics for that device, by setting low_latency ++ * to 0. ++ * ++ * BFQ is described in [1], where also a reference to the initial, ++ * more theoretical paper on BFQ can be found. The interested reader ++ * can find in the latter paper full details on the main algorithm, as ++ * well as formulas of the guarantees and formal proofs of all the ++ * properties. With respect to the version of BFQ presented in these ++ * papers, this implementation adds a few more heuristics, such as the ++ * one that guarantees a low latency to soft real-time applications, ++ * and a hierarchical extension based on H-WF2Q+. ++ * ++ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with ++ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) ++ * complexity derives from the one introduced with EEVDF in [3]. ++ * ++ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O ++ * Scheduler", Proceedings of the First Workshop on Mobile System ++ * Technologies (MST-2015), May 2015. ++ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf ++ * ++ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf ++ * ++ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing ++ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, ++ * Oct 1997. ++ * ++ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz ++ * ++ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline ++ * First: A Flexible and Accurate Mechanism for Proportional Share ++ * Resource Allocation,'' technical report. ++ * ++ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf ++ */ ++#include <linux/module.h> ++#include <linux/slab.h> ++#include <linux/blkdev.h> ++#include <linux/cgroup.h> ++#include <linux/elevator.h> ++#include <linux/jiffies.h> ++#include <linux/rbtree.h> ++#include <linux/ioprio.h> ++#include "blk.h" ++#include "bfq.h" ++#include "blk-wbt.h" ++ ++/* Expiration time of sync (0) and async (1) requests, in ns. */ ++static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; ++ ++/* Maximum backwards seek, in KiB. */ ++static const int bfq_back_max = (16 * 1024); ++ ++/* Penalty of a backwards seek, in number of sectors. */ ++static const int bfq_back_penalty = 2; ++ ++/* Idling period duration, in ns. */ ++static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); ++ ++/* Minimum number of assigned budgets for which stats are safe to compute. */ ++static const int bfq_stats_min_budgets = 194; ++ ++/* Default maximum budget values, in sectors and number of requests. */ ++static const int bfq_default_max_budget = (16 * 1024); ++ ++/* ++ * When a sync request is dispatched, the queue that contains that ++ * request, and all the ancestor entities of that queue, are charged ++ * with the number of sectors of the request. In constrast, if the ++ * request is async, then the queue and its ancestor entities are ++ * charged with the number of sectors of the request, multiplied by ++ * the factor below. This throttles the bandwidth for async I/O, ++ * w.r.t. to sync I/O, and it is done to counter the tendency of async ++ * writes to steal I/O throughput to reads. ++ * ++ * The current value of this parameter is the result of a tuning with ++ * several hardware and software configurations. We tried to find the ++ * lowest value for which writes do not cause noticeable problems to ++ * reads. In fact, the lower this parameter, the stabler I/O control, ++ * in the following respect. The lower this parameter is, the less ++ * the bandwidth enjoyed by a group decreases ++ * - when the group does writes, w.r.t. to when it does reads; ++ * - when other groups do reads, w.r.t. to when they do writes. ++ */ ++static const int bfq_async_charge_factor = 3; ++ ++/* Default timeout values, in jiffies, approximating CFQ defaults. */ ++static const int bfq_timeout = (HZ / 8); ++ ++/* ++ * Time limit for merging (see comments in bfq_setup_cooperator). Set ++ * to the slowest value that, in our tests, proved to be effective in ++ * removing false positives, while not causing true positives to miss ++ * queue merging. ++ * ++ * As can be deduced from the low time limit below, queue merging, if ++ * successful, happens at the very beggining of the I/O of the involved ++ * cooperating processes, as a consequence of the arrival of the very ++ * first requests from each cooperator. After that, there is very ++ * little chance to find cooperators. ++ */ ++static const unsigned long bfq_merge_time_limit = HZ/10; ++ ++#define MAX_LENGTH_REASON_NAME 25 ++ ++static const char reason_name[][MAX_LENGTH_REASON_NAME] = {"TOO_IDLE", ++"BUDGET_TIMEOUT", "BUDGET_EXHAUSTED", "NO_MORE_REQUESTS", ++"PREEMPTED"}; ++ ++static struct kmem_cache *bfq_pool; ++ ++/* Below this threshold (in ns), we consider thinktime immediate. */ ++#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) ++ ++/* hw_tag detection: parallel requests threshold and min samples needed. */ ++#define BFQ_HW_QUEUE_THRESHOLD 3 ++#define BFQ_HW_QUEUE_SAMPLES 32 ++ ++#define BFQQ_SEEK_THR (sector_t)(8 * 100) ++#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) ++#define BFQ_RQ_SEEKY(bfqd, last_pos, rq) \ ++ (get_sdist(last_pos, rq) > \ ++ BFQQ_SEEK_THR && \ ++ (!blk_queue_nonrot(bfqd->queue) || \ ++ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT)) ++#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) ++#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 19) ++ ++/* Min number of samples required to perform peak-rate update */ ++#define BFQ_RATE_MIN_SAMPLES 32 ++/* Min observation time interval required to perform a peak-rate update (ns) */ ++#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) ++/* Target observation time interval for a peak-rate update (ns) */ ++#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC ++ ++/* ++ * Shift used for peak-rate fixed precision calculations. ++ * With ++ * - the current shift: 16 positions ++ * - the current type used to store rate: u32 ++ * - the current unit of measure for rate: [sectors/usec], or, more precisely, ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT] to take into account the shift, ++ * the range of rates that can be stored is ++ * [1 / 2^BFQ_RATE_SHIFT, 2^(32 - BFQ_RATE_SHIFT)] sectors/usec = ++ * [1 / 2^16, 2^16] sectors/usec = [15e-6, 65536] sectors/usec = ++ * [15, 65G] sectors/sec ++ * Which, assuming a sector size of 512B, corresponds to a range of ++ * [7.5K, 33T] B/sec ++ */ ++#define BFQ_RATE_SHIFT 16 ++ ++/* ++ * When configured for computing the duration of the weight-raising ++ * for interactive queues automatically (see the comments at the ++ * beginning of this file), BFQ does it using the following formula: ++ * duration = (ref_rate / r) * ref_wr_duration, ++ * where r is the peak rate of the device, and ref_rate and ++ * ref_wr_duration are two reference parameters. In particular, ++ * ref_rate is the peak rate of the reference storage device (see ++ * below), and ref_wr_duration is about the maximum time needed, with ++ * BFQ and while reading two files in parallel, to load typical large ++ * applications on the reference device (see the comments on ++ * max_service_from_wr below, for more details on how ref_wr_duration ++ * is obtained). In practice, the slower/faster the device at hand ++ * is, the more/less it takes to load applications with respect to the ++ * reference device. Accordingly, the longer/shorter BFQ grants ++ * weight raising to interactive applications. ++ * ++ * BFQ uses two different reference pairs (ref_rate, ref_wr_duration), ++ * depending on whether the device is rotational or non-rotational. ++ * ++ * In the following definitions, ref_rate[0] and ref_wr_duration[0] ++ * are the reference values for a rotational device, whereas ++ * ref_rate[1] and ref_wr_duration[1] are the reference values for a ++ * non-rotational device. The reference rates are not the actual peak ++ * rates of the devices used as a reference, but slightly lower ++ * values. The reason for using slightly lower values is that the ++ * peak-rate estimator tends to yield slightly lower values than the ++ * actual peak rate (it can yield the actual peak rate only if there ++ * is only one process doing I/O, and the process does sequential ++ * I/O). ++ * ++ * The reference peak rates are measured in sectors/usec, left-shifted ++ * by BFQ_RATE_SHIFT. ++ */ ++static int ref_rate[2] = {14000, 33000}; ++/* ++ * To improve readability, a conversion function is used to initialize ++ * the following array, which entails that the array can be ++ * initialized only in a function. ++ */ ++static int ref_wr_duration[2]; ++ ++/* ++ * BFQ uses the above-detailed, time-based weight-raising mechanism to ++ * privilege interactive tasks. This mechanism is vulnerable to the ++ * following false positives: I/O-bound applications that will go on ++ * doing I/O for much longer than the duration of weight ++ * raising. These applications have basically no benefit from being ++ * weight-raised at the beginning of their I/O. On the opposite end, ++ * while being weight-raised, these applications ++ * a) unjustly steal throughput to applications that may actually need ++ * low latency; ++ * b) make BFQ uselessly perform device idling; device idling results ++ * in loss of device throughput with most flash-based storage, and may ++ * increase latencies when used purposelessly. ++ * ++ * BFQ tries to reduce these problems, by adopting the following ++ * countermeasure. To introduce this countermeasure, we need first to ++ * finish explaining how the duration of weight-raising for ++ * interactive tasks is computed. ++ * ++ * For a bfq_queue deemed as interactive, the duration of weight ++ * raising is dynamically adjusted, as a function of the estimated ++ * peak rate of the device, so as to be equal to the time needed to ++ * execute the 'largest' interactive task we benchmarked so far. By ++ * largest task, we mean the task for which each involved process has ++ * to do more I/O than for any of the other tasks we benchmarked. This ++ * reference interactive task is the start-up of LibreOffice Writer, ++ * and in this task each process/bfq_queue needs to have at most ~110K ++ * sectors transfered. ++ * ++ * This last piece of information enables BFQ to reduce the actual ++ * duration of weight-raising for at least one class of I/O-bound ++ * applications: those doing sequential or quasi-sequential I/O. An ++ * example is file copy. In fact, once started, the main I/O-bound ++ * processes of these applications usually consume the above 110K ++ * sectors in much less time than the processes of an application that ++ * is starting, because these I/O-bound processes will greedily devote ++ * almost all their CPU cycles only to their target, ++ * throughput-friendly I/O operations. This is even more true if BFQ ++ * happens to be underestimating the device peak rate, and thus ++ * overestimating the duration of weight raising. But, according to ++ * our measurements, once transferred 110K sectors, these processes ++ * have no right to be weight-raised any longer. ++ * ++ * Basing on the last consideration, BFQ ends weight-raising for a ++ * bfq_queue if the latter happens to have received an amount of ++ * service at least equal to the following constant. The constant is ++ * set to slightly more than 110K, to have a minimum safety margin. ++ * ++ * This early ending of weight-raising reduces the amount of time ++ * during which interactive false positives cause the two problems ++ * described at the beginning of these comments. ++ */ ++static const unsigned long max_service_from_wr = 120000; ++ ++#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ ++ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) ++ ++#define RQ_BIC(rq) icq_to_bic((rq)->elv.priv[0]) ++#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) ++ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd); ++ ++#include "bfq-ioc.c" ++#include "bfq-sched.c" ++#include "bfq-cgroup-included.c" ++ ++#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) ++#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) ++ ++#define bfq_sample_valid(samples) ((samples) > 80) ++ ++/* ++ * Scheduler run of queue, if there are requests pending and no one in the ++ * driver that will restart queueing. ++ */ ++static void bfq_schedule_dispatch(struct bfq_data *bfqd) ++{ ++ if (bfqd->queued != 0) { ++ bfq_log(bfqd, ""); ++ kblockd_schedule_work(&bfqd->unplug_work); ++ } ++} ++ ++/* ++ * Lifted from AS - choose which of rq1 and rq2 that is best served now. ++ * We choose the request that is closesr to the head right now. Distance ++ * behind the head is penalized and only allowed to a certain extent. ++ */ ++static struct request *bfq_choose_req(struct bfq_data *bfqd, ++ struct request *rq1, ++ struct request *rq2, ++ sector_t last) ++{ ++ sector_t s1, s2, d1 = 0, d2 = 0; ++ unsigned long back_max; ++#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ ++#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ ++ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ ++ ++ if (!rq1 || rq1 == rq2) ++ return rq2; ++ if (!rq2) ++ return rq1; ++ ++ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) ++ return rq1; ++ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) ++ return rq2; ++ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) ++ return rq1; ++ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) ++ return rq2; ++ ++ s1 = blk_rq_pos(rq1); ++ s2 = blk_rq_pos(rq2); ++ ++ /* ++ * By definition, 1KiB is 2 sectors. ++ */ ++ back_max = bfqd->bfq_back_max * 2; ++ ++ /* ++ * Strict one way elevator _except_ in the case where we allow ++ * short backward seeks which are biased as twice the cost of a ++ * similar forward seek. ++ */ ++ if (s1 >= last) ++ d1 = s1 - last; ++ else if (s1 + back_max >= last) ++ d1 = (last - s1) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ1_WRAP; ++ ++ if (s2 >= last) ++ d2 = s2 - last; ++ else if (s2 + back_max >= last) ++ d2 = (last - s2) * bfqd->bfq_back_penalty; ++ else ++ wrap |= BFQ_RQ2_WRAP; ++ ++ /* Found required data */ ++ ++ /* ++ * By doing switch() on the bit mask "wrap" we avoid having to ++ * check two variables for all permutations: --> faster! ++ */ ++ switch (wrap) { ++ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ ++ if (d1 < d2) ++ return rq1; ++ else if (d2 < d1) ++ return rq2; ++ ++ if (s1 >= s2) ++ return rq1; ++ else ++ return rq2; ++ ++ case BFQ_RQ2_WRAP: ++ return rq1; ++ case BFQ_RQ1_WRAP: ++ return rq2; ++ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ ++ default: ++ /* ++ * Since both rqs are wrapped, ++ * start with the one that's further behind head ++ * (--> only *one* back seek required), ++ * since back seek takes more time than forward. ++ */ ++ if (s1 <= s2) ++ return rq1; ++ else ++ return rq2; ++ } ++} ++ ++static struct bfq_queue * ++bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, ++ sector_t sector, struct rb_node **ret_parent, ++ struct rb_node ***rb_link) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *bfqq = NULL; ++ ++ parent = NULL; ++ p = &root->rb_node; ++ while (*p) { ++ struct rb_node **n; ++ ++ parent = *p; ++ bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ ++ /* ++ * Sort strictly based on sector. Smallest to the left, ++ * largest to the right. ++ */ ++ if (sector > blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_right; ++ else if (sector < blk_rq_pos(bfqq->next_rq)) ++ n = &(*p)->rb_left; ++ else ++ break; ++ p = n; ++ bfqq = NULL; ++ } ++ ++ *ret_parent = parent; ++ if (rb_link) ++ *rb_link = p; ++ ++ bfq_log(bfqd, "%llu: returning %d", ++ (unsigned long long) sector, ++ bfqq ? bfqq->pid : 0); ++ ++ return bfqq; ++} ++ ++static bool bfq_too_late_for_merging(struct bfq_queue *bfqq) ++{ ++ return bfqq->service_from_backlogged > 0 && ++ time_is_before_jiffies(bfqq->first_IO_time + ++ bfq_merge_time_limit); ++} ++ ++static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct rb_node **p, *parent; ++ struct bfq_queue *__bfqq; ++ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ ++ /* ++ * bfqq cannot be merged any longer (see comments in ++ * bfq_setup_cooperator): no point in adding bfqq into the ++ * position tree. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) ++ return; ++ ++ if (bfq_class_idle(bfqq)) ++ return; ++ if (!bfqq->next_rq) ++ return; ++ ++ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, ++ blk_rq_pos(bfqq->next_rq), &parent, &p); ++ if (!__bfqq) { ++ rb_link_node(&bfqq->pos_node, parent, p); ++ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); ++ } else ++ bfqq->pos_root = NULL; ++} ++ ++/* ++ * The following function returns true if every queue must receive the ++ * same share of the throughput (this condition is used when deciding ++ * whether idling may be disabled, see the comments in the function ++ * bfq_better_to_idle()). ++ * ++ * Such a scenario occurs when: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) all active groups at the same level in the groups tree have the same ++ * weight, ++ * 4) all active groups at the same level in the groups tree have the same ++ * number of children. ++ * ++ * Unfortunately, keeping the necessary state for evaluating exactly ++ * the last two symmetry sub-conditions above would be quite complex ++ * and time consuming. Therefore this function evaluates, instead, ++ * only the following stronger three sub-conditions, for which it is ++ * much easier to maintain the needed state: ++ * 1) all active queues have the same weight, ++ * 2) all active queues belong to the same I/O-priority class, ++ * 3) there are no active groups. ++ * In particular, the last condition is always true if hierarchical ++ * support or the cgroups interface are not enabled, thus no state ++ * needs to be maintained in this case. ++ */ ++static bool bfq_symmetric_scenario(struct bfq_data *bfqd) ++{ ++ /* ++ * For queue weights to differ, queue_weights_tree must contain ++ * at least two nodes. ++ */ ++ bool varied_queue_weights = !RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && ++ (bfqd->queue_weights_tree.rb_node->rb_left || ++ bfqd->queue_weights_tree.rb_node->rb_right); ++ ++ bool multiple_classes_busy = ++ (bfqd->busy_queues[0] && bfqd->busy_queues[1]) || ++ (bfqd->busy_queues[0] && bfqd->busy_queues[2]) || ++ (bfqd->busy_queues[1] && bfqd->busy_queues[2]); ++ ++ bfq_log(bfqd, "varied_queue_weights %d mul_classes %d", ++ varied_queue_weights, multiple_classes_busy); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfq_log(bfqd, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++#endif ++ ++ return !(varied_queue_weights || multiple_classes_busy ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ || bfqd->num_groups_with_pending_reqs > 0 ++#endif ++ ); ++} ++ ++/* ++ * If the weight-counter tree passed as input contains no counter for ++ * the weight of the input queue, then add that counter; otherwise just ++ * increment the existing counter. ++ * ++ * Note that weight-counter trees contain few nodes in mostly symmetric ++ * scenarios. For example, if all queues have the same weight, then the ++ * weight-counter tree for the queues may contain at most one node. ++ * This holds even if low_latency is on, because weight-raised queues ++ * are not inserted in the tree. ++ * In most scenarios, the rate at which nodes are created/destroyed ++ * should be low too. ++ */ ++static void bfq_weights_tree_add(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct rb_node **new = &(root->rb_node), *parent = NULL; ++ ++ /* ++ * Do not insert if the queue is already associated with a ++ * counter, which happens if: ++ * 1) a request arrival has caused the queue to become both ++ * non-weight-raised, and hence change its weight, and ++ * backlogged; in this respect, each of the two events ++ * causes an invocation of this function, ++ * 2) this is the invocation of this function caused by the ++ * second event. This second invocation is actually useless, ++ * and we handle this fact by exiting immediately. More ++ * efficient or clearer solutions might possibly be adopted. ++ */ ++ if (bfqq->weight_counter) ++ return; ++ ++ while (*new) { ++ struct bfq_weight_counter *__counter = container_of(*new, ++ struct bfq_weight_counter, ++ weights_node); ++ parent = *new; ++ ++ if (entity->weight == __counter->weight) { ++ bfqq->weight_counter = __counter; ++ goto inc_counter; ++ } ++ if (entity->weight < __counter->weight) ++ new = &((*new)->rb_left); ++ else ++ new = &((*new)->rb_right); ++ } ++ ++ bfqq->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), ++ GFP_ATOMIC); ++ ++ /* ++ * In the unlucky event of an allocation failure, we just ++ * exit. This will cause the weight of queue to not be ++ * considered in bfq_symmetric_scenario, which, in its turn, ++ * causes the scenario to be deemed wrongly symmetric in case ++ * bfqq's weight would have been the only weight making the ++ * scenario asymmetric. On the bright side, no unbalance will ++ * however occur when bfqq becomes inactive again (the ++ * invocation of this function is triggered by an activation ++ * of queue). In fact, bfq_weights_tree_remove does nothing ++ * if !bfqq->weight_counter. ++ */ ++ if (unlikely(!bfqq->weight_counter)) ++ return; ++ ++ bfqq->weight_counter->weight = entity->weight; ++ rb_link_node(&bfqq->weight_counter->weights_node, parent, new); ++ rb_insert_color(&bfqq->weight_counter->weights_node, root); ++ ++inc_counter: ++ bfqq->weight_counter->num_active++; ++ bfqq->ref++; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++} ++ ++/* ++ * Decrement the weight counter associated with the queue, and, if the ++ * counter reaches 0, remove the counter from the tree. ++ * See the comments to the function bfq_weights_tree_add() for considerations ++ * about overhead. ++ */ ++static void __bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct rb_root *root) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (!bfqq->weight_counter) ++ return; ++ ++ BUG_ON(RB_EMPTY_ROOT(root)); ++ BUG_ON(bfqq->weight_counter->weight != entity->weight); ++ ++ BUG_ON(!bfqq->weight_counter->num_active); ++ bfqq->weight_counter->num_active--; ++ ++ if (bfqq->weight_counter->num_active > 0) ++ goto reset_entity_pointer; ++ ++ rb_erase(&bfqq->weight_counter->weights_node, root); ++ kfree(bfqq->weight_counter); ++ ++reset_entity_pointer: ++ bfqq->weight_counter = NULL; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "refs %d weight %d symmetric %d", ++ bfqq->ref, ++ entity->weight, ++ bfq_symmetric_scenario(bfqd)); ++ bfq_put_queue(bfqq); ++} ++ ++/* ++ * Invoke __bfq_weights_tree_remove on bfqq and decrement the number ++ * of active groups for each queue's inactive parent entity. ++ */ ++static void bfq_weights_tree_remove(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = bfqq->entity.parent; ++ ++ for_each_entity(entity) { ++ struct bfq_sched_data *sd = entity->my_sched_data; ++ ++ BUG_ON(entity->sched_data == NULL); /* ++ * It would mean ++ * that this is ++ * the root group. ++ */ ++ ++ if (sd->next_in_service || sd->in_service_entity) { ++ BUG_ON(!entity->in_groups_with_pending_reqs); ++ /* ++ * entity is still active, because either ++ * next_in_service or in_service_entity is not ++ * NULL (see the comments on the definition of ++ * next_in_service for details on why ++ * in_service_entity must be checked too). ++ * ++ * As a consequence, its parent entities are ++ * active as well, and thus this loop must ++ * stop here. ++ */ ++ break; ++ } ++ ++ BUG_ON(!bfqd->num_groups_with_pending_reqs && ++ entity->in_groups_with_pending_reqs); ++ /* ++ * The decrement of num_groups_with_pending_reqs is ++ * not performed immediately upon the deactivation of ++ * entity, but it is delayed to when it also happens ++ * that the first leaf descendant bfqq of entity gets ++ * all its pending requests completed. The following ++ * instructions perform this delayed decrement, if ++ * needed. See the comments on ++ * num_groups_with_pending_reqs for details. ++ */ ++ if (entity->in_groups_with_pending_reqs) { ++ entity->in_groups_with_pending_reqs = false; ++ bfqd->num_groups_with_pending_reqs--; ++ } ++ bfq_log_bfqq(bfqd, bfqq, "num_groups_with_pending_reqs %u", ++ bfqd->num_groups_with_pending_reqs); ++ } ++ ++ /* ++ * Next function is invoked last, because it causes bfqq to be ++ * freed if the following holds: bfqq is not in service and ++ * has no dispatched request. DO NOT use bfqq after the next ++ * function invocation. ++ */ ++ __bfq_weights_tree_remove(bfqd, bfqq, ++ &bfqd->queue_weights_tree); ++} ++ ++/* ++ * Return expired entry, or NULL to just start from scratch in rbtree. ++ */ ++static struct request *bfq_check_fifo(struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct request *rq; ++ ++ if (bfq_bfqq_fifo_expire(bfqq)) ++ return NULL; ++ ++ bfq_mark_bfqq_fifo_expire(bfqq); ++ ++ rq = rq_entry_fifo(bfqq->fifo.next); ++ ++ if (rq == last || ktime_get_ns() < rq->fifo_time) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "returned %p", rq); ++ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); ++ return rq; ++} ++ ++static struct request *bfq_find_next_rq(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct request *last) ++{ ++ struct rb_node *rbnext = rb_next(&last->rb_node); ++ struct rb_node *rbprev = rb_prev(&last->rb_node); ++ struct request *next, *prev = NULL; ++ ++ BUG_ON(list_empty(&bfqq->fifo)); ++ ++ /* Follow expired path, else get first next available. */ ++ next = bfq_check_fifo(bfqq, last); ++ if (next) { ++ BUG_ON(next == last); ++ return next; ++ } ++ ++ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); ++ ++ if (rbprev) ++ prev = rb_entry_rq(rbprev); ++ ++ if (rbnext) ++ next = rb_entry_rq(rbnext); ++ else { ++ rbnext = rb_first(&bfqq->sort_list); ++ if (rbnext && rbnext != &last->rb_node) ++ next = rb_entry_rq(rbnext); ++ } ++ ++ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); ++} ++ ++/* see the definition of bfq_async_charge_factor for details */ ++static unsigned long bfq_serv_to_charge(struct request *rq, ++ struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1 || ++ !bfq_symmetric_scenario(bfqq->bfqd)) ++ return blk_rq_sectors(rq); ++ ++ return blk_rq_sectors(rq) * bfq_async_charge_factor; ++} ++ ++/** ++ * bfq_updated_next_req - update the queue after a new next_rq selection. ++ * @bfqd: the device data the queue belongs to. ++ * @bfqq: the queue to update. ++ * ++ * If the first request of a queue changes we make sure that the queue ++ * has enough budget to serve at least its first request (if the ++ * request has grown). We do this because if the queue has not enough ++ * budget for its first request, it has to go through two dispatch ++ * rounds to actually get it dispatched. ++ */ ++static void bfq_updated_next_req(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ struct bfq_service_tree *st = bfq_entity_service_tree(entity); ++ struct request *next_rq = bfqq->next_rq; ++ unsigned long new_budget; ++ ++ if (!next_rq) ++ return; ++ ++ if (bfqq == bfqd->in_service_queue) ++ /* ++ * In order not to break guarantees, budgets cannot be ++ * changed after an entity has been selected. ++ */ ++ return; ++ ++ BUG_ON(entity->tree != &st->active); ++ BUG_ON(entity == entity->sched_data->in_service_entity); ++ ++ new_budget = max_t(unsigned long, ++ max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)), ++ entity->service); ++ if (entity->budget != new_budget) { ++ entity->budget = new_budget; ++ bfq_log_bfqq(bfqd, bfqq, "new budget %lu", ++ new_budget); ++ bfq_requeue_bfqq(bfqd, bfqq, false); ++ } ++} ++ ++static unsigned int bfq_wr_duration(struct bfq_data *bfqd) ++{ ++ u64 dur; ++ ++ if (bfqd->bfq_wr_max_time > 0) ++ return bfqd->bfq_wr_max_time; ++ ++ dur = bfqd->rate_dur_prod; ++ do_div(dur, bfqd->peak_rate); ++ ++ /* ++ * Limit duration between 3 and 25 seconds. The upper limit ++ * has been conservatively set after the following worst case: ++ * on a QEMU/KVM virtual machine ++ * - running in a slow PC ++ * - with a virtual disk stacked on a slow low-end 5400rpm HDD ++ * - serving a heavy I/O workload, such as the sequential reading ++ * of several files ++ * mplayer took 23 seconds to start, if constantly weight-raised. ++ * ++ * As for higher values than that accomodating the above bad ++ * scenario, tests show that higher values would often yield ++ * the opposite of the desired result, i.e., would worsen ++ * responsiveness by allowing non-interactive applications to ++ * preserve weight raising for too long. ++ * ++ * On the other end, lower values than 3 seconds make it ++ * difficult for most interactive tasks to complete their jobs ++ * before weight-raising finishes. ++ */ ++ return clamp_val(dur, msecs_to_jiffies(3000), msecs_to_jiffies(25000)); ++} ++ ++/* switch back from soft real-time to interactive weight raising */ ++static void switch_back_to_interactive_wr(struct bfq_queue *bfqq, ++ struct bfq_data *bfqd) ++{ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ bfqq->last_wr_start_finish = bfqq->wr_start_at_switch_to_srt; ++} ++ ++static void ++bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_data *bfqd, ++ struct bfq_io_cq *bic, bool bfq_already_existing) ++{ ++ unsigned int old_wr_coeff; ++ bool busy = bfq_already_existing && bfq_bfqq_busy(bfqq); ++ ++ if (bic->saved_has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++ ++ if (bic->saved_IO_bound) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ else ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (unlikely(busy)) ++ old_wr_coeff = bfqq->wr_coeff; ++ ++ bfqq->wr_coeff = bic->saved_wr_coeff; ++ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; ++ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); ++ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; ++ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic %p wr_coeff %d start_finish %lu max_time %lu", ++ bic, bfqq->wr_coeff, bfqq->last_wr_start_finish, ++ bfqq->wr_cur_max_time); ++ ++ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || ++ time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time))) { ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ time_is_after_eq_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) { ++ switch_back_to_interactive_wr(bfqq, bfqd); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "switching back to interactive"); ++ } else { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "switching off wr (%lu + %lu < %lu)", ++ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, ++ jiffies); ++ } ++ } ++ ++ /* make sure weight will be updated, however we got here */ ++ bfqq->entity.prio_changed = 1; ++ ++ if (likely(!busy)) ++ return; ++ ++ if (old_wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ } else if (old_wr_coeff > 1 && bfqq->wr_coeff == 1) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++} ++ ++static int bfqq_process_refs(struct bfq_queue *bfqq) ++{ ++ int process_refs, io_refs; ++ ++ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); ++ ++ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; ++ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st - ++ (bfqq->weight_counter != NULL); ++ BUG_ON(process_refs < 0); ++ return process_refs; ++} ++ ++/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ ++static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *item; ++ struct hlist_node *n; ++ ++ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) ++ hlist_del_init(&item->burst_list_node); ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++ bfqd->burst_size = 1; ++ bfqd->burst_parent_entity = bfqq->entity.parent; ++} ++ ++/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ ++static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* Increment burst size to take into account also bfqq */ ++ bfqd->burst_size++; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%d", bfqd->burst_size); ++ ++ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); ++ ++ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { ++ struct bfq_queue *pos, *bfqq_item; ++ struct hlist_node *n; ++ ++ /* ++ * Enough queues have been activated shortly after each ++ * other to consider this burst as large. ++ */ ++ bfqd->large_burst = true; ++ bfq_log_bfqq(bfqd, bfqq, "large burst started"); ++ ++ /* ++ * We can now mark all queues in the burst list as ++ * belonging to a large burst. ++ */ ++ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, ++ burst_list_node) { ++ bfq_mark_bfqq_in_large_burst(bfqq_item); ++ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); ++ } ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); ++ ++ /* ++ * From now on, and until the current burst finishes, any ++ * new queue being activated shortly after the last queue ++ * was inserted in the burst can be immediately marked as ++ * belonging to a large burst. So the burst list is not ++ * needed any more. Remove it. ++ */ ++ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, ++ burst_list_node) ++ hlist_del_init(&pos->burst_list_node); ++ } else /* ++ * Burst not yet large: add bfqq to the burst list. Do ++ * not increment the ref counter for bfqq, because bfqq ++ * is removed from the burst list before freeing bfqq ++ * in put_queue. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); ++} ++ ++/* ++ * If many queues belonging to the same group happen to be created ++ * shortly after each other, then the processes associated with these ++ * queues have typically a common goal. In particular, bursts of queue ++ * creations are usually caused by services or applications that spawn ++ * many parallel threads/processes. Examples are systemd during boot, ++ * or git grep. To help these processes get their job done as soon as ++ * possible, it is usually better to not grant either weight-raising ++ * or device idling to their queues. ++ * ++ * In this comment we describe, firstly, the reasons why this fact ++ * holds, and, secondly, the next function, which implements the main ++ * steps needed to properly mark these queues so that they can then be ++ * treated in a different way. ++ * ++ * The above services or applications benefit mostly from a high ++ * throughput: the quicker the requests of the activated queues are ++ * cumulatively served, the sooner the target job of these queues gets ++ * completed. As a consequence, weight-raising any of these queues, ++ * which also implies idling the device for it, is almost always ++ * counterproductive. In most cases it just lowers throughput. ++ * ++ * On the other hand, a burst of queue creations may be caused also by ++ * the start of an application that does not consist of a lot of ++ * parallel I/O-bound threads. In fact, with a complex application, ++ * several short processes may need to be executed to start-up the ++ * application. In this respect, to start an application as quickly as ++ * possible, the best thing to do is in any case to privilege the I/O ++ * related to the application with respect to all other ++ * I/O. Therefore, the best strategy to start as quickly as possible ++ * an application that causes a burst of queue creations is to ++ * weight-raise all the queues created during the burst. This is the ++ * exact opposite of the best strategy for the other type of bursts. ++ * ++ * In the end, to take the best action for each of the two cases, the ++ * two types of bursts need to be distinguished. Fortunately, this ++ * seems relatively easy, by looking at the sizes of the bursts. In ++ * particular, we found a threshold such that only bursts with a ++ * larger size than that threshold are apparently caused by ++ * services or commands such as systemd or git grep. For brevity, ++ * hereafter we call just 'large' these bursts. BFQ *does not* ++ * weight-raise queues whose creation occurs in a large burst. In ++ * addition, for each of these queues BFQ performs or does not perform ++ * idling depending on which choice boosts the throughput more. The ++ * exact choice depends on the device and request pattern at ++ * hand. ++ * ++ * Unfortunately, false positives may occur while an interactive task ++ * is starting (e.g., an application is being started). The ++ * consequence is that the queues associated with the task do not ++ * enjoy weight raising as expected. Fortunately these false positives ++ * are very rare. They typically occur if some service happens to ++ * start doing I/O exactly when the interactive task starts. ++ * ++ * Turning back to the next function, it implements all the steps ++ * needed to detect the occurrence of a large burst and to properly ++ * mark all the queues belonging to it (so that they can then be ++ * treated in a different way). This goal is achieved by maintaining a ++ * "burst list" that holds, temporarily, the queues that belong to the ++ * burst in progress. The list is then used to mark these queues as ++ * belonging to a large burst if the burst does become large. The main ++ * steps are the following. ++ * ++ * . when the very first queue is created, the queue is inserted into the ++ * list (as it could be the first queue in a possible burst) ++ * ++ * . if the current burst has not yet become large, and a queue Q that does ++ * not yet belong to the burst is activated shortly after the last time ++ * at which a new queue entered the burst list, then the function appends ++ * Q to the burst list ++ * ++ * . if, as a consequence of the previous step, the burst size reaches ++ * the large-burst threshold, then ++ * ++ * . all the queues in the burst list are marked as belonging to a ++ * large burst ++ * ++ * . the burst list is deleted; in fact, the burst list already served ++ * its purpose (keeping temporarily track of the queues in a burst, ++ * so as to be able to mark them as belonging to a large burst in the ++ * previous sub-step), and now is not needed any more ++ * ++ * . the device enters a large-burst mode ++ * ++ * . if a queue Q that does not belong to the burst is created while ++ * the device is in large-burst mode and shortly after the last time ++ * at which a queue either entered the burst list or was marked as ++ * belonging to the current large burst, then Q is immediately marked ++ * as belonging to a large burst. ++ * ++ * . if a queue Q that does not belong to the burst is created a while ++ * later, i.e., not shortly after, than the last time at which a queue ++ * either entered the burst list or was marked as belonging to the ++ * current large burst, then the current burst is deemed as finished and: ++ * ++ * . the large-burst mode is reset if set ++ * ++ * . the burst list is emptied ++ * ++ * . Q is inserted in the burst list, as Q may be the first queue ++ * in a possible new burst (then the burst list contains just Q ++ * after this step). ++ */ ++static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq is already in the burst list or is part of a large ++ * burst, or finally has just been split, then there is ++ * nothing else to do. ++ */ ++ if (!hlist_unhashed(&bfqq->burst_list_node) || ++ bfq_bfqq_in_large_burst(bfqq) || ++ time_is_after_eq_jiffies(bfqq->split_time + ++ msecs_to_jiffies(10))) ++ return; ++ ++ /* ++ * If bfqq's creation happens late enough, or bfqq belongs to ++ * a different group than the burst group, then the current ++ * burst is finished, and related data structures must be ++ * reset. ++ * ++ * In this respect, consider the special case where bfqq is ++ * the very first queue created after BFQ is selected for this ++ * device. In this case, last_ins_in_burst and ++ * burst_parent_entity are not yet significant when we get ++ * here. But it is easy to verify that, whether or not the ++ * following condition is true, bfqq will end up being ++ * inserted into the burst list. In particular the list will ++ * happen to contain only bfqq. And this is exactly what has ++ * to happen, as bfqq may be the first queue of the first ++ * burst. ++ */ ++ if (time_is_before_jiffies(bfqd->last_ins_in_burst + ++ bfqd->bfq_burst_interval) || ++ bfqq->entity.parent != bfqd->burst_parent_entity) { ++ bfqd->large_burst = false; ++ bfq_reset_burst_list(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "late activation or different group"); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then bfqq is being activated shortly after the ++ * last queue. So, if the current burst is also large, we can mark ++ * bfqq as belonging to this large burst immediately. ++ */ ++ if (bfqd->large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, "marked in burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ goto end; ++ } ++ ++ /* ++ * If we get here, then a large-burst state has not yet been ++ * reached, but bfqq is being activated shortly after the last ++ * queue. Then we add bfqq to the burst. ++ */ ++ bfq_add_to_burst(bfqd, bfqq); ++end: ++ /* ++ * At this point, bfqq either has been added to the current ++ * burst or has caused the current burst to terminate and a ++ * possible new burst to start. In particular, in the second ++ * case, bfqq has become the first queue in the possible new ++ * burst. In both cases last_ins_in_burst needs to be moved ++ * forward. ++ */ ++ bfqd->last_ins_in_burst = jiffies; ++ ++} ++ ++static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (entity->budget < entity->service) { ++ pr_crit("budget %d service %d\n", ++ entity->budget, entity->service); ++ BUG(); ++ } ++ return entity->budget - entity->service; ++} ++ ++/* ++ * If enough samples have been computed, return the current max budget ++ * stored in bfqd, which is dynamically updated according to the ++ * estimated disk peak rate; otherwise return the default max budget ++ */ ++static int bfq_max_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget; ++ else ++ return bfqd->bfq_max_budget; ++} ++ ++/* ++ * Return min budget, which is a fraction of the current or default ++ * max budget (trying with 1/32) ++ */ ++static int bfq_min_budget(struct bfq_data *bfqd) ++{ ++ if (bfqd->budgets_assigned < bfq_stats_min_budgets) ++ return bfq_default_max_budget / 32; ++ else ++ return bfqd->bfq_max_budget / 32; ++} ++ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason); ++ ++/* ++ * The next function, invoked after the input queue bfqq switches from ++ * idle to busy, updates the budget of bfqq. The function also tells ++ * whether the in-service queue should be expired, by returning ++ * true. The purpose of expiring the in-service queue is to give bfqq ++ * the chance to possibly preempt the in-service queue, and the reason ++ * for preempting the in-service queue is to achieve one of the two ++ * goals below. ++ * ++ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has ++ * expired because it has remained idle. In particular, bfqq may have ++ * expired for one of the following two reasons: ++ * ++ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and ++ * did not make it to issue a new request before its last request ++ * was served; ++ * ++ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue ++ * a new request before the expiration of the idling-time. ++ * ++ * Even if bfqq has expired for one of the above reasons, the process ++ * associated with the queue may be however issuing requests greedily, ++ * and thus be sensitive to the bandwidth it receives (bfqq may have ++ * remained idle for other reasons: CPU high load, bfqq not enjoying ++ * idling, I/O throttling somewhere in the path from the process to ++ * the I/O scheduler, ...). But if, after every expiration for one of ++ * the above two reasons, bfqq has to wait for the service of at least ++ * one full budget of another queue before being served again, then ++ * bfqq is likely to get a much lower bandwidth or resource time than ++ * its reserved ones. To address this issue, two countermeasures need ++ * to be taken. ++ * ++ * First, the budget and the timestamps of bfqq need to be updated in ++ * a special way on bfqq reactivation: they need to be updated as if ++ * bfqq did not remain idle and did not expire. In fact, if they are ++ * computed as if bfqq expired and remained idle until reactivation, ++ * then the process associated with bfqq is treated as if, instead of ++ * being greedy, it stopped issuing requests when bfqq remained idle, ++ * and restarts issuing requests only on this reactivation. In other ++ * words, the scheduler does not help the process recover the "service ++ * hole" between bfqq expiration and reactivation. As a consequence, ++ * the process receives a lower bandwidth than its reserved one. In ++ * contrast, to recover this hole, the budget must be updated as if ++ * bfqq was not expired at all before this reactivation, i.e., it must ++ * be set to the value of the remaining budget when bfqq was ++ * expired. Along the same line, timestamps need to be assigned the ++ * value they had the last time bfqq was selected for service, i.e., ++ * before last expiration. Thus timestamps need to be back-shifted ++ * with respect to their normal computation (see [1] for more details ++ * on this tricky aspect). ++ * ++ * Secondly, to allow the process to recover the hole, the in-service ++ * queue must be expired too, to give bfqq the chance to preempt it ++ * immediately. In fact, if bfqq has to wait for a full budget of the ++ * in-service queue to be completed, then it may become impossible to ++ * let the process recover the hole, even if the back-shifted ++ * timestamps of bfqq are lower than those of the in-service queue. If ++ * this happens for most or all of the holes, then the process may not ++ * receive its reserved bandwidth. In this respect, it is worth noting ++ * that, being the service of outstanding requests unpreemptible, a ++ * little fraction of the holes may however be unrecoverable, thereby ++ * causing a little loss of bandwidth. ++ * ++ * The last important point is detecting whether bfqq does need this ++ * bandwidth recovery. In this respect, the next function deems the ++ * process associated with bfqq greedy, and thus allows it to recover ++ * the hole, if: 1) the process is waiting for the arrival of a new ++ * request (which implies that bfqq expired for one of the above two ++ * reasons), and 2) such a request has arrived soon. The first ++ * condition is controlled through the flag non_blocking_wait_rq, ++ * while the second through the flag arrived_in_time. If both ++ * conditions hold, then the function computes the budget in the ++ * above-described special way, and signals that the in-service queue ++ * should be expired. Timestamp back-shifting is done later in ++ * __bfq_activate_entity. ++ * ++ * 2. Reduce latency. Even if timestamps are not backshifted to let ++ * the process associated with bfqq recover a service hole, bfqq may ++ * however happen to have, after being (re)activated, a lower finish ++ * timestamp than the in-service queue. That is, the next budget of ++ * bfqq may have to be completed before the one of the in-service ++ * queue. If this is the case, then preempting the in-service queue ++ * allows this goal to be achieved, apart from the unpreemptible, ++ * outstanding requests mentioned above. ++ * ++ * Unfortunately, regardless of which of the above two goals one wants ++ * to achieve, service trees need first to be updated to know whether ++ * the in-service queue must be preempted. To have service trees ++ * correctly updated, the in-service queue must be expired and ++ * rescheduled, and bfqq must be scheduled too. This is one of the ++ * most costly operations (in future versions, the scheduling ++ * mechanism may be re-designed in such a way to make it possible to ++ * know whether preemption is needed without needing to update service ++ * trees). In addition, queue preemptions almost always cause random ++ * I/O, and thus loss of throughput. Because of these facts, the next ++ * function adopts the following simple scheme to avoid both costly ++ * operations and too frequent preemptions: it requests the expiration ++ * of the in-service queue (unconditionally) only for queues that need ++ * to recover a hole, or that either are weight-raised or deserve to ++ * be weight-raised. ++ */ ++static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool arrived_in_time, ++ bool wr_or_deserves_wr) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ /* ++ * In the next compound condition, we check also whether there ++ * is some budget left, because otherwise there is no point in ++ * trying to go on serving bfqq with this same budget: bfqq ++ * would be expired immediately after being selected for ++ * service. This would only cause useless overhead. ++ */ ++ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time && ++ bfq_bfqq_budget_left(bfqq) > 0) { ++ /* ++ * We do not clear the flag non_blocking_wait_rq here, as ++ * the latter is used in bfq_activate_bfqq to signal ++ * that timestamps need to be back-shifted (and is ++ * cleared right after). ++ */ ++ ++ /* ++ * In next assignment we rely on that either ++ * entity->service or entity->budget are not updated ++ * on expiration if bfqq is empty (see ++ * __bfq_bfqq_recalc_budget). Thus both quantities ++ * remain unchanged after such an expiration, and the ++ * following statement therefore assigns to ++ * entity->budget the remaining budget on such an ++ * expiration. ++ */ ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = min_t(unsigned long, ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->max_budget); ++ ++ BUG_ON(entity->budget < 0); ++ ++ /* ++ * At this point, we have used entity->service to get ++ * the budget left (needed for updating ++ * entity->budget). Thus we finally can, and have to, ++ * reset entity->service. The latter must be reset ++ * because bfqq would otherwise be charged again for ++ * the service it has received during its previous ++ * service slot(s). ++ */ ++ entity->service = 0; ++ ++ return true; ++ } ++ ++ /* ++ * We can finally complete expiration, by setting service to 0. ++ */ ++ entity->service = 0; ++ BUG_ON(bfqq->max_budget < 0); ++ entity->budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(bfqq->next_rq, bfqq)); ++ BUG_ON(entity->budget < 0); ++ ++ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); ++ return wr_or_deserves_wr; ++} ++ ++/* ++ * Return the farthest past time instant according to jiffies ++ * macros. ++ */ ++static unsigned long bfq_smallest_from_now(void) ++{ ++ return jiffies - MAX_JIFFY_OFFSET; ++} ++ ++static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ unsigned int old_wr_coeff, ++ bool wr_or_deserves_wr, ++ bool interactive, ++ bool in_burst, ++ bool soft_rt) ++{ ++ if (old_wr_coeff == 1 && wr_or_deserves_wr) { ++ /* start a weight-raising period */ ++ if (interactive) { ++ bfqq->service_from_wr = 0; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else { ++ /* ++ * No interactive weight raising in progress ++ * here: assign minus infinity to ++ * wr_start_at_switch_to_srt, to make sure ++ * that, at the end of the soft-real-time ++ * weight raising periods that is starting ++ * now, no interactive weight-raising period ++ * may be wrongly considered as still in ++ * progress (and thus actually started by ++ * mistake). ++ */ ++ bfqq->wr_start_at_switch_to_srt = ++ bfq_smallest_from_now(); ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ } ++ /* ++ * If needed, further reduce budget to make sure it is ++ * close to bfqq's backlog, so as to reduce the ++ * scheduling-error component due to a too large ++ * budget. Do not care about throughput consequences, ++ * but only about latency. Finally, do not assign a ++ * too small budget either, to avoid increasing ++ * latency by causing too frequent expirations. ++ */ ++ bfqq->entity.budget = min_t(unsigned long, ++ bfqq->entity.budget, ++ 2 * bfq_min_budget(bfqd)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais starting at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } else if (old_wr_coeff > 1) { ++ if (interactive) { /* update wr coeff and duration */ ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ } else if (in_burst) { ++ bfqq->wr_coeff = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ jiffies, ++ jiffies_to_msecs(bfqq-> ++ wr_cur_max_time)); ++ } else if (soft_rt) { ++ /* ++ * The application is now or still meeting the ++ * requirements for being deemed soft rt. We ++ * can then correctly and safely (re)charge ++ * the weight-raising duration for the ++ * application with the weight-raising ++ * duration for soft rt applications. ++ * ++ * In particular, doing this recharge now, i.e., ++ * before the weight-raising period for the ++ * application finishes, reduces the probability ++ * of the following negative scenario: ++ * 1) the weight of a soft rt application is ++ * raised at startup (as for any newly ++ * created application), ++ * 2) since the application is not interactive, ++ * at a certain time weight-raising is ++ * stopped for the application, ++ * 3) at that time the application happens to ++ * still have pending requests, and hence ++ * is destined to not have a chance to be ++ * deemed soft rt before these requests are ++ * completed (see the comments to the ++ * function bfq_bfqq_softrt_next_start() ++ * for details on soft rt detection), ++ * 4) these pending requests experience a high ++ * latency because the application is not ++ * weight-raised while they are pending. ++ */ ++ if (bfqq->wr_cur_max_time != ++ bfqd->bfq_wr_rt_max_time) { ++ bfqq->wr_start_at_switch_to_srt = ++ bfqq->last_wr_start_finish; ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfqq->wr_cur_max_time = ++ bfqd->bfq_wr_rt_max_time; ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff * ++ BFQ_SOFTRT_WEIGHT_FACTOR; ++ bfq_log_bfqq(bfqd, bfqq, ++ "switching to soft_rt wr"); ++ } else ++ bfq_log_bfqq(bfqd, bfqq, ++ "moving forward soft_rt wr duration"); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++} ++ ++static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ return bfqq->dispatched == 0 && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ bfqd->bfq_wr_min_idle_time); ++} ++ ++static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ int old_wr_coeff, ++ struct request *rq, ++ bool *interactive) ++{ ++ bool soft_rt, in_burst, wr_or_deserves_wr, ++ bfqq_wants_to_preempt, ++ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), ++ /* ++ * See the comments on ++ * bfq_bfqq_update_budg_for_activation for ++ * details on the usage of the next variable. ++ */ ++ arrived_in_time = ktime_get_ns() <= ++ RQ_BIC(rq)->ttime.last_end_request + ++ bfqd->bfq_slice_idle * 3; ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request non-busy: " ++ "jiffies %lu, in_time %d, idle_long %d busyw %d " ++ "wr_coeff %u", ++ jiffies, arrived_in_time, ++ idle_for_long_time, ++ bfq_bfqq_non_blocking_wait_rq(bfqq), ++ old_wr_coeff); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); ++ ++ /* ++ * bfqq deserves to be weight-raised if: ++ * - it is sync, ++ * - it does not belong to a large burst, ++ * - it has been idle for enough time or is soft real-time, ++ * - is linked to a bfq_io_cq (it is not shared in any sense) ++ */ ++ in_burst = bfq_bfqq_in_large_burst(bfqq); ++ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && ++ !in_burst && ++ time_is_before_jiffies(bfqq->soft_rt_next_start) && ++ bfqq->dispatched == 0; ++ *interactive = ++ !in_burst && ++ idle_for_long_time; ++ wr_or_deserves_wr = bfqd->low_latency && ++ (bfqq->wr_coeff > 1 || ++ (bfq_bfqq_sync(bfqq) && ++ bfqq->bic && (*interactive || soft_rt))); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfq_add_request: " ++ "in_burst %d, " ++ "soft_rt %d (next %lu), inter %d, bic %p", ++ bfq_bfqq_in_large_burst(bfqq), soft_rt, ++ bfqq->soft_rt_next_start, ++ *interactive, ++ bfqq->bic); ++ ++ /* ++ * Using the last flag, update budget and check whether bfqq ++ * may want to preempt the in-service queue. ++ */ ++ bfqq_wants_to_preempt = ++ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, ++ arrived_in_time, ++ wr_or_deserves_wr); ++ ++ /* ++ * If bfqq happened to be activated in a burst, but has been ++ * idle for much more than an interactive queue, then we ++ * assume that, in the overall I/O initiated in the burst, the ++ * I/O associated with bfqq is finished. So bfqq does not need ++ * to be treated as a queue belonging to a burst ++ * anymore. Accordingly, we reset bfqq's in_large_burst flag ++ * if set, and remove bfqq from the burst list if it's ++ * there. We do not decrement burst_size, because the fact ++ * that bfqq does not need to belong to the burst list any ++ * more does not invalidate the fact that bfqq was created in ++ * a burst. ++ */ ++ if (likely(!bfq_bfqq_just_created(bfqq)) && ++ idle_for_long_time && ++ time_is_before_jiffies( ++ bfqq->budget_timeout + ++ msecs_to_jiffies(10000))) { ++ hlist_del_init(&bfqq->burst_list_node); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ } ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ ++ if (!bfq_bfqq_IO_bound(bfqq)) { ++ if (arrived_in_time) { ++ bfqq->requests_within_timer++; ++ if (bfqq->requests_within_timer >= ++ bfqd->bfq_requests_within_timer) ++ bfq_mark_bfqq_IO_bound(bfqq); ++ } else ++ bfqq->requests_within_timer = 0; ++ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", ++ bfqq->requests_within_timer); ++ } ++ ++ if (bfqd->low_latency) { ++ if (unlikely(time_is_after_jiffies(bfqq->split_time))) ++ /* wraparound */ ++ bfqq->split_time = ++ jiffies - bfqd->bfq_wr_min_idle_time - 1; ++ ++ if (time_is_before_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) { ++ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, ++ old_wr_coeff, ++ wr_or_deserves_wr, ++ *interactive, ++ in_burst, ++ soft_rt); ++ ++ if (old_wr_coeff != bfqq->wr_coeff) ++ bfqq->entity.prio_changed = 1; ++ } ++ } ++ ++ bfqq->last_idle_bklogged = jiffies; ++ bfqq->service_from_backlogged = 0; ++ bfq_clear_bfqq_softrt_update(bfqq); ++ ++ bfq_add_bfqq_busy(bfqd, bfqq); ++ ++ /* ++ * Expire in-service queue only if preemption may be needed ++ * for guarantees. In this respect, the function ++ * next_queue_may_preempt just checks a simple, necessary ++ * condition, and not a sufficient condition based on ++ * timestamps. In fact, for the latter condition to be ++ * evaluated, timestamps would need first to be updated, and ++ * this operation is quite costly (see the comments on the ++ * function bfq_bfqq_update_budg_for_activation). ++ */ ++ if (bfqd->in_service_queue && bfqq_wants_to_preempt && ++ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && ++ next_queue_may_preempt(bfqd)) { ++ struct bfq_queue *in_serv = ++ bfqd->in_service_queue; ++ BUG_ON(in_serv == bfqq); ++ ++ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, ++ false, BFQ_BFQQ_PREEMPTED); ++ } ++} ++ ++static void bfq_add_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *next_rq, *prev; ++ unsigned int old_wr_coeff = bfqq->wr_coeff; ++ bool interactive = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "size %u %s", ++ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); ++ ++ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ bfqq->queued[rq_is_sync(rq)]++; ++ bfqd->queued++; ++ ++ elv_rb_add(&bfqq->sort_list, rq); ++ ++ /* ++ * Check if this request is a better next-to-serve candidate. ++ */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ ++ /* ++ * Adjust priority tree position, if next_rq changes. ++ */ ++ if (prev != bfqq->next_rq) ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ ++ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... */ ++ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, ++ rq, &interactive); ++ else { ++ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && ++ time_is_before_jiffies( ++ bfqq->last_wr_start_finish + ++ bfqd->bfq_wr_min_inter_arr_async)) { ++ bfqq->wr_coeff = bfqd->bfq_wr_coeff; ++ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); ++ ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > bfq_tot_busy_queues(bfqd)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "non-idle wrais starting, " ++ "wr_max_time %u wr_busy %d", ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqd->wr_busy_queues); ++ } ++ if (prev != bfqq->next_rq) ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ /* ++ * Assign jiffies to last_wr_start_finish in the following ++ * cases: ++ * ++ * . if bfqq is not going to be weight-raised, because, for ++ * non weight-raised queues, last_wr_start_finish stores the ++ * arrival time of the last request; as of now, this piece ++ * of information is used only for deciding whether to ++ * weight-raise async queues ++ * ++ * . if bfqq is not weight-raised, because, if bfqq is now ++ * switching to weight-raised, then last_wr_start_finish ++ * stores the time when weight-raising starts ++ * ++ * . if bfqq is interactive, because, regardless of whether ++ * bfqq is currently weight-raised, the weight-raising ++ * period must start or restart (this case is considered ++ * separately because it is not detected by the above ++ * conditions, if bfqq is already weight-raised) ++ * ++ * last_wr_start_finish has to be updated also if bfqq is soft ++ * real-time, because the weight-raising period is constantly ++ * restarted on idle-to-busy transitions for these queues, but ++ * this is already done in bfq_bfqq_handle_idle_busy_switch if ++ * needed. ++ */ ++ if (bfqd->low_latency && ++ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) ++ bfqq->last_wr_start_finish = jiffies; ++} ++ ++static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, ++ struct bio *bio) ++{ ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return NULL; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); ++ if (bfqq) ++ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); ++ ++ return NULL; ++} ++ ++static sector_t get_sdist(sector_t last_pos, struct request *rq) ++{ ++ sector_t sdist = 0; ++ ++ if (last_pos) { ++ if (last_pos < blk_rq_pos(rq)) ++ sdist = blk_rq_pos(rq) - last_pos; ++ else ++ sdist = last_pos - blk_rq_pos(rq); ++ } ++ ++ return sdist; ++} ++ ++static void bfq_activate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bfqd->rq_in_driver++; ++} ++ ++static void bfq_deactivate_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ ++ BUG_ON(bfqd->rq_in_driver == 0); ++ bfqd->rq_in_driver--; ++} ++ ++static void bfq_remove_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ const int sync = rq_is_sync(rq); ++ ++ /* ++ * NOTE: ++ * (bfqq->entity.service > bfqq->entity.budget) may hold here, ++ * in case of forced dispatches. ++ */ ++ ++ if (bfqq->next_rq == rq) { ++ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); ++ bfq_updated_next_req(bfqd, bfqq); ++ } ++ ++ if (rq->queuelist.prev != &rq->queuelist) ++ list_del_init(&rq->queuelist); ++ BUG_ON(bfqq->queued[sync] == 0); ++ bfqq->queued[sync]--; ++ bfqd->queued--; ++ elv_rb_del(&bfqq->sort_list, rq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ bfqq->next_rq = NULL; ++ ++ BUG_ON(bfqq->entity.budget < 0); ++ ++ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { ++ BUG_ON(bfqq->ref < 2); /* referred by rq and on tree */ ++ bfq_del_bfqq_busy(bfqd, bfqq, false); ++ /* ++ * bfqq emptied. In normal operation, when ++ * bfqq is empty, bfqq->entity.service and ++ * bfqq->entity.budget must contain, ++ * respectively, the service received and the ++ * budget used last time bfqq emptied. These ++ * facts do not hold in this case, as at least ++ * this last removal occurred while bfqq is ++ * not in service. To avoid inconsistencies, ++ * reset both bfqq->entity.service and ++ * bfqq->entity.budget, if bfqq has still a ++ * process that may issue I/O requests to it. ++ */ ++ bfqq->entity.budget = bfqq->entity.service = 0; ++ } ++ ++ /* ++ * Remove queue from request-position tree as it is empty. ++ */ ++ if (bfqq->pos_root) { ++ rb_erase(&bfqq->pos_node, bfqq->pos_root); ++ bfqq->pos_root = NULL; ++ } ++ } else { ++ BUG_ON(!bfqq->next_rq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ if (rq->cmd_flags & REQ_META) { ++ BUG_ON(bfqq->meta_pending == 0); ++ bfqq->meta_pending--; ++ } ++ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); ++} ++ ++static enum elv_merge bfq_merge(struct request_queue *q, struct request **req, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct request *__rq; ++ ++ __rq = bfq_find_rq_fmerge(bfqd, bio); ++ if (__rq && elv_bio_merge_ok(__rq, bio)) { ++ *req = __rq; ++ return ELEVATOR_FRONT_MERGE; ++ } ++ ++ return ELEVATOR_NO_MERGE; ++} ++ ++static void bfq_merged_request(struct request_queue *q, struct request *req, ++ enum elv_merge type) ++{ ++ if (type == ELEVATOR_FRONT_MERGE && ++ rb_prev(&req->rb_node) && ++ blk_rq_pos(req) < ++ blk_rq_pos(container_of(rb_prev(&req->rb_node), ++ struct request, rb_node))) { ++ struct bfq_queue *bfqq = RQ_BFQQ(req); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ struct request *prev, *next_rq; ++ ++ /* Reposition request in its sort_list */ ++ elv_rb_del(&bfqq->sort_list, req); ++ elv_rb_add(&bfqq->sort_list, req); ++ /* Choose next request to be served for bfqq */ ++ prev = bfqq->next_rq; ++ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, ++ bfqd->last_position); ++ BUG_ON(!next_rq); ++ bfqq->next_rq = next_rq; ++ /* ++ * If next_rq changes, update both the queue's budget to ++ * fit the new request and the queue's position in its ++ * rq_pos_tree. ++ */ ++ if (prev != bfqq->next_rq) { ++ bfq_updated_next_req(bfqd, bfqq); ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ } ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_bio_merged(struct request_queue *q, struct request *req, ++ struct bio *bio) ++{ ++ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); ++} ++#endif ++ ++static void bfq_merged_requests(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); ++ ++ /* ++ * If next and rq belong to the same bfq_queue and next is older ++ * than rq, then reposition rq in the fifo (by substituting next ++ * with rq). Otherwise, if next and rq belong to different ++ * bfq_queues, never reposition rq: in fact, we would have to ++ * reposition it with respect to next's position in its own fifo, ++ * which would most certainly be too expensive with respect to ++ * the benefits. ++ */ ++ if (bfqq == next_bfqq && ++ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && ++ next->fifo_time < rq->fifo_time) { ++ list_del_init(&rq->queuelist); ++ list_replace_init(&next->queuelist, &rq->queuelist); ++ rq->fifo_time = next->fifo_time; ++ } ++ ++ if (bfqq->next_rq == next) ++ bfqq->next_rq = rq; ++ ++ bfq_remove_request(next); ++ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); ++} ++ ++/* Must be called with bfqq != NULL */ ++static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) ++{ ++ BUG_ON(!bfqq); ++ ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqq->bfqd->wr_busy_queues--; ++ BUG_ON(bfqq->bfqd->wr_busy_queues < 0); ++ } ++ bfqq->wr_coeff = 1; ++ bfqq->wr_cur_max_time = 0; ++ bfqq->last_wr_start_finish = jiffies; ++ /* ++ * Trigger a weight change on the next invocation of ++ * __bfq_entity_update_weight_prio. ++ */ ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wrais ending at %lu, rais_max_time %u", ++ bfqq->last_wr_start_finish, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "wr_busy %d", ++ bfqq->bfqd->wr_busy_queues); ++} ++ ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ if (bfqg->async_bfqq[i][j]) ++ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); ++ if (bfqg->async_idle_bfqq) ++ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); ++} ++ ++static void bfq_end_wr(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) ++ bfq_bfqq_end_wr(bfqq); ++ bfq_end_wr_async(bfqd); ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++} ++ ++static sector_t bfq_io_struct_pos(void *io_struct, bool request) ++{ ++ if (request) ++ return blk_rq_pos(io_struct); ++ else ++ return ((struct bio *)io_struct)->bi_iter.bi_sector; ++} ++ ++static int bfq_rq_close_to_sector(void *io_struct, bool request, ++ sector_t sector) ++{ ++ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= ++ BFQQ_CLOSE_THR; ++} ++ ++static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ sector_t sector) ++{ ++ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; ++ struct rb_node *parent, *node; ++ struct bfq_queue *__bfqq; ++ ++ if (RB_EMPTY_ROOT(root)) ++ return NULL; ++ ++ /* ++ * First, if we find a request starting at the end of the last ++ * request, choose it. ++ */ ++ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); ++ if (__bfqq) ++ return __bfqq; ++ ++ /* ++ * If the exact sector wasn't found, the parent of the NULL leaf ++ * will contain the closest sector (rq_pos_tree sorted by ++ * next_request position). ++ */ ++ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ if (blk_rq_pos(__bfqq->next_rq) < sector) ++ node = rb_next(&__bfqq->pos_node); ++ else ++ node = rb_prev(&__bfqq->pos_node); ++ if (!node) ++ return NULL; ++ ++ __bfqq = rb_entry(node, struct bfq_queue, pos_node); ++ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) ++ return __bfqq; ++ ++ return NULL; ++} ++ ++static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, ++ struct bfq_queue *cur_bfqq, ++ sector_t sector) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * We shall notice if some of the queues are cooperating, ++ * e.g., working closely on the same area of the device. In ++ * that case, we can group them together and: 1) don't waste ++ * time idling, and 2) serve the union of their requests in ++ * the best possible order for throughput. ++ */ ++ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); ++ if (!bfqq || bfqq == cur_bfqq) ++ return NULL; ++ ++ return bfqq; ++} ++ ++static struct bfq_queue * ++bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ int process_refs, new_process_refs; ++ struct bfq_queue *__bfqq; ++ ++ /* ++ * If there are no process references on the new_bfqq, then it is ++ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain ++ * may have dropped their last reference (not just their last process ++ * reference). ++ */ ++ if (!bfqq_process_refs(new_bfqq)) ++ return NULL; ++ ++ /* Avoid a circular list and skip interim queue merges. */ ++ while ((__bfqq = new_bfqq->new_bfqq)) { ++ if (__bfqq == bfqq) ++ return NULL; ++ new_bfqq = __bfqq; ++ } ++ ++ process_refs = bfqq_process_refs(bfqq); ++ new_process_refs = bfqq_process_refs(new_bfqq); ++ /* ++ * If the process for the bfqq has gone away, there is no ++ * sense in merging the queues. ++ */ ++ if (process_refs == 0 || new_process_refs == 0) ++ return NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", ++ new_bfqq->pid); ++ ++ /* ++ * Merging is just a redirection: the requests of the process ++ * owning one of the two queues are redirected to the other queue. ++ * The latter queue, in its turn, is set as shared if this is the ++ * first time that the requests of some process are redirected to ++ * it. ++ * ++ * We redirect bfqq to new_bfqq and not the opposite, because we ++ * are in the context of the process owning bfqq, hence we have ++ * the io_cq of this process. So we can immediately configure this ++ * io_cq to redirect the requests of the process to new_bfqq. ++ * ++ * NOTE, even if new_bfqq coincides with the in-service queue, the ++ * io_cq of new_bfqq is not available, because, if the in-service ++ * queue is shared, bfqd->in_service_bic may not point to the ++ * io_cq of the in-service queue. ++ * Redirecting the requests of the process owning bfqq to the ++ * currently in-service queue is in any case the best option, as ++ * we feed the in-service queue with new requests close to the ++ * last request served and, by doing so, hopefully increase the ++ * throughput. ++ */ ++ bfqq->new_bfqq = new_bfqq; ++ new_bfqq->ref += process_refs; ++ return new_bfqq; ++} ++ ++static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, ++ struct bfq_queue *new_bfqq) ++{ ++ if (bfq_too_late_for_merging(new_bfqq)) { ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "too late for bfq%d to be merged", ++ new_bfqq->pid); ++ return false; ++ } ++ ++ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || ++ (bfqq->ioprio_class != new_bfqq->ioprio_class)) ++ return false; ++ ++ /* ++ * If either of the queues has already been detected as seeky, ++ * then merging it with the other queue is unlikely to lead to ++ * sequential I/O. ++ */ ++ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) ++ return false; ++ ++ /* ++ * Interleaved I/O is known to be done by (some) applications ++ * only for reads, so it does not make sense to merge async ++ * queues. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) ++ return false; ++ ++ return true; ++} ++ ++/* ++ * Attempt to schedule a merge of bfqq with the currently in-service ++ * queue or with a close queue among the scheduled queues. Return ++ * NULL if no merge was scheduled, a pointer to the shared bfq_queue ++ * structure otherwise. ++ * ++ * The OOM queue is not allowed to participate to cooperation: in fact, since ++ * the requests temporarily redirected to the OOM queue could be redirected ++ * again to dedicated queues at any time, the state needed to correctly ++ * handle merging with the OOM queue would be quite complex and expensive ++ * to maintain. Besides, in such a critical condition as an out of memory, ++ * the benefits of queue merging may be little relevant, or even negligible. ++ * ++ * WARNING: queue merging may impair fairness among non-weight raised ++ * queues, for at least two reasons: 1) the original weight of a ++ * merged queue may change during the merged state, 2) even being the ++ * weight the same, a merged queue may be bloated with many more ++ * requests than the ones produced by its originally-associated ++ * process. ++ */ ++static struct bfq_queue * ++bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ void *io_struct, bool request) ++{ ++ struct bfq_queue *in_service_bfqq, *new_bfqq; ++ ++ /* ++ * Prevent bfqq from being merged if it has been created too ++ * long ago. The idea is that true cooperating processes, and ++ * thus their associated bfq_queues, are supposed to be ++ * created shortly after each other. This is the case, e.g., ++ * for KVM/QEMU and dump I/O threads. Basing on this ++ * assumption, the following filtering greatly reduces the ++ * probability that two non-cooperating processes, which just ++ * happen to do close I/O for some short time interval, have ++ * their queues merged by mistake. ++ */ ++ if (bfq_too_late_for_merging(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "would have looked for coop, but too late"); ++ return NULL; ++ } ++ ++ if (bfqq->new_bfqq) ++ return bfqq->new_bfqq; ++ ++ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) ++ return NULL; ++ ++ /* If there is only one backlogged queue, don't search. */ ++ if (bfq_tot_busy_queues(bfqd) == 1) ++ return NULL; ++ ++ in_service_bfqq = bfqd->in_service_queue; ++ ++ if (in_service_bfqq && in_service_bfqq != bfqq && ++ likely(in_service_bfqq != &bfqd->oom_bfqq) && ++ bfq_rq_close_to_sector(io_struct, request, bfqd->in_serv_last_pos) && ++ bfqq->entity.parent == in_service_bfqq->entity.parent && ++ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { ++ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); ++ if (new_bfqq) ++ return new_bfqq; ++ } ++ /* ++ * Check whether there is a cooperator among currently scheduled ++ * queues. The only thing we need is that the bio/request is not ++ * NULL, as we need it to establish whether a cooperator exists. ++ */ ++ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, ++ bfq_io_struct_pos(io_struct, request)); ++ ++ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); ++ ++ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && ++ bfq_may_be_close_cooperator(bfqq, new_bfqq)) ++ return bfq_setup_merge(bfqq, new_bfqq); ++ ++ return NULL; ++} ++ ++static void bfq_bfqq_save_state(struct bfq_queue *bfqq) ++{ ++ struct bfq_io_cq *bic = bfqq->bic; ++ ++ /* ++ * If !bfqq->bic, the queue is already shared or its requests ++ * have already been redirected to a shared queue; both idle window ++ * and weight raising state have already been saved. Do nothing. ++ */ ++ if (!bic) ++ return; ++ ++ bic->saved_has_short_ttime = bfq_bfqq_has_short_ttime(bfqq); ++ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); ++ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); ++ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); ++ if (unlikely(bfq_bfqq_just_created(bfqq) && ++ !bfq_bfqq_in_large_burst(bfqq) && ++ bfqq->bfqd->low_latency)) { ++ /* ++ * bfqq being merged ritgh after being created: bfqq ++ * would have deserved interactive weight raising, but ++ * did not make it to be set in a weight-raised state, ++ * because of this early merge. Store directly the ++ * weight-raising state that would have been assigned ++ * to bfqq, so that to avoid that bfqq unjustly fails ++ * to enjoy weight raising if split soon. ++ */ ++ bic->saved_wr_coeff = bfqq->bfqd->bfq_wr_coeff; ++ bic->saved_wr_cur_max_time = bfq_wr_duration(bfqq->bfqd); ++ bic->saved_last_wr_start_finish = jiffies; ++ } else { ++ bic->saved_wr_coeff = bfqq->wr_coeff; ++ bic->saved_wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; ++ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; ++ } ++ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); ++} ++ ++static void bfq_get_bic_reference(struct bfq_queue *bfqq) ++{ ++ /* ++ * If bfqq->bic has a non-NULL value, the bic to which it belongs ++ * is about to begin using a shared bfq_queue. ++ */ ++ if (bfqq->bic) ++ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); ++} ++ ++static void ++bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, ++ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", ++ (unsigned long) new_bfqq->pid); ++ /* Save weight raising and idle window of the merged queues */ ++ bfq_bfqq_save_state(bfqq); ++ bfq_bfqq_save_state(new_bfqq); ++ if (bfq_bfqq_IO_bound(bfqq)) ++ bfq_mark_bfqq_IO_bound(new_bfqq); ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ /* ++ * If bfqq is weight-raised, then let new_bfqq inherit ++ * weight-raising. To reduce false positives, neglect the case ++ * where bfqq has just been created, but has not yet made it ++ * to be weight-raised (which may happen because EQM may merge ++ * bfqq even before bfq_add_request is executed for the first ++ * time for bfqq). Handling this case would however be very ++ * easy, thanks to the flag just_created. ++ */ ++ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { ++ new_bfqq->wr_coeff = bfqq->wr_coeff; ++ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; ++ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; ++ new_bfqq->wr_start_at_switch_to_srt = ++ bfqq->wr_start_at_switch_to_srt; ++ if (bfq_bfqq_busy(new_bfqq)) { ++ bfqd->wr_busy_queues++; ++ BUG_ON(bfqd->wr_busy_queues > ++ bfq_tot_busy_queues(bfqd)); ++ } ++ ++ new_bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, new_bfqq, ++ "wr start after merge with %d, rais_max_time %u", ++ bfqq->pid, ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ ++ bfqq->wr_coeff = 1; ++ bfqq->entity.prio_changed = 1; ++ if (bfq_bfqq_busy(bfqq)) { ++ bfqd->wr_busy_queues--; ++ BUG_ON(bfqd->wr_busy_queues < 0); ++ } ++ ++ } ++ ++ bfq_log_bfqq(bfqd, new_bfqq, "wr_busy %d", ++ bfqd->wr_busy_queues); ++ ++ /* ++ * Grab a reference to the bic, to prevent it from being destroyed ++ * before being possibly touched by a bfq_split_bfqq(). ++ */ ++ bfq_get_bic_reference(bfqq); ++ bfq_get_bic_reference(new_bfqq); ++ /* ++ * Merge queues (that is, let bic redirect its requests to new_bfqq) ++ */ ++ bic_set_bfqq(bic, new_bfqq, 1); ++ bfq_mark_bfqq_coop(new_bfqq); ++ /* ++ * new_bfqq now belongs to at least two bics (it is a shared queue): ++ * set new_bfqq->bic to NULL. bfqq either: ++ * - does not belong to any bic any more, and hence bfqq->bic must ++ * be set to NULL, or ++ * - is a queue whose owning bics have already been redirected to a ++ * different queue, hence the queue is destined to not belong to ++ * any bic soon and bfqq->bic is already NULL (therefore the next ++ * assignment causes no harm). ++ */ ++ new_bfqq->bic = NULL; ++ bfqq->bic = NULL; ++ /* release process reference to bfqq */ ++ bfq_put_queue(bfqq); ++} ++ ++static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, ++ struct bio *bio) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ bool is_sync = op_is_sync(bio->bi_opf); ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq, *new_bfqq; ++ ++ /* ++ * Disallow merge of a sync bio into an async request. ++ */ ++ if (is_sync && !rq_is_sync(rq)) ++ return false; ++ ++ /* ++ * Lookup the bfqq that this bio will be queued with. Allow ++ * merge only if rq is queued there. ++ * Queue lock is held here. ++ */ ++ bic = bfq_bic_lookup(bfqd, current->io_context); ++ if (!bic) ++ return false; ++ ++ bfqq = bic_to_bfqq(bic, is_sync); ++ /* ++ * We take advantage of this function to perform an early merge ++ * of the queues of possible cooperating processes. ++ */ ++ if (bfqq) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); ++ if (new_bfqq) { ++ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); ++ /* ++ * If we get here, the bio will be queued in the ++ * shared queue, i.e., new_bfqq, so use new_bfqq ++ * to decide whether bio and rq can be merged. ++ */ ++ bfqq = new_bfqq; ++ } ++ } ++ ++ return bfqq == RQ_BFQQ(rq); ++} ++ ++static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, ++ struct request *next) ++{ ++ return RQ_BFQQ(rq) == RQ_BFQQ(next); ++} ++ ++/* ++ * Set the maximum time for the in-service queue to consume its ++ * budget. This prevents seeky processes from lowering the throughput. ++ * In practice, a time-slice service scheme is used with seeky ++ * processes. ++ */ ++static void bfq_set_budget_timeout(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ unsigned int timeout_coeff; ++ ++ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) ++ timeout_coeff = 1; ++ else ++ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; ++ ++ bfqd->last_budget_start = ktime_get(); ++ ++ bfqq->budget_timeout = jiffies + ++ bfqd->bfq_timeout * timeout_coeff; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%u", ++ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); ++} ++ ++static void __bfq_set_in_service_queue(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ if (bfqq) { ++ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); ++ bfq_mark_bfqq_must_alloc(bfqq); ++ bfq_clear_bfqq_fifo_expire(bfqq); ++ ++ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && ++ bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_before_jiffies(bfqq->budget_timeout)) { ++ /* ++ * For soft real-time queues, move the start ++ * of the weight-raising period forward by the ++ * time the queue has not received any ++ * service. Otherwise, a relatively long ++ * service delay is likely to cause the ++ * weight-raising period of the queue to end, ++ * because of the short duration of the ++ * weight-raising period of a soft real-time ++ * queue. It is worth noting that this move ++ * is not so dangerous for the other queues, ++ * because soft real-time queues are not ++ * greedy. ++ * ++ * To not add a further variable, we use the ++ * overloaded field budget_timeout to ++ * determine for how long the queue has not ++ * received service, i.e., how much time has ++ * elapsed since the queue expired. However, ++ * this is a little imprecise, because ++ * budget_timeout is set to jiffies if bfqq ++ * not only expires, but also remains with no ++ * request. ++ */ ++ if (time_after(bfqq->budget_timeout, ++ bfqq->last_wr_start_finish)) ++ bfqq->last_wr_start_finish += ++ jiffies - bfqq->budget_timeout; ++ else ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { ++ pr_crit( ++ "BFQ WARNING:last %lu budget %lu jiffies %lu", ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout, ++ jiffies); ++ pr_crit("diff %lu", jiffies - ++ max_t(unsigned long, ++ bfqq->last_wr_start_finish, ++ bfqq->budget_timeout)); ++ bfqq->last_wr_start_finish = jiffies; ++ } ++ } ++ ++ bfq_set_budget_timeout(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "cur-budget = %d prio_class %d", ++ bfqq->entity.budget, bfqq->ioprio_class); ++ } else ++ bfq_log(bfqd, "NULL"); ++ ++ bfqd->in_service_queue = bfqq; ++} ++ ++/* ++ * Get and set a new queue for service. ++ */ ++static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); ++ ++ __bfq_set_in_service_queue(bfqd, bfqq); ++ return bfqq; ++} ++ ++static void bfq_arm_slice_timer(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ struct bfq_io_cq *bic; ++ u32 sl; ++ ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ /* Processes have exited, don't wait. */ ++ bic = bfqd->in_service_bic; ++ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) ++ return; ++ ++ bfq_mark_bfqq_wait_request(bfqq); ++ ++ /* ++ * We don't want to idle for seeks, but we do want to allow ++ * fair distribution of slice time for a process doing back-to-back ++ * seeks. So allow a little bit of time for him to submit a new rq. ++ * ++ * To prevent processes with (partly) seeky workloads from ++ * being too ill-treated, grant them a small fraction of the ++ * assigned budget before reducing the waiting time to ++ * BFQ_MIN_TT. This happened to help reduce latency. ++ */ ++ sl = bfqd->bfq_slice_idle; ++ /* ++ * Unless the queue is being weight-raised or the scenario is ++ * asymmetric, grant only minimum idle time if the queue ++ * is seeky. A long idling is preserved for a weight-raised ++ * queue, or, more in general, in an asymemtric scenario, ++ * because a long idling is needed for guaranteeing to a queue ++ * its reserved share of the throughput (in particular, it is ++ * needed if the queue has a higher weight than some other ++ * queue). ++ */ ++ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ bfq_symmetric_scenario(bfqd)) ++ sl = min_t(u32, sl, BFQ_MIN_TT); ++ ++ bfqd->last_idling_start = ktime_get(); ++ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), ++ HRTIMER_MODE_REL); ++ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); ++ bfq_log(bfqd, "arm idle: %ld/%ld ms", ++ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); ++} ++ ++/* ++ * In autotuning mode, max_budget is dynamically recomputed as the ++ * amount of sectors transferred in timeout at the estimated peak ++ * rate. This enables BFQ to utilize a full timeslice with a full ++ * budget, even if the in-service queue is served at peak rate. And ++ * this maximises throughput with sequential workloads. ++ */ ++static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) ++{ ++ return (u64)bfqd->peak_rate * USEC_PER_MSEC * ++ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; ++} ++ ++/* ++ * Update parameters related to throughput and responsiveness, as a ++ * function of the estimated peak rate. See comments on ++ * bfq_calc_max_budget(), and on the ref_wr_duration array. ++ */ ++static void update_thr_responsiveness_params(struct bfq_data *bfqd) ++{ ++ if (bfqd->bfq_user_max_budget == 0) { ++ bfqd->bfq_max_budget = ++ bfq_calc_max_budget(bfqd); ++ BUG_ON(bfqd->bfq_max_budget < 0); ++ bfq_log(bfqd, "new max_budget = %d", ++ bfqd->bfq_max_budget); ++ } ++} ++ ++static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) ++{ ++ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ ++ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; ++ bfqd->peak_rate_samples = 1; ++ bfqd->sequential_samples = 0; ++ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = ++ blk_rq_sectors(rq); ++ } else /* no new rq dispatched, just reset the number of samples */ ++ bfqd->peak_rate_samples = 0; /* full re-init on next disp. */ ++ ++ bfq_log(bfqd, ++ "at end, sample %u/%u tot_sects %llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched); ++} ++ ++static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) ++{ ++ u32 rate, weight, divisor; ++ ++ /* ++ * For the convergence property to hold (see comments on ++ * bfq_update_peak_rate()) and for the assessment to be ++ * reliable, a minimum number of samples must be present, and ++ * a minimum amount of time must have elapsed. If not so, do ++ * not compute new rate. Just reset parameters, to get ready ++ * for a new evaluation attempt. ++ */ ++ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || ++ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { ++ bfq_log(bfqd, ++ "only resetting, delta_first %lluus samples %d", ++ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); ++ goto reset_computation; ++ } ++ ++ /* ++ * If a new request completion has occurred after last ++ * dispatch, then, to approximate the rate at which requests ++ * have been served by the device, it is more precise to ++ * extend the observation interval to the last completion. ++ */ ++ bfqd->delta_from_first = ++ max_t(u64, bfqd->delta_from_first, ++ bfqd->last_completion - bfqd->first_dispatch); ++ ++ BUG_ON(bfqd->delta_from_first == 0); ++ /* ++ * Rate computed in sects/usec, and not sects/nsec, for ++ * precision issues. ++ */ ++ rate = div64_ul(bfqd->tot_sectors_dispatched<<BFQ_RATE_SHIFT, ++ div_u64(bfqd->delta_from_first, NSEC_PER_USEC)); ++ ++ bfq_log(bfqd, ++"tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", ++ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ rate > 20<<BFQ_RATE_SHIFT); ++ ++ /* ++ * Peak rate not updated if: ++ * - the percentage of sequential dispatches is below 3/4 of the ++ * total, and rate is below the current estimated peak rate ++ * - rate is unreasonably high (> 20M sectors/sec) ++ */ ++ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && ++ rate <= bfqd->peak_rate) || ++ rate > 20<<BFQ_RATE_SHIFT) { ++ bfq_log(bfqd, ++ "goto reset, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ goto reset_computation; ++ } else { ++ bfq_log(bfqd, ++ "do update, samples %u/%u rate/peak %llu/%llu", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ } ++ ++ /* ++ * We have to update the peak rate, at last! To this purpose, ++ * we use a low-pass filter. We compute the smoothing constant ++ * of the filter as a function of the 'weight' of the new ++ * measured rate. ++ * ++ * As can be seen in next formulas, we define this weight as a ++ * quantity proportional to how sequential the workload is, ++ * and to how long the observation time interval is. ++ * ++ * The weight runs from 0 to 8. The maximum value of the ++ * weight, 8, yields the minimum value for the smoothing ++ * constant. At this minimum value for the smoothing constant, ++ * the measured rate contributes for half of the next value of ++ * the estimated peak rate. ++ * ++ * So, the first step is to compute the weight as a function ++ * of how sequential the workload is. Note that the weight ++ * cannot reach 9, because bfqd->sequential_samples cannot ++ * become equal to bfqd->peak_rate_samples, which, in its ++ * turn, holds true because bfqd->sequential_samples is not ++ * incremented for the first sample. ++ */ ++ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; ++ ++ /* ++ * Second step: further refine the weight as a function of the ++ * duration of the observation interval. ++ */ ++ weight = min_t(u32, 8, ++ div_u64(weight * bfqd->delta_from_first, ++ BFQ_RATE_REF_INTERVAL)); ++ ++ /* ++ * Divisor ranging from 10, for minimum weight, to 2, for ++ * maximum weight. ++ */ ++ divisor = 10 - weight; ++ BUG_ON(divisor == 0); ++ ++ /* ++ * Finally, update peak rate: ++ * ++ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor ++ */ ++ bfqd->peak_rate *= divisor-1; ++ bfqd->peak_rate /= divisor; ++ rate /= divisor; /* smoothing constant alpha = 1/divisor */ ++ ++ bfq_log(bfqd, ++ "divisor %d tmp_peak_rate %llu tmp_rate %u", ++ divisor, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), ++ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); ++ ++ BUG_ON(bfqd->peak_rate == 0); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++ bfqd->peak_rate += rate; ++ ++ /* ++ * For a very slow device, bfqd->peak_rate can reach 0 (see ++ * the minimum representable values reported in the comments ++ * on BFQ_RATE_SHIFT). Push to 1 if this happens, to avoid ++ * divisions by zero where bfqd->peak_rate is used as a ++ * divisor. ++ */ ++ bfqd->peak_rate = max_t(u32, 1, bfqd->peak_rate); ++ ++ update_thr_responsiveness_params(bfqd); ++ BUG_ON(bfqd->peak_rate > 20<<BFQ_RATE_SHIFT); ++ ++reset_computation: ++ bfq_reset_rate_computation(bfqd, rq); ++} ++ ++/* ++ * Update the read/write peak rate (the main quantity used for ++ * auto-tuning, see update_thr_responsiveness_params()). ++ * ++ * It is not trivial to estimate the peak rate (correctly): because of ++ * the presence of sw and hw queues between the scheduler and the ++ * device components that finally serve I/O requests, it is hard to ++ * say exactly when a given dispatched request is served inside the ++ * device, and for how long. As a consequence, it is hard to know ++ * precisely at what rate a given set of requests is actually served ++ * by the device. ++ * ++ * On the opposite end, the dispatch time of any request is trivially ++ * available, and, from this piece of information, the "dispatch rate" ++ * of requests can be immediately computed. So, the idea in the next ++ * function is to use what is known, namely request dispatch times ++ * (plus, when useful, request completion times), to estimate what is ++ * unknown, namely in-device request service rate. ++ * ++ * The main issue is that, because of the above facts, the rate at ++ * which a certain set of requests is dispatched over a certain time ++ * interval can vary greatly with respect to the rate at which the ++ * same requests are then served. But, since the size of any ++ * intermediate queue is limited, and the service scheme is lossless ++ * (no request is silently dropped), the following obvious convergence ++ * property holds: the number of requests dispatched MUST become ++ * closer and closer to the number of requests completed as the ++ * observation interval grows. This is the key property used in ++ * the next function to estimate the peak service rate as a function ++ * of the observed dispatch rate. The function assumes to be invoked ++ * on every request dispatch. ++ */ ++static void bfq_update_peak_rate(struct bfq_data *bfqd, struct request *rq) ++{ ++ u64 now_ns = ktime_get_ns(); ++ ++ if (bfqd->peak_rate_samples == 0) { /* first dispatch */ ++ bfq_log(bfqd, ++ "goto reset, samples %d", ++ bfqd->peak_rate_samples) ; ++ bfq_reset_rate_computation(bfqd, rq); ++ goto update_last_values; /* will add one sample */ ++ } ++ ++ /* ++ * Device idle for very long: the observation interval lasting ++ * up to this dispatch cannot be a valid observation interval ++ * for computing a new peak rate (similarly to the late- ++ * completion event in bfq_completed_request()). Go to ++ * update_rate_and_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - start a new observation interval with this dispatch ++ */ ++ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && ++ bfqd->rq_in_driver == 0) { ++ bfq_log(bfqd, ++"jumping to updating&resetting delta_last %lluus samples %d", ++ (now_ns - bfqd->last_dispatch)>>10, ++ bfqd->peak_rate_samples) ; ++ goto update_rate_and_reset; ++ } ++ ++ /* Update sampling information */ ++ bfqd->peak_rate_samples++; ++ ++ if ((bfqd->rq_in_driver > 0 || ++ now_ns - bfqd->last_completion < BFQ_MIN_TT) ++ && !BFQ_RQ_SEEKY(bfqd, bfqd->last_position, rq)) ++ bfqd->sequential_samples++; ++ ++ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); ++ ++ /* Reset max observed rq size every 32 dispatches */ ++ if (likely(bfqd->peak_rate_samples % 32)) ++ bfqd->last_rq_max_size = ++ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); ++ else ++ bfqd->last_rq_max_size = blk_rq_sectors(rq); ++ ++ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; ++ ++ bfq_log(bfqd, ++ "added samples %u/%u tot_sects %llu delta_first %lluus", ++ bfqd->peak_rate_samples, bfqd->sequential_samples, ++ bfqd->tot_sectors_dispatched, ++ bfqd->delta_from_first>>10); ++ ++ /* Target observation interval not yet reached, go on sampling */ ++ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) ++ goto update_last_values; ++ ++update_rate_and_reset: ++ bfq_update_rate_reset(bfqd, rq); ++update_last_values: ++ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ if (RQ_BFQQ(rq) == bfqd->in_service_queue) ++ bfqd->in_serv_last_pos = bfqd->last_position; ++ bfqd->last_dispatch = now_ns; ++ ++ bfq_log(bfqd, ++ "delta_first %lluus last_pos %llu peak_rate %llu", ++ (now_ns - bfqd->first_dispatch)>>10, ++ (unsigned long long) bfqd->last_position, ++ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); ++ bfq_log(bfqd, ++ "samples at end %d", bfqd->peak_rate_samples); ++} ++ ++/* ++ * Move request from internal lists to the dispatch list of the request queue ++ */ ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ /* ++ * For consistency, the next instruction should have been executed ++ * after removing the request from the queue and dispatching it. ++ * We execute instead this instruction before bfq_remove_request() ++ * (and hence introduce a temporary inconsistency), for efficiency. ++ * In fact, in a forced_dispatch, this prevents two counters related ++ * to bfqq->dispatched to risk to be uselessly decremented if bfqq ++ * is not in service, and then to be incremented again after ++ * incrementing bfqq->dispatched. ++ */ ++ bfqq->dispatched++; ++ bfq_update_peak_rate(q->elevator->elevator_data, rq); ++ ++ bfq_remove_request(rq); ++ elv_dispatch_sort(q, rq); ++} ++ ++static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * If this bfqq is shared between multiple processes, check ++ * to make sure that those processes are still issuing I/Os ++ * within the mean seek distance. If not, it may be time to ++ * break the queues apart again. ++ */ ++ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) ++ bfq_mark_bfqq_split_coop(bfqq); ++ ++ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ if (bfqq->dispatched == 0) ++ /* ++ * Overloading budget_timeout field to store ++ * the time at which the queue remains with no ++ * backlog and no outstanding request; used by ++ * the weight-raising mechanism. ++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_del_bfqq_busy(bfqd, bfqq, true); ++ } else { ++ bfq_requeue_bfqq(bfqd, bfqq, true); ++ /* ++ * Resort priority tree of potential close cooperators. ++ */ ++ bfq_pos_tree_add_move(bfqd, bfqq); ++ } ++ ++ /* ++ * All in-service entities must have been properly deactivated ++ * or requeued before executing the next function, which ++ * resets all in-service entites as no more in service. ++ */ ++ __bfq_bfqd_reset_in_service(bfqd); ++} ++ ++/** ++ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. ++ * @bfqd: device data. ++ * @bfqq: queue to update. ++ * @reason: reason for expiration. ++ * ++ * Handle the feedback on @bfqq budget at queue expiration. ++ * See the body for detailed comments. ++ */ ++static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ enum bfqq_expiration reason) ++{ ++ struct request *next_rq; ++ int budget, min_budget; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ min_budget = bfq_min_budget(bfqd); ++ ++ if (bfqq->wr_coeff == 1) ++ budget = bfqq->max_budget; ++ else /* ++ * Use a constant, low budget for weight-raised queues, ++ * to help achieve a low latency. Keep it slightly higher ++ * than the minimum possible budget, to cause a little ++ * bit fewer expirations. ++ */ ++ budget = 2 * min_budget; ++ ++ bfq_log_bfqq(bfqd, bfqq, "last budg %d, budg left %d", ++ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); ++ bfq_log_bfqq(bfqd, bfqq, "last max_budg %d, min budg %d", ++ budget, bfq_min_budget(bfqd)); ++ bfq_log_bfqq(bfqd, bfqq, "sync %d, seeky %d", ++ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); ++ ++ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { ++ switch (reason) { ++ /* ++ * Caveat: in all the following cases we trade latency ++ * for throughput. ++ */ ++ case BFQ_BFQQ_TOO_IDLE: ++ /* ++ * This is the only case where we may reduce ++ * the budget: if there is no request of the ++ * process still waiting for completion, then ++ * we assume (tentatively) that the timer has ++ * expired because the batch of requests of ++ * the process could have been served with a ++ * smaller budget. Hence, betting that ++ * process will behave in the same way when it ++ * becomes backlogged again, we reduce its ++ * next budget. As long as we guess right, ++ * this budget cut reduces the latency ++ * experienced by the process. ++ * ++ * However, if there are still outstanding ++ * requests, then the process may have not yet ++ * issued its next request just because it is ++ * still waiting for the completion of some of ++ * the still outstanding ones. So in this ++ * subcase we do not reduce its budget, on the ++ * contrary we increase it to possibly boost ++ * the throughput, as discussed in the ++ * comments to the BUDGET_TIMEOUT case. ++ */ ++ if (bfqq->dispatched > 0) /* still outstanding reqs */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ else { ++ if (budget > 5 * min_budget) ++ budget -= 4 * min_budget; ++ else ++ budget = min_budget; ++ } ++ break; ++ case BFQ_BFQQ_BUDGET_TIMEOUT: ++ /* ++ * We double the budget here because it gives ++ * the chance to boost the throughput if this ++ * is not a seeky process (and has bumped into ++ * this timeout because of, e.g., ZBR). ++ */ ++ budget = min(budget * 2, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_BUDGET_EXHAUSTED: ++ /* ++ * The process still has backlog, and did not ++ * let either the budget timeout or the disk ++ * idling timeout expire. Hence it is not ++ * seeky, has a short thinktime and may be ++ * happy with a higher budget too. So ++ * definitely increase the budget of this good ++ * candidate to boost the disk throughput. ++ */ ++ budget = min(budget * 4, bfqd->bfq_max_budget); ++ break; ++ case BFQ_BFQQ_NO_MORE_REQUESTS: ++ /* ++ * For queues that expire for this reason, it ++ * is particularly important to keep the ++ * budget close to the actual service they ++ * need. Doing so reduces the timestamp ++ * misalignment problem described in the ++ * comments in the body of ++ * __bfq_activate_entity. In fact, suppose ++ * that a queue systematically expires for ++ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a ++ * new request in time to enjoy timestamp ++ * back-shifting. The larger the budget of the ++ * queue is with respect to the service the ++ * queue actually requests in each service ++ * slot, the more times the queue can be ++ * reactivated with the same virtual finish ++ * time. It follows that, even if this finish ++ * time is pushed to the system virtual time ++ * to reduce the consequent timestamp ++ * misalignment, the queue unjustly enjoys for ++ * many re-activations a lower finish time ++ * than all newly activated queues. ++ * ++ * The service needed by bfqq is measured ++ * quite precisely by bfqq->entity.service. ++ * Since bfqq does not enjoy device idling, ++ * bfqq->entity.service is equal to the number ++ * of sectors that the process associated with ++ * bfqq requested to read/write before waiting ++ * for request completions, or blocking for ++ * other reasons. ++ */ ++ budget = max_t(int, bfqq->entity.service, min_budget); ++ break; ++ default: ++ return; ++ } ++ } else if (!bfq_bfqq_sync(bfqq)) ++ /* ++ * Async queues get always the maximum possible ++ * budget, as for them we do not care about latency ++ * (in addition, their ability to dispatch is limited ++ * by the charging factor). ++ */ ++ budget = bfqd->bfq_max_budget; ++ ++ bfqq->max_budget = budget; ++ ++ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && ++ !bfqd->bfq_user_max_budget) ++ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); ++ ++ /* ++ * If there is still backlog, then assign a new budget, making ++ * sure that it is large enough for the next request. Since ++ * the finish time of bfqq must be kept in sync with the ++ * budget, be sure to call __bfq_bfqq_expire() *after* this ++ * update. ++ * ++ * If there is no backlog, then no need to update the budget; ++ * it will be updated on the arrival of a new request. ++ */ ++ next_rq = bfqq->next_rq; ++ if (next_rq) { ++ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || ++ reason == BFQ_BFQQ_NO_MORE_REQUESTS); ++ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, ++ bfq_serv_to_charge(next_rq, bfqq)); ++ BUG_ON(!bfq_bfqq_busy(bfqq)); ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", ++ next_rq ? blk_rq_sectors(next_rq) : 0, ++ bfqq->entity.budget); ++} ++ ++/* ++ * Return true if the process associated with bfqq is "slow". The slow ++ * flag is used, in addition to the budget timeout, to reduce the ++ * amount of service provided to seeky processes, and thus reduce ++ * their chances to lower the throughput. More details in the comments ++ * on the function bfq_bfqq_expire(). ++ * ++ * An important observation is in order: as discussed in the comments ++ * on the function bfq_update_peak_rate(), with devices with internal ++ * queues, it is hard if ever possible to know when and for how long ++ * an I/O request is processed by the device (apart from the trivial ++ * I/O pattern where a new request is dispatched only after the ++ * previous one has been completed). This makes it hard to evaluate ++ * the real rate at which the I/O requests of each bfq_queue are ++ * served. In fact, for an I/O scheduler like BFQ, serving a ++ * bfq_queue means just dispatching its requests during its service ++ * slot (i.e., until the budget of the queue is exhausted, or the ++ * queue remains idle, or, finally, a timeout fires). But, during the ++ * service slot of a bfq_queue, around 100 ms at most, the device may ++ * be even still processing requests of bfq_queues served in previous ++ * service slots. On the opposite end, the requests of the in-service ++ * bfq_queue may be completed after the service slot of the queue ++ * finishes. ++ * ++ * Anyway, unless more sophisticated solutions are used ++ * (where possible), the sum of the sizes of the requests dispatched ++ * during the service slot of a bfq_queue is probably the only ++ * approximation available for the service received by the bfq_queue ++ * during its service slot. And this sum is the quantity used in this ++ * function to evaluate the I/O speed of a process. ++ */ ++static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ bool compensate, enum bfqq_expiration reason, ++ unsigned long *delta_ms) ++{ ++ ktime_t delta_ktime; ++ u32 delta_usecs; ++ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ ++ ++ if (!bfq_bfqq_sync(bfqq)) ++ return false; ++ ++ if (compensate) ++ delta_ktime = bfqd->last_idling_start; ++ else ++ delta_ktime = ktime_get(); ++ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); ++ delta_usecs = ktime_to_us(delta_ktime); ++ ++ /* don't use too short time intervals */ ++ if (delta_usecs < 1000) { ++ if (blk_queue_nonrot(bfqd->queue)) ++ /* ++ * give same worst-case guarantees as idling ++ * for seeky ++ */ ++ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; ++ else /* charge at least one seek */ ++ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; ++ ++ bfq_log(bfqd, "too short %u", delta_usecs); ++ ++ return slow; ++ } ++ ++ *delta_ms = delta_usecs / USEC_PER_MSEC; ++ ++ /* ++ * Use only long (> 20ms) intervals to filter out excessive ++ * spikes in service rate estimation. ++ */ ++ if (delta_usecs > 20000) { ++ /* ++ * Caveat for rotational devices: processes doing I/O ++ * in the slower disk zones tend to be slow(er) even ++ * if not seeky. In this respect, the estimated peak ++ * rate is likely to be an average over the disk ++ * surface. Accordingly, to not be too harsh with ++ * unlucky processes, a process is deemed slow only if ++ * its rate has been lower than half of the estimated ++ * peak rate. ++ */ ++ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; ++ bfq_log(bfqd, "relative rate %d/%d", ++ bfqq->entity.service, bfqd->bfq_max_budget); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "slow %d", slow); ++ ++ return slow; ++} ++ ++/* ++ * To be deemed as soft real-time, an application must meet two ++ * requirements. First, the application must not require an average ++ * bandwidth higher than the approximate bandwidth required to playback or ++ * record a compressed high-definition video. ++ * The next function is invoked on the completion of the last request of a ++ * batch, to compute the next-start time instant, soft_rt_next_start, such ++ * that, if the next request of the application does not arrive before ++ * soft_rt_next_start, then the above requirement on the bandwidth is met. ++ * ++ * The second requirement is that the request pattern of the application is ++ * isochronous, i.e., that, after issuing a request or a batch of requests, ++ * the application stops issuing new requests until all its pending requests ++ * have been completed. After that, the application may issue a new batch, ++ * and so on. ++ * For this reason the next function is invoked to compute ++ * soft_rt_next_start only for applications that meet this requirement, ++ * whereas soft_rt_next_start is set to infinity for applications that do ++ * not. ++ * ++ * Unfortunately, even a greedy (i.e., I/O-bound) application may ++ * happen to meet, occasionally or systematically, both the above ++ * bandwidth and isochrony requirements. This may happen at least in ++ * the following circumstances. First, if the CPU load is high. The ++ * application may stop issuing requests while the CPUs are busy ++ * serving other processes, then restart, then stop again for a while, ++ * and so on. The other circumstances are related to the storage ++ * device: the storage device is highly loaded or reaches a low-enough ++ * throughput with the I/O of the application (e.g., because the I/O ++ * is random and/or the device is slow). In all these cases, the ++ * I/O of the application may be simply slowed down enough to meet ++ * the bandwidth and isochrony requirements. To reduce the probability ++ * that greedy applications are deemed as soft real-time in these ++ * corner cases, a further rule is used in the computation of ++ * soft_rt_next_start: the return value of this function is forced to ++ * be higher than the maximum between the following two quantities. ++ * ++ * (a) Current time plus: (1) the maximum time for which the arrival ++ * of a request is waited for when a sync queue becomes idle, ++ * namely bfqd->bfq_slice_idle, and (2) a few extra jiffies. We ++ * postpone for a moment the reason for adding a few extra ++ * jiffies; we get back to it after next item (b). Lower-bounding ++ * the return value of this function with the current time plus ++ * bfqd->bfq_slice_idle tends to filter out greedy applications, ++ * because the latter issue their next request as soon as possible ++ * after the last one has been completed. In contrast, a soft ++ * real-time application spends some time processing data, after a ++ * batch of its requests has been completed. ++ * ++ * (b) Current value of bfqq->soft_rt_next_start. As pointed out ++ * above, greedy applications may happen to meet both the ++ * bandwidth and isochrony requirements under heavy CPU or ++ * storage-device load. In more detail, in these scenarios, these ++ * applications happen, only for limited time periods, to do I/O ++ * slowly enough to meet all the requirements described so far, ++ * including the filtering in above item (a). These slow-speed ++ * time intervals are usually interspersed between other time ++ * intervals during which these applications do I/O at a very high ++ * speed. Fortunately, exactly because of the high speed of the ++ * I/O in the high-speed intervals, the values returned by this ++ * function happen to be so high, near the end of any such ++ * high-speed interval, to be likely to fall *after* the end of ++ * the low-speed time interval that follows. These high values are ++ * stored in bfqq->soft_rt_next_start after each invocation of ++ * this function. As a consequence, if the last value of ++ * bfqq->soft_rt_next_start is constantly used to lower-bound the ++ * next value that this function may return, then, from the very ++ * beginning of a low-speed interval, bfqq->soft_rt_next_start is ++ * likely to be constantly kept so high that any I/O request ++ * issued during the low-speed interval is considered as arriving ++ * to soon for the application to be deemed as soft ++ * real-time. Then, in the high-speed interval that follows, the ++ * application will not be deemed as soft real-time, just because ++ * it will do I/O at a high speed. And so on. ++ * ++ * Getting back to the filtering in item (a), in the following two ++ * cases this filtering might be easily passed by a greedy ++ * application, if the reference quantity was just ++ * bfqd->bfq_slice_idle: ++ * 1) HZ is so low that the duration of a jiffy is comparable to or ++ * higher than bfqd->bfq_slice_idle. This happens, e.g., on slow ++ * devices with HZ=100. The time granularity may be so coarse ++ * that the approximation, in jiffies, of bfqd->bfq_slice_idle ++ * is rather lower than the exact value. ++ * 2) jiffies, instead of increasing at a constant rate, may stop increasing ++ * for a while, then suddenly 'jump' by several units to recover the lost ++ * increments. This seems to happen, e.g., inside virtual machines. ++ * To address this issue, in the filtering in (a) we do not use as a ++ * reference time interval just bfqd->bfq_slice_idle, but ++ * bfqd->bfq_slice_idle plus a few jiffies. In particular, we add the ++ * minimum number of jiffies for which the filter seems to be quite ++ * precise also in embedded systems and KVM/QEMU virtual machines. ++ */ ++static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqd, bfqq, ++"service_blkg %lu soft_rate %u sects/sec interval %u", ++ bfqq->service_from_backlogged, ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate)); ++ ++ return max3(bfqq->soft_rt_next_start, ++ bfqq->last_idle_bklogged + ++ HZ * bfqq->service_from_backlogged / ++ bfqd->bfq_wr_max_softrt_rate, ++ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); ++} ++ ++static bool bfq_bfqq_injectable(struct bfq_queue *bfqq) ++{ ++ return BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && ++ blk_queue_nonrot(bfqq->bfqd->queue) && ++ bfqq->bfqd->hw_tag; ++} ++ ++/** ++ * bfq_bfqq_expire - expire a queue. ++ * @bfqd: device owning the queue. ++ * @bfqq: the queue to expire. ++ * @compensate: if true, compensate for the time spent idling. ++ * @reason: the reason causing the expiration. ++ * ++ * If the process associated with bfqq does slow I/O (e.g., because it ++ * issues random requests), we charge bfqq with the time it has been ++ * in service instead of the service it has received (see ++ * bfq_bfqq_charge_time for details on how this goal is achieved). As ++ * a consequence, bfqq will typically get higher timestamps upon ++ * reactivation, and hence it will be rescheduled as if it had ++ * received more service than what it has actually received. In the ++ * end, bfqq receives less service in proportion to how slowly its ++ * associated process consumes its budgets (and hence how seriously it ++ * tends to lower the throughput). In addition, this time-charging ++ * strategy guarantees time fairness among slow processes. In ++ * contrast, if the process associated with bfqq is not slow, we ++ * charge bfqq exactly with the service it has received. ++ * ++ * Charging time to the first type of queues and the exact service to ++ * the other has the effect of using the WF2Q+ policy to schedule the ++ * former on a timeslice basis, without violating service domain ++ * guarantees among the latter. ++ */ ++static void bfq_bfqq_expire(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ bool compensate, ++ enum bfqq_expiration reason) ++{ ++ bool slow; ++ unsigned long delta = 0; ++ struct bfq_entity *entity = &bfqq->entity; ++ int ref; ++ ++ BUG_ON(bfqq != bfqd->in_service_queue); ++ ++ /* ++ * Check whether the process is slow (see bfq_bfqq_is_slow). ++ */ ++ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); ++ ++ /* ++ * As above explained, charge slow (typically seeky) and ++ * timed-out queues with the time and not the service ++ * received, to favor sequential workloads. ++ * ++ * Processes doing I/O in the slower disk zones will tend to ++ * be slow(er) even if not seeky. Therefore, since the ++ * estimated peak rate is actually an average over the disk ++ * surface, these processes may timeout just for bad luck. To ++ * avoid punishing them, do not charge time to processes that ++ * succeeded in consuming at least 2/3 of their budget. This ++ * allows BFQ to preserve enough elasticity to still perform ++ * bandwidth, and not time, distribution with little unlucky ++ * or quasi-sequential processes. ++ */ ++ if (bfqq->wr_coeff == 1 && ++ (slow || ++ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && ++ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) ++ bfq_bfqq_charge_time(bfqd, bfqq, delta); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ if (reason == BFQ_BFQQ_TOO_IDLE && ++ entity->service <= 2 * entity->budget / 10) ++ bfq_clear_bfqq_IO_bound(bfqq); ++ ++ if (bfqd->low_latency && bfqq->wr_coeff == 1) ++ bfqq->last_wr_start_finish = jiffies; ++ ++ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list)) { ++ /* ++ * If we get here, and there are no outstanding ++ * requests, then the request pattern is isochronous ++ * (see the comments on the function ++ * bfq_bfqq_softrt_next_start()). Thus we can compute ++ * soft_rt_next_start. And we do it, unless bfqq is in ++ * interactive weight raising. We do not do it in the ++ * latter subcase, for the following reason. bfqq may ++ * be conveying the I/O needed to load a soft ++ * real-time application. Such an application will ++ * actually exhibit a soft real-time I/O pattern after ++ * it finally starts doing its job. But, if ++ * soft_rt_next_start is computed here for an ++ * interactive bfqq, and bfqq had received a lot of ++ * service before remaining with no outstanding ++ * request (likely to happen on a fast device), then ++ * soft_rt_next_start would be assigned such a high ++ * value that, for a very long time, bfqq would be ++ * prevented from being possibly considered as soft ++ * real time. ++ * ++ * If, instead, the queue still has outstanding ++ * requests, then we have to wait for the completion ++ * of all the outstanding requests to discover whether ++ * the request pattern is actually isochronous. ++ */ ++ BUG_ON(bfq_tot_busy_queues(bfqd) < 1); ++ if (bfqq->dispatched == 0 && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) { ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", ++ bfqq->soft_rt_next_start); ++ } else if (bfqq->dispatched > 0) { ++ /* ++ * Schedule an update of soft_rt_next_start to when ++ * the task may be discovered to be isochronous. ++ */ ++ bfq_mark_bfqq_softrt_update(bfqq); ++ } ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "expire (%s, slow %d, num_disp %d, short %d, weight %d, serv %d/%d)", ++ reason_name[reason], slow, bfqq->dispatched, ++ bfq_bfqq_has_short_ttime(bfqq), entity->weight, ++ entity->service, entity->budget); ++ ++ /* ++ * Increase, decrease or leave budget unchanged according to ++ * reason. ++ */ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ ref = bfqq->ref; ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ if (ref == 1) /* bfqq is gone, no more actions on it */ ++ return; ++ ++ BUG_ON(ref > 1 && ++ !bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && ++ !bfq_class_idle(bfqq)); ++ ++ bfqq->injected_service = 0; ++ ++ /* mark bfqq as waiting a request only if a bic still points to it */ ++ if (!bfq_bfqq_busy(bfqq) && ++ reason != BFQ_BFQQ_BUDGET_TIMEOUT && ++ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(bfqq->next_rq); ++ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); ++ /* ++ * Not setting service to 0, because, if the next rq ++ * arrives in time, the queue will go on receiving ++ * service with this same budget (as if it never expired) ++ */ ++ } else { ++ entity->service = 0; ++ bfq_log_bfqq(bfqd, bfqq, "resetting service"); ++ } ++ ++ /* ++ * Reset the received-service counter for every parent entity. ++ * Differently from what happens with bfqq->entity.service, ++ * the resetting of this counter never needs to be postponed ++ * for parent entities. In fact, in case bfqq may have a ++ * chance to go on being served using the last, partially ++ * consumed budget, bfqq->entity.service needs to be kept, ++ * because if bfqq then actually goes on being served using ++ * the same budget, the last value of bfqq->entity.service is ++ * needed to properly decrement bfqq->entity.budget by the ++ * portion already consumed. In contrast, it is not necessary ++ * to keep entity->service for parent entities too, because ++ * the bubble up of the new value of bfqq->entity.budget will ++ * make sure that the budgets of parent entities are correct, ++ * even in case bfqq and thus parent entities go on receiving ++ * service with the same budget. ++ */ ++ entity = entity->parent; ++ for_each_entity(entity) ++ entity->service = 0; ++} ++ ++/* ++ * Budget timeout is not implemented through a dedicated timer, but ++ * just checked on request arrivals and completions, as well as on ++ * idle timer expirations. ++ */ ++static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) ++{ ++ return time_is_before_eq_jiffies(bfqq->budget_timeout); ++} ++ ++/* ++ * If we expire a queue that is actively waiting (i.e., with the ++ * device idled) for the arrival of a new request, then we may incur ++ * the timestamp misalignment problem described in the body of the ++ * function __bfq_activate_entity. Hence we return true only if this ++ * condition does not hold, or if the queue is slow enough to deserve ++ * only to be kicked off for preserving a high throughput. ++ */ ++static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "wait_request %d left %d timeout %d", ++ bfq_bfqq_wait_request(bfqq), ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, ++ bfq_bfqq_budget_timeout(bfqq)); ++ ++ return (!bfq_bfqq_wait_request(bfqq) || ++ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) ++ && ++ bfq_bfqq_budget_timeout(bfqq); ++} ++ ++static bool idling_boosts_thr_without_issues(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool rot_without_queueing = ++ !blk_queue_nonrot(bfqd->queue) && !bfqd->hw_tag, ++ bfqq_sequential_and_IO_bound, ++ idling_boosts_thr; ++ ++ bfqq_sequential_and_IO_bound = !BFQQ_SEEKY(bfqq) && ++ bfq_bfqq_IO_bound(bfqq) && bfq_bfqq_has_short_ttime(bfqq); ++ /* ++ * The next variable takes into account the cases where idling ++ * boosts the throughput. ++ * ++ * The value of the variable is computed considering, first, that ++ * idling is virtually always beneficial for the throughput if: ++ * (a) the device is not NCQ-capable and rotational, or ++ * (b) regardless of the presence of NCQ, the device is rotational and ++ * the request pattern for bfqq is I/O-bound and sequential, or ++ * (c) regardless of whether it is rotational, the device is ++ * not NCQ-capable and the request pattern for bfqq is ++ * I/O-bound and sequential. ++ * ++ * Secondly, and in contrast to the above item (b), idling an ++ * NCQ-capable flash-based device would not boost the ++ * throughput even with sequential I/O; rather it would lower ++ * the throughput in proportion to how fast the device ++ * is. Accordingly, the next variable is true if any of the ++ * above conditions (a), (b) or (c) is true, and, in ++ * particular, happens to be false if bfqd is an NCQ-capable ++ * flash-based device. ++ */ ++ idling_boosts_thr = rot_without_queueing || ++ ((!blk_queue_nonrot(bfqd->queue) || !bfqd->hw_tag) && ++ bfqq_sequential_and_IO_bound); ++ ++ bfq_log_bfqq(bfqd, bfqq, "idling_boosts_thr %d", idling_boosts_thr); ++ ++ /* ++ * The return value of this function is equal to that of ++ * idling_boosts_thr, unless a special case holds. In this ++ * special case, described below, idling may cause problems to ++ * weight-raised queues. ++ * ++ * When the request pool is saturated (e.g., in the presence ++ * of write hogs), if the processes associated with ++ * non-weight-raised queues ask for requests at a lower rate, ++ * then processes associated with weight-raised queues have a ++ * higher probability to get a request from the pool ++ * immediately (or at least soon) when they need one. Thus ++ * they have a higher probability to actually get a fraction ++ * of the device throughput proportional to their high ++ * weight. This is especially true with NCQ-capable drives, ++ * which enqueue several requests in advance, and further ++ * reorder internally-queued requests. ++ * ++ * For this reason, we force to false the return value if ++ * there are weight-raised busy queues. In this case, and if ++ * bfqq is not weight-raised, this guarantees that the device ++ * is not idled for bfqq (if, instead, bfqq is weight-raised, ++ * then idling will be guaranteed by another variable, see ++ * below). Combined with the timestamping rules of BFQ (see ++ * [1] for details), this behavior causes bfqq, and hence any ++ * sync non-weight-raised queue, to get a lower number of ++ * requests served, and thus to ask for a lower number of ++ * requests from the request pool, before the busy ++ * weight-raised queues get served again. This often mitigates ++ * starvation problems in the presence of heavy write ++ * workloads and NCQ, thereby guaranteeing a higher ++ * application and system responsiveness in these hostile ++ * scenarios. ++ */ ++ return idling_boosts_thr && ++ bfqd->wr_busy_queues == 0; ++} ++ ++/* ++ * There is a case where idling must be performed not for ++ * throughput concerns, but to preserve service guarantees. ++ * ++ * To introduce this case, we can note that allowing the drive ++ * to enqueue more than one request at a time, and hence ++ * delegating de facto final scheduling decisions to the ++ * drive's internal scheduler, entails loss of control on the ++ * actual request service order. In particular, the critical ++ * situation is when requests from different processes happen ++ * to be present, at the same time, in the internal queue(s) ++ * of the drive. In such a situation, the drive, by deciding ++ * the service order of the internally-queued requests, does ++ * determine also the actual throughput distribution among ++ * these processes. But the drive typically has no notion or ++ * concern about per-process throughput distribution, and ++ * makes its decisions only on a per-request basis. Therefore, ++ * the service distribution enforced by the drive's internal ++ * scheduler is likely to coincide with the desired ++ * device-throughput distribution only in a completely ++ * symmetric scenario where: ++ * (i) each of these processes must get the same throughput as ++ * the others; ++ * (ii) the I/O of each process has the same properties, in ++ * terms of locality (sequential or random), direction ++ * (reads or writes), request sizes, greediness ++ * (from I/O-bound to sporadic), and so on. ++ * In fact, in such a scenario, the drive tends to treat ++ * the requests of each of these processes in about the same ++ * way as the requests of the others, and thus to provide ++ * each of these processes with about the same throughput ++ * (which is exactly the desired throughput distribution). In ++ * contrast, in any asymmetric scenario, device idling is ++ * certainly needed to guarantee that bfqq receives its ++ * assigned fraction of the device throughput (see [1] for ++ * details). ++ * The problem is that idling may significantly reduce ++ * throughput with certain combinations of types of I/O and ++ * devices. An important example is sync random I/O, on flash ++ * storage with command queueing. So, unless bfqq falls in the ++ * above cases where idling also boosts throughput, it would ++ * be important to check conditions (i) and (ii) accurately, ++ * so as to avoid idling when not strictly needed for service ++ * guarantees. ++ * ++ * Unfortunately, it is extremely difficult to thoroughly ++ * check condition (ii). And, in case there are active groups, ++ * it becomes very difficult to check condition (i) too. In ++ * fact, if there are active groups, then, for condition (i) ++ * to become false, it is enough that an active group contains ++ * more active processes or sub-groups than some other active ++ * group. More precisely, for condition (i) to hold because of ++ * such a group, it is not even necessary that the group is ++ * (still) active: it is sufficient that, even if the group ++ * has become inactive, some of its descendant processes still ++ * have some request already dispatched but still waiting for ++ * completion. In fact, requests have still to be guaranteed ++ * their share of the throughput even after being ++ * dispatched. In this respect, it is easy to show that, if a ++ * group frequently becomes inactive while still having ++ * in-flight requests, and if, when this happens, the group is ++ * not considered in the calculation of whether the scenario ++ * is asymmetric, then the group may fail to be guaranteed its ++ * fair share of the throughput (basically because idling may ++ * not be performed for the descendant processes of the group, ++ * but it had to be). We address this issue with the ++ * following bi-modal behavior, implemented in the function ++ * bfq_symmetric_scenario(). ++ * ++ * If there are groups with requests waiting for completion ++ * (as commented above, some of these groups may even be ++ * already inactive), then the scenario is tagged as ++ * asymmetric, conservatively, without checking any of the ++ * conditions (i) and (ii). So the device is idled for bfqq. ++ * This behavior matches also the fact that groups are created ++ * exactly if controlling I/O is a primary concern (to ++ * preserve bandwidth and latency guarantees). ++ * ++ * On the opposite end, if there are no groups with requests ++ * waiting for completion, then only condition (i) is actually ++ * controlled, i.e., provided that condition (i) holds, idling ++ * is not performed, regardless of whether condition (ii) ++ * holds. In other words, only if condition (i) does not hold, ++ * then idling is allowed, and the device tends to be ++ * prevented from queueing many requests, possibly of several ++ * processes. Since there are no groups with requests waiting ++ * for completion, then, to control condition (i) it is enough ++ * to check just whether all the queues with requests waiting ++ * for completion also have the same weight. ++ * ++ * Not checking condition (ii) evidently exposes bfqq to the ++ * risk of getting less throughput than its fair share. ++ * However, for queues with the same weight, a further ++ * mechanism, preemption, mitigates or even eliminates this ++ * problem. And it does so without consequences on overall ++ * throughput. This mechanism and its benefits are explained ++ * in the next three paragraphs. ++ * ++ * Even if a queue, say Q, is expired when it remains idle, Q ++ * can still preempt the new in-service queue if the next ++ * request of Q arrives soon (see the comments on ++ * bfq_bfqq_update_budg_for_activation). If all queues and ++ * groups have the same weight, this form of preemption, ++ * combined with the hole-recovery heuristic described in the ++ * comments on function bfq_bfqq_update_budg_for_activation, ++ * are enough to preserve a correct bandwidth distribution in ++ * the mid term, even without idling. In fact, even if not ++ * idling allows the internal queues of the device to contain ++ * many requests, and thus to reorder requests, we can rather ++ * safely assume that the internal scheduler still preserves a ++ * minimum of mid-term fairness. ++ * ++ * More precisely, this preemption-based, idleless approach ++ * provides fairness in terms of IOPS, and not sectors per ++ * second. This can be seen with a simple example. Suppose ++ * that there are two queues with the same weight, but that ++ * the first queue receives requests of 8 sectors, while the ++ * second queue receives requests of 1024 sectors. In ++ * addition, suppose that each of the two queues contains at ++ * most one request at a time, which implies that each queue ++ * always remains idle after it is served. Finally, after ++ * remaining idle, each queue receives very quickly a new ++ * request. It follows that the two queues are served ++ * alternatively, preempting each other if needed. This ++ * implies that, although both queues have the same weight, ++ * the queue with large requests receives a service that is ++ * 1024/8 times as high as the service received by the other ++ * queue. ++ * ++ * The motivation for using preemption instead of idling (for ++ * queues with the same weight) is that, by not idling, ++ * service guarantees are preserved (completely or at least in ++ * part) without minimally sacrificing throughput. And, if ++ * there is no active group, then the primary expectation for ++ * this device is probably a high throughput. ++ * ++ * We are now left only with explaining the additional ++ * compound condition that is checked below for deciding ++ * whether the scenario is asymmetric. To explain this ++ * compound condition, we need to add that the function ++ * bfq_symmetric_scenario checks the weights of only ++ * non-weight-raised queues, for efficiency reasons (see ++ * comments on bfq_weights_tree_add()). Then the fact that ++ * bfqq is weight-raised is checked explicitly here. More ++ * precisely, the compound condition below takes into account ++ * also the fact that, even if bfqq is being weight-raised, ++ * the scenario is still symmetric if all queues with requests ++ * waiting for completion happen to be ++ * weight-raised. Actually, we should be even more precise ++ * here, and differentiate between interactive weight raising ++ * and soft real-time weight raising. ++ * ++ * As a side note, it is worth considering that the above ++ * device-idling countermeasures may however fail in the ++ * following unlucky scenario: if idling is (correctly) ++ * disabled in a time period during which all symmetry ++ * sub-conditions hold, and hence the device is allowed to ++ * enqueue many requests, but at some later point in time some ++ * sub-condition stops to hold, then it may become impossible ++ * to let requests be served in the desired order until all ++ * the requests already queued in the device have been served. ++ */ ++static bool idling_needed_for_service_guarantees(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ bool asymmetric_scenario = (bfqq->wr_coeff > 1 && ++ bfqd->wr_busy_queues < ++ bfq_tot_busy_queues(bfqd)) || ++ !bfq_symmetric_scenario(bfqd); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_coeff %d wr_busy %d busy %d asymmetric %d", ++ bfqq->wr_coeff, ++ bfqd->wr_busy_queues, ++ bfq_tot_busy_queues(bfqd), ++ asymmetric_scenario); ++ ++ return asymmetric_scenario; ++} ++ ++/* ++ * For a queue that becomes empty, device idling is allowed only if ++ * this function returns true for that queue. As a consequence, since ++ * device idling plays a critical role for both throughput boosting ++ * and service guarantees, the return value of this function plays a ++ * critical role as well. ++ * ++ * In a nutshell, this function returns true only if idling is ++ * beneficial for throughput or, even if detrimental for throughput, ++ * idling is however necessary to preserve service guarantees (low ++ * latency, desired throughput distribution, ...). In particular, on ++ * NCQ-capable devices, this function tries to return false, so as to ++ * help keep the drives' internal queues full, whenever this helps the ++ * device boost the throughput without causing any service-guarantee ++ * issue. ++ * ++ * Most of the issues taken into account to get the return value of ++ * this function are not trivial. We discuss these issues in the two ++ * functions providing the main pieces of information needed by this ++ * function. ++ */ ++static bool bfq_better_to_idle(struct bfq_queue *bfqq) ++{ ++ struct bfq_data *bfqd = bfqq->bfqd; ++ bool idling_boosts_thr_with_no_issue, idling_needed_for_service_guar; ++ ++ if (unlikely(bfqd->strict_guarantees)) ++ return true; ++ ++ /* ++ * Idling is performed only if slice_idle > 0. In addition, we ++ * do not idle if ++ * (a) bfqq is async ++ * (b) bfqq is in the idle io prio class: in this case we do ++ * not idle because we want to minimize the bandwidth that ++ * queues in this class can steal to higher-priority queues ++ */ ++ if (bfqd->bfq_slice_idle == 0 || !bfq_bfqq_sync(bfqq) || ++ bfq_class_idle(bfqq)) ++ return false; ++ ++ idling_boosts_thr_with_no_issue = ++ idling_boosts_thr_without_issues(bfqd, bfqq); ++ ++ idling_needed_for_service_guar = ++ idling_needed_for_service_guarantees(bfqd, bfqq); ++ ++ /* ++ * We have now the two components we need to compute the ++ * return value of the function, which is true only if idling ++ * either boosts the throughput (without issues), or is ++ * necessary to preserve service guarantees. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, ++ "wr_busy %d boosts %d IO-bound %d guar %d", ++ bfqd->wr_busy_queues, ++ idling_boosts_thr_with_no_issue, ++ bfq_bfqq_IO_bound(bfqq), ++ idling_needed_for_service_guar); ++ ++ return idling_boosts_thr_with_no_issue || ++ idling_needed_for_service_guar; ++} ++ ++/* ++ * If the in-service queue is empty but the function bfq_better_to_idle ++ * returns true, then: ++ * 1) the queue must remain in service and cannot be expired, and ++ * 2) the device must be idled to wait for the possible arrival of a new ++ * request for the queue. ++ * See the comments on the function bfq_better_to_idle for the reasons ++ * why performing device idling is the best choice to boost the throughput ++ * and preserve service guarantees when bfq_better_to_idle itself ++ * returns true. ++ */ ++static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) ++{ ++ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_better_to_idle(bfqq); ++} ++ ++static struct bfq_queue *bfq_choose_bfqq_for_injection(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ ++ /* ++ * A linear search; but, with a high probability, very few ++ * steps are needed to find a candidate queue, i.e., a queue ++ * with enough budget left for its next request. In fact: ++ * - BFQ dynamically updates the budget of every queue so as ++ * to accomodate the expected backlog of the queue; ++ * - if a queue gets all its requests dispatched as injected ++ * service, then the queue is removed from the active list ++ * (and re-added only if it gets new requests, but with ++ * enough budget for its new backlog). ++ */ ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) ++ if (!RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfq_serv_to_charge(bfqq->next_rq, bfqq) <= ++ bfq_bfqq_budget_left(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); ++ return bfqq; ++ } ++ ++ bfq_log(bfqd, "no queue found"); ++ return NULL; ++} ++ ++/* ++ * Select a queue for service. If we have a current queue in service, ++ * check whether to continue servicing it, or retrieve and set a new one. ++ */ ++static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq; ++ struct request *next_rq; ++ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ ++ bfqq = bfqd->in_service_queue; ++ if (!bfqq) ++ goto new_queue; ++ ++ bfq_log_bfqq(bfqd, bfqq, "already in-service queue"); ++ ++ /* ++ * Do not expire bfqq for budget timeout if bfqq may be about ++ * to enjoy device idling. The reason why, in this case, we ++ * prevent bfqq from expiring is the same as in the comments ++ * on the case where bfq_bfqq_must_idle() returns true, in ++ * bfq_completed_request(). ++ */ ++ if (bfq_may_expire_for_budg_timeout(bfqq) && ++ !bfq_bfqq_must_idle(bfqq)) ++ goto expire; ++ ++check_queue: ++ /* ++ * This loop is rarely executed more than once. Even when it ++ * happens, it is much more convenient to re-execute this loop ++ * than to return NULL and trigger a new dispatch to get a ++ * request served. ++ */ ++ next_rq = bfqq->next_rq; ++ /* ++ * If bfqq has requests queued and it has enough budget left to ++ * serve them, keep the queue, otherwise expire it. ++ */ ++ if (next_rq) { ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ ++ if (bfq_serv_to_charge(next_rq, bfqq) > ++ bfq_bfqq_budget_left(bfqq)) { ++ /* ++ * Expire the queue for budget exhaustion, ++ * which makes sure that the next budget is ++ * enough to serve the next request, even if ++ * it comes from the fifo expired path. ++ */ ++ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; ++ goto expire; ++ } else { ++ /* ++ * The idle timer may be pending because we may ++ * not disable disk idling even when a new request ++ * arrives. ++ */ ++ if (bfq_bfqq_wait_request(bfqq)) { ++ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); ++ /* ++ * If we get here: 1) at least a new request ++ * has arrived but we have not disabled the ++ * timer because the request was too small, ++ * 2) then the block layer has unplugged ++ * the device, causing the dispatch to be ++ * invoked. ++ * ++ * Since the device is unplugged, now the ++ * requests are probably large enough to ++ * provide a reasonable throughput. ++ * So we disable idling. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ } ++ goto keep_queue; ++ } ++ } ++ ++ /* ++ * No requests pending. However, if the in-service queue is idling ++ * for a new request, or has requests waiting for a completion and ++ * may idle after their completion, then keep it anyway. ++ * ++ * Yet, to boost throughput, inject service from other queues if ++ * possible. ++ */ ++ if (hrtimer_active(&bfqd->idle_slice_timer) || ++ (bfqq->dispatched != 0 && bfq_better_to_idle(bfqq))) { ++ if (bfq_bfqq_injectable(bfqq) && ++ bfqq->injected_service * bfqq->inject_coeff < ++ bfqq->entity.service * 10) { ++ bfq_log_bfqq(bfqd, bfqq, "looking for queue for injection"); ++ bfqq = bfq_choose_bfqq_for_injection(bfqd); ++ } else { ++ if (BFQQ_SEEKY(bfqq)) ++ bfq_log_bfqq(bfqd, bfqq, ++ "injection saturated %d * %d >= %d * 10", ++ bfqq->injected_service, bfqq->inject_coeff, ++ bfqq->entity.service); ++ bfqq = NULL; ++ } ++ goto keep_queue; ++ } ++ ++ reason = BFQ_BFQQ_NO_MORE_REQUESTS; ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, reason); ++new_queue: ++ bfqq = bfq_set_in_service_queue(bfqd); ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "checking new queue"); ++ goto check_queue; ++ } ++keep_queue: ++ if (bfqq) ++ bfq_log_bfqq(bfqd, bfqq, "returned this queue"); ++ else ++ bfq_log(bfqd, "no queue returned"); ++ ++ return bfqq; ++} ++ ++static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *entity = &bfqq->entity; ++ ++ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ ++ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && ++ time_is_after_jiffies(bfqq->last_wr_start_finish)); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", ++ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time), ++ bfqq->wr_coeff, ++ bfqq->entity.weight, bfqq->entity.orig_weight); ++ ++ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != ++ entity->orig_weight * bfqq->wr_coeff); ++ if (entity->prio_changed) ++ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); ++ ++ /* ++ * If the queue was activated in a burst, or too much ++ * time has elapsed from the beginning of this ++ * weight-raising period, then end weight raising. ++ */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bfq_bfqq_end_wr(bfqq); ++ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + ++ bfqq->wr_cur_max_time)) { ++ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || ++ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + ++ bfq_wr_duration(bfqd))) ++ bfq_bfqq_end_wr(bfqq); ++ else { ++ switch_back_to_interactive_wr(bfqq, bfqd); ++ BUG_ON(time_is_after_jiffies( ++ bfqq->last_wr_start_finish)); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqd, bfqq, ++ "back to interactive wr"); ++ } ++ } ++ if (bfqq->wr_coeff > 1 && ++ bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time && ++ bfqq->service_from_wr > max_service_from_wr) { ++ /* see comments on max_service_from_wr */ ++ bfq_bfqq_end_wr(bfqq); ++ bfq_log_bfqq(bfqd, bfqq, ++ "too much service"); ++ } ++ } ++ /* ++ * To improve latency (for this or other queues), immediately ++ * update weight both if it must be raised and if it must be ++ * lowered. Since, entity may be on some active tree here, and ++ * might have a pending change of its ioprio class, invoke ++ * next function with the last parameter unset (see the ++ * comments on the function). ++ */ ++ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) ++ __bfq_entity_update_weight_prio(bfq_entity_service_tree(entity), ++ entity, false); ++} ++ ++/* ++ * Dispatch one request from bfqq, moving it to the request queue ++ * dispatch list. ++ */ ++static int bfq_dispatch_request(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ struct request *rq = bfqq->next_rq; ++ unsigned long service_to_charge; ++ ++ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); ++ BUG_ON(!rq); ++ service_to_charge = bfq_serv_to_charge(rq, bfqq); ++ ++ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ bfq_bfqq_served(bfqq, service_to_charge); ++ ++ BUG_ON(bfqq->entity.budget < bfqq->entity.service); ++ ++ bfq_dispatch_insert(bfqd->queue, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "dispatched %u sec req (%llu), budg left %d, new disp_nr %d", ++ blk_rq_sectors(rq), ++ (unsigned long long) blk_rq_pos(rq), ++ bfq_bfqq_budget_left(bfqq), ++ bfqq->dispatched); ++ ++ dispatched++; ++ ++ if (bfqq != bfqd->in_service_queue) { ++ if (likely(bfqd->in_service_queue)) { ++ bfqd->in_service_queue->injected_service += ++ bfq_serv_to_charge(rq, bfqq); ++ bfq_log_bfqq(bfqd, bfqd->in_service_queue, ++ "injected_service increased to %d", ++ bfqd->in_service_queue->injected_service); ++ } ++ return dispatched; ++ } ++ ++ /* ++ * If weight raising has to terminate for bfqq, then next ++ * function causes an immediate update of bfqq's weight, ++ * without waiting for next activation. As a consequence, on ++ * expiration, bfqq will be timestamped as if has never been ++ * weight-raised during this service slot, even if it has ++ * received part or even most of the service as a ++ * weight-raised queue. This inflates bfqq's timestamps, which ++ * is beneficial, as bfqq is then more willing to leave the ++ * device immediately to possible other weight-raised queues. ++ */ ++ bfq_update_wr_data(bfqd, bfqq); ++ ++ if (!bfqd->in_service_bic) { ++ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); ++ bfqd->in_service_bic = RQ_BIC(rq); ++ BUG_ON(!bfqd->in_service_bic); ++ } ++ ++ if (bfq_tot_busy_queues(bfqd) > 1 && bfq_class_idle(bfqq)) ++ goto expire; ++ ++ return dispatched; ++ ++expire: ++ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); ++ return dispatched; ++} ++ ++static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) ++{ ++ int dispatched = 0; ++ ++ while (bfqq->next_rq) { ++ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); ++ dispatched++; ++ } ++ ++ BUG_ON(!list_empty(&bfqq->fifo)); ++ return dispatched; ++} ++ ++/* ++ * Drain our current requests. ++ * Used for barriers and when switching io schedulers on-the-fly. ++ */ ++static int bfq_forced_dispatch(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq, *n; ++ struct bfq_service_tree *st; ++ int dispatched = 0; ++ ++ bfqq = bfqd->in_service_queue; ++ if (bfqq) ++ __bfq_bfqq_expire(bfqd, bfqq); ++ ++ /* ++ * Loop through classes, and be careful to leave the scheduler ++ * in a consistent state, as feedback mechanisms and vtime ++ * updates cannot be disabled during the process. ++ */ ++ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { ++ st = bfq_entity_service_tree(&bfqq->entity); ++ ++ dispatched += __bfq_forced_dispatch_bfqq(bfqq); ++ ++ bfqq->max_budget = bfq_max_budget(bfqd); ++ bfq_forget_idle(st); ++ } ++ ++ BUG_ON(bfq_tot_busy_queues(bfqd) != 0); ++ ++ return dispatched; ++} ++ ++static int bfq_dispatch_requests(struct request_queue *q, int force) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq; ++ ++ bfq_log(bfqd, "%d busy queues", bfq_tot_busy_queues(bfqd)); ++ ++ if (bfq_tot_busy_queues(bfqd) == 0) ++ return 0; ++ ++ if (unlikely(force)) ++ return bfq_forced_dispatch(bfqd); ++ ++ /* ++ * Force device to serve one request at a time if ++ * strict_guarantees is true. Forcing this service scheme is ++ * currently the ONLY way to guarantee that the request ++ * service order enforced by the scheduler is respected by a ++ * queueing device. Otherwise the device is free even to make ++ * some unlucky request wait for as long as the device ++ * wishes. ++ * ++ * Of course, serving one request at at time may cause loss of ++ * throughput. ++ */ ++ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) ++ return 0; ++ ++ bfqq = bfq_select_queue(bfqd); ++ if (!bfqq) ++ return 0; ++ ++ BUG_ON(bfqq == bfqd->in_service_queue && ++ bfqq->entity.budget < bfqq->entity.service); ++ ++ BUG_ON(bfqq == bfqd->in_service_queue && ++ bfq_bfqq_wait_request(bfqq)); ++ ++ if (!bfq_dispatch_request(bfqd, bfqq)) ++ return 0; ++ ++ bfq_log_bfqq(bfqd, bfqq, "%s request", ++ bfq_bfqq_sync(bfqq) ? "sync" : "async"); ++ ++ BUG_ON(bfqq->next_rq == NULL && ++ bfqq->entity.budget < bfqq->entity.service); ++ return 1; ++} ++ ++/* ++ * Task holds one reference to the queue, dropped when task exits. Each rq ++ * in-flight on this queue also holds a reference, dropped when rq is freed. ++ * ++ * Queue lock must be held here. Recall not to use bfqq after calling ++ * this function on it. ++ */ ++static void bfq_put_queue(struct bfq_queue *bfqq) ++{ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ struct bfq_group *bfqg = bfqq_group(bfqq); ++#endif ++ ++ BUG_ON(bfqq->ref <= 0); ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p %d", bfqq, bfqq->ref); ++ bfqq->ref--; ++ if (bfqq->ref) ++ return; ++ ++ BUG_ON(rb_first(&bfqq->sort_list)); ++ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); ++ BUG_ON(bfqq->entity.tree); ++ BUG_ON(bfq_bfqq_busy(bfqq)); ++ ++ if (!hlist_unhashed(&bfqq->burst_list_node)) { ++ hlist_del_init(&bfqq->burst_list_node); ++ /* ++ * Decrement also burst size after the removal, if the ++ * process associated with bfqq is exiting, and thus ++ * does not contribute to the burst any longer. This ++ * decrement helps filter out false positives of large ++ * bursts, when some short-lived process (often due to ++ * the execution of commands by some service) happens ++ * to start and exit while a complex application is ++ * starting, and thus spawning several processes that ++ * do I/O (and that *must not* be treated as a large ++ * burst, see comments on bfq_handle_burst). ++ * ++ * In particular, the decrement is performed only if: ++ * 1) bfqq is not a merged queue, because, if it is, ++ * then this free of bfqq is not triggered by the exit ++ * of the process bfqq is associated with, but exactly ++ * by the fact that bfqq has just been merged. ++ * 2) burst_size is greater than 0, to handle ++ * unbalanced decrements. Unbalanced decrements may ++ * happen in te following case: bfqq is inserted into ++ * the current burst list--without incrementing ++ * bust_size--because of a split, but the current ++ * burst list is not the burst list bfqq belonged to ++ * (see comments on the case of a split in ++ * bfq_set_request). ++ */ ++ if (bfqq->bic && bfqq->bfqd->burst_size > 0) ++ bfqq->bfqd->burst_size--; ++ } ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p freed", bfqq); ++ ++ kmem_cache_free(bfq_pool, bfqq); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ bfqg_put(bfqg); ++#endif ++} ++ ++static void bfq_put_cooperator(struct bfq_queue *bfqq) ++{ ++ struct bfq_queue *__bfqq, *next; ++ ++ /* ++ * If this queue was scheduled to merge with another queue, be ++ * sure to drop the reference taken on that queue (and others in ++ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. ++ */ ++ __bfqq = bfqq->new_bfqq; ++ while (__bfqq) { ++ if (__bfqq == bfqq) ++ break; ++ next = __bfqq->new_bfqq; ++ bfq_put_queue(__bfqq); ++ __bfqq = next; ++ } ++} ++ ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) ++{ ++ if (bfqq == bfqd->in_service_queue) { ++ __bfq_bfqq_expire(bfqd, bfqq); ++ bfq_schedule_dispatch(bfqd); ++ } ++ ++ bfq_log_bfqq(bfqd, bfqq, "%p, %d", bfqq, bfqq->ref); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); /* release process reference */ ++} ++ ++static void bfq_init_icq(struct io_cq *icq) ++{ ++ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); ++} ++ ++static void bfq_exit_icq(struct io_cq *icq) ++{ ++ struct bfq_io_cq *bic = icq_to_bic(icq); ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ ++ if (bic_to_bfqq(bic, false)) { ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); ++ bic_set_bfqq(bic, NULL, false); ++ } ++ ++ if (bic_to_bfqq(bic, true)) { ++ /* ++ * If the bic is using a shared queue, put the reference ++ * taken on the io_context when the bic started using a ++ * shared bfq_queue. ++ */ ++ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) ++ put_io_context(icq->ioc); ++ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); ++ bic_set_bfqq(bic, NULL, true); ++ } ++} ++ ++/* ++ * Update the entity prio values; note that the new values will not ++ * be used until the next (re)activation. ++ */ ++static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ struct task_struct *tsk = current; ++ int ioprio_class; ++ ++ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ switch (ioprio_class) { ++ default: ++ dev_err(bfqq->bfqd->queue->backing_dev_info->dev, ++ "bfq: bad prio class %d\n", ioprio_class); ++ case IOPRIO_CLASS_NONE: ++ /* ++ * No prio set, inherit CPU scheduling settings. ++ */ ++ bfqq->new_ioprio = task_nice_ioprio(tsk); ++ bfqq->new_ioprio_class = task_nice_ioclass(tsk); ++ break; ++ case IOPRIO_CLASS_RT: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; ++ break; ++ case IOPRIO_CLASS_BE: ++ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; ++ break; ++ case IOPRIO_CLASS_IDLE: ++ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; ++ bfqq->new_ioprio = 7; ++ break; ++ } ++ ++ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { ++ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", ++ bfqq->new_ioprio); ++ BUG(); ++ } ++ ++ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); ++ bfqq->entity.prio_changed = 1; ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "bic_class %d prio %d class %d", ++ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); ++} ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) ++{ ++ struct bfq_data *bfqd = bic_to_bfqd(bic); ++ struct bfq_queue *bfqq; ++ unsigned long uninitialized_var(flags); ++ int ioprio = bic->icq.ioc->ioprio; ++ ++ /* ++ * This condition may trigger on a newly created bic, be sure to ++ * drop the lock before returning. ++ */ ++ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) ++ return; ++ ++ bic->ioprio = ioprio; ++ ++ bfqq = bic_to_bfqq(bic, false); ++ if (bfqq) { ++ /* release process reference on this queue */ ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); ++ bic_set_bfqq(bic, bfqq, false); ++ bfq_log_bfqq(bfqd, bfqq, ++ "bfqq %p %d", ++ bfqq, bfqq->ref); ++ } ++ ++ bfqq = bic_to_bfqq(bic, true); ++ if (bfqq) ++ bfq_set_next_ioprio_data(bfqq, bic); ++} ++ ++static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic, pid_t pid, int is_sync) ++{ ++ RB_CLEAR_NODE(&bfqq->entity.rb_node); ++ INIT_LIST_HEAD(&bfqq->fifo); ++ INIT_HLIST_NODE(&bfqq->burst_list_node); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bfqq->ref = 0; ++ bfqq->bfqd = bfqd; ++ ++ if (bic) ++ bfq_set_next_ioprio_data(bfqq, bic); ++ ++ if (is_sync) { ++ /* ++ * No need to mark as has_short_ttime if in ++ * idle_class, because no device idling is performed ++ * for queues in idle class ++ */ ++ if (!bfq_class_idle(bfqq)) ++ /* tentatively mark as has_short_ttime */ ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ bfq_mark_bfqq_sync(bfqq); ++ bfq_mark_bfqq_just_created(bfqq); ++ /* ++ * Aggressively inject a lot of service: up to 90%. ++ * This coefficient remains constant during bfqq life, ++ * but this behavior might be changed, after enough ++ * testing and tuning. ++ */ ++ bfqq->inject_coeff = 1; ++ } else ++ bfq_clear_bfqq_sync(bfqq); ++ bfq_mark_bfqq_IO_bound(bfqq); ++ ++ /* Tentative initial value to trade off between thr and lat */ ++ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; ++ bfqq->pid = pid; ++ ++ bfqq->wr_coeff = 1; ++ bfqq->last_wr_start_finish = jiffies; ++ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); ++ bfqq->budget_timeout = bfq_smallest_from_now(); ++ bfqq->split_time = bfq_smallest_from_now(); ++ ++ /* ++ * To not forget the possibly high bandwidth consumed by a ++ * process/queue in the recent past, ++ * bfq_bfqq_softrt_next_start() returns a value at least equal ++ * to the current value of bfqq->soft_rt_next_start (see ++ * comments on bfq_bfqq_softrt_next_start). Set ++ * soft_rt_next_start to now, to mean that bfqq has consumed ++ * no bandwidth so far. ++ */ ++ bfqq->soft_rt_next_start = jiffies; ++ ++ /* first request is almost certainly seeky */ ++ bfqq->seek_history = 1; ++} ++ ++static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, ++ struct bfq_group *bfqg, ++ int ioprio_class, int ioprio) ++{ ++ switch (ioprio_class) { ++ case IOPRIO_CLASS_RT: ++ return &bfqg->async_bfqq[0][ioprio]; ++ case IOPRIO_CLASS_NONE: ++ ioprio = IOPRIO_NORM; ++ /* fall through */ ++ case IOPRIO_CLASS_BE: ++ return &bfqg->async_bfqq[1][ioprio]; ++ case IOPRIO_CLASS_IDLE: ++ return &bfqg->async_idle_bfqq; ++ default: ++ BUG(); ++ } ++} ++ ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic) ++{ ++ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); ++ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); ++ struct bfq_queue **async_bfqq = NULL; ++ struct bfq_queue *bfqq; ++ struct bfq_group *bfqg; ++ ++ rcu_read_lock(); ++ ++ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); ++ if (!bfqg) { ++ bfqq = &bfqd->oom_bfqq; ++ goto out; ++ } ++ ++ if (!is_sync) { ++ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, ++ ioprio); ++ bfqq = *async_bfqq; ++ if (bfqq) ++ goto out; ++ } ++ ++ bfqq = kmem_cache_alloc_node(bfq_pool, ++ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, ++ bfqd->queue->node); ++ ++ if (bfqq) { ++ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, ++ is_sync); ++ bfq_init_entity(&bfqq->entity, bfqg); ++ bfq_log_bfqq(bfqd, bfqq, "allocated"); ++ } else { ++ bfqq = &bfqd->oom_bfqq; ++ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); ++ goto out; ++ } ++ ++ /* ++ * Pin the queue now that it's allocated, scheduler exit will ++ * prune it. ++ */ ++ if (async_bfqq) { ++ bfqq->ref++; /* ++ * Extra group reference, w.r.t. sync ++ * queue. This extra reference is removed ++ * only if bfqq->bfqg disappears, to ++ * guarantee that this queue is not freed ++ * until its group goes away. ++ */ ++ bfq_log_bfqq(bfqd, bfqq, "bfqq not in async: %p, %d", ++ bfqq, bfqq->ref); ++ *async_bfqq = bfqq; ++ } ++ ++out: ++ bfqq->ref++; /* get a process reference to this queue */ ++ bfq_log_bfqq(bfqd, bfqq, "at end: %p, %d", bfqq, bfqq->ref); ++ rcu_read_unlock(); ++ return bfqq; ++} ++ ++static void bfq_update_io_thinktime(struct bfq_data *bfqd, ++ struct bfq_io_cq *bic) ++{ ++ struct bfq_ttime *ttime = &bic->ttime; ++ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; ++ ++ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); ++ ++ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; ++ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); ++ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, ++ ttime->ttime_samples); ++} ++ ++static void ++bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ bfqq->seek_history <<= 1; ++ bfqq->seek_history |= BFQ_RQ_SEEKY(bfqd, bfqq->last_request_pos, rq); ++} ++ ++static void bfq_update_has_short_ttime(struct bfq_data *bfqd, ++ struct bfq_queue *bfqq, ++ struct bfq_io_cq *bic) ++{ ++ bool has_short_ttime = true; ++ ++ /* ++ * No need to update has_short_ttime if bfqq is async or in ++ * idle io prio class, or if bfq_slice_idle is zero, because ++ * no device idling is performed for bfqq in this case. ++ */ ++ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq) || ++ bfqd->bfq_slice_idle == 0) ++ return; ++ ++ /* Idle window just restored, statistics are meaningless. */ ++ if (time_is_after_eq_jiffies(bfqq->split_time + ++ bfqd->bfq_wr_min_idle_time)) ++ return; ++ ++ /* Think time is infinite if no process is linked to ++ * bfqq. Otherwise check average think time to ++ * decide whether to mark as has_short_ttime ++ */ ++ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || ++ (bfq_sample_valid(bic->ttime.ttime_samples) && ++ bic->ttime.ttime_mean > bfqd->bfq_slice_idle)) ++ has_short_ttime = false; ++ ++ bfq_log_bfqq(bfqd, bfqq, "has_short_ttime %d", ++ has_short_ttime); ++ ++ if (has_short_ttime) ++ bfq_mark_bfqq_has_short_ttime(bfqq); ++ else ++ bfq_clear_bfqq_has_short_ttime(bfqq); ++} ++ ++/* ++ * Called when a new fs request (rq) is added to bfqq. Check if there's ++ * something we should do about it. ++ */ ++static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, ++ struct request *rq) ++{ ++ struct bfq_io_cq *bic = RQ_BIC(rq); ++ ++ if (rq->cmd_flags & REQ_META) ++ bfqq->meta_pending++; ++ ++ bfq_update_io_thinktime(bfqd, bic); ++ bfq_update_has_short_ttime(bfqd, bfqq, bic); ++ bfq_update_io_seektime(bfqd, bfqq, rq); ++ ++ bfq_log_bfqq(bfqd, bfqq, ++ "has_short_ttime=%d (seeky %d)", ++ bfq_bfqq_has_short_ttime(bfqq), BFQQ_SEEKY(bfqq)); ++ ++ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { ++ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && ++ blk_rq_sectors(rq) < 32; ++ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); ++ ++ /* ++ * There is just this request queued: if ++ * - the request is small, and ++ * - we are idling to boost throughput, and ++ * - the queue is not to be expired, ++ * then just exit. ++ * ++ * In this way, if the device is being idled to wait ++ * for a new request from the in-service queue, we ++ * avoid unplugging the device and committing the ++ * device to serve just a small request. In contrast ++ * we wait for the block layer to decide when to ++ * unplug the device: hopefully, new requests will be ++ * merged to this one quickly, then the device will be ++ * unplugged and larger requests will be dispatched. ++ */ ++ if (small_req && idling_boosts_thr_without_issues(bfqd, bfqq) && ++ !budget_timeout) ++ return; ++ ++ /* ++ * A large enough request arrived, or idling is being ++ * performed to preserve service guarantees, or ++ * finally the queue is to be expired: in all these ++ * cases disk idling is to be stopped, so clear ++ * wait_request flag and reset timer. ++ */ ++ bfq_clear_bfqq_wait_request(bfqq); ++ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); ++ bfqg_stats_update_idle_time(bfqq_group(bfqq)); ++ ++ /* ++ * The queue is not empty, because a new request just ++ * arrived. Hence we can safely expire the queue, in ++ * case of budget timeout, without risking that the ++ * timestamps of the queue are not updated correctly. ++ * See [1] for more details. ++ */ ++ if (budget_timeout) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ ++ /* ++ * Let the request rip immediately, or let a new queue be ++ * selected if bfqq has just been expired. ++ */ ++ __blk_run_queue(bfqd->queue); ++ } ++} ++ ++static void bfq_insert_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ ++ /* ++ * An unplug may trigger a requeue of a request from the device ++ * driver: make sure we are in process context while trying to ++ * merge two bfq_queues. ++ */ ++ if (!in_interrupt()) { ++ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); ++ if (new_bfqq) { ++ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) ++ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); ++ /* ++ * Release the request's reference to the old bfqq ++ * and make sure one is taken to the shared queue. ++ */ ++ new_bfqq->allocated[rq_data_dir(rq)]++; ++ bfqq->allocated[rq_data_dir(rq)]--; ++ new_bfqq->ref++; ++ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) ++ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), ++ bfqq, new_bfqq); ++ ++ bfq_clear_bfqq_just_created(bfqq); ++ /* ++ * rq is about to be enqueued into new_bfqq, ++ * release rq reference on bfqq ++ */ ++ bfq_put_queue(bfqq); ++ rq->elv.priv[1] = new_bfqq; ++ bfqq = new_bfqq; ++ } ++ } ++ ++ bfq_add_request(rq); ++ ++ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; ++ list_add_tail(&rq->queuelist, &bfqq->fifo); ++ ++ bfq_rq_enqueued(bfqd, bfqq, rq); ++} ++ ++static void bfq_update_hw_tag(struct bfq_data *bfqd) ++{ ++ struct bfq_queue *bfqq = bfqd->in_service_queue; ++ ++ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, ++ bfqd->rq_in_driver); ++ ++ if (bfqd->hw_tag == 1) ++ return; ++ ++ /* ++ * This sample is valid if the number of outstanding requests ++ * is large enough to allow a queueing behavior. Note that the ++ * sum is not exact, as it's not taking into account deactivated ++ * requests. ++ */ ++ if (bfqd->rq_in_driver + bfqd->queued <= BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ /* ++ * If active queue hasn't enough requests and can idle, bfq might not ++ * dispatch sufficient requests to hardware. Don't zero hw_tag in this ++ * case ++ */ ++ if (bfqq && bfq_bfqq_has_short_ttime(bfqq) && ++ bfqq->dispatched + bfqq->queued[0] + bfqq->queued[1] < ++ BFQ_HW_QUEUE_THRESHOLD && bfqd->rq_in_driver < BFQ_HW_QUEUE_THRESHOLD) ++ return; ++ ++ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) ++ return; ++ ++ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; ++ bfqd->max_rq_in_driver = 0; ++ bfqd->hw_tag_samples = 0; ++} ++ ++static void bfq_completed_request(struct request_queue *q, struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ struct bfq_data *bfqd = bfqq->bfqd; ++ u64 now_ns; ++ u32 delta_us; ++ ++ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", ++ blk_rq_sectors(rq)); ++ ++ assert_spin_locked(bfqd->queue->queue_lock); ++ bfq_update_hw_tag(bfqd); ++ ++ BUG_ON(!bfqd->rq_in_driver); ++ BUG_ON(!bfqq->dispatched); ++ bfqd->rq_in_driver--; ++ bfqq->dispatched--; ++ bfqg_stats_update_completion(bfqq_group(bfqq), ++ rq->start_time_ns, ++ rq->io_start_time_ns, ++ rq->cmd_flags); ++ ++ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { ++ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); ++ /* ++ * Set budget_timeout (which we overload to store the ++ * time at which the queue remains with no backlog and ++ * no outstanding request; used by the weight-raising ++ * mechanism). ++ */ ++ bfqq->budget_timeout = jiffies; ++ ++ bfq_weights_tree_remove(bfqd, bfqq); ++ } ++ ++ now_ns = ktime_get_ns(); ++ ++ RQ_BIC(rq)->ttime.last_end_request = now_ns; ++ ++ /* ++ * Using us instead of ns, to get a reasonable precision in ++ * computing rate in next check. ++ */ ++ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); ++ ++ bfq_log(bfqd, "delta %uus/%luus max_size %u rate %llu/%llu", ++ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, ++ delta_us > 0 ? ++ (USEC_PER_SEC* ++ (u64)((bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us)) ++ >>BFQ_RATE_SHIFT : ++ (USEC_PER_SEC* ++ (u64)(bfqd->last_rq_max_size<<BFQ_RATE_SHIFT))>>BFQ_RATE_SHIFT, ++ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); ++ ++ /* ++ * If the request took rather long to complete, and, according ++ * to the maximum request size recorded, this completion latency ++ * implies that the request was certainly served at a very low ++ * rate (less than 1M sectors/sec), then the whole observation ++ * interval that lasts up to this time instant cannot be a ++ * valid time interval for computing a new peak rate. Invoke ++ * bfq_update_rate_reset to have the following three steps ++ * taken: ++ * - close the observation interval at the last (previous) ++ * request dispatch or completion ++ * - compute rate, if possible, for that observation interval ++ * - reset to zero samples, which will trigger a proper ++ * re-initialization of the observation interval on next ++ * dispatch ++ */ ++ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && ++ (bfqd->last_rq_max_size<<BFQ_RATE_SHIFT)/delta_us < ++ 1UL<<(BFQ_RATE_SHIFT - 10)) ++ bfq_update_rate_reset(bfqd, NULL); ++ bfqd->last_completion = now_ns; ++ ++ /* ++ * If we are waiting to discover whether the request pattern ++ * of the task associated with the queue is actually ++ * isochronous, and both requisites for this condition to hold ++ * are now satisfied, then compute soft_rt_next_start (see the ++ * comments on the function bfq_bfqq_softrt_next_start()). We ++ * do not compute soft_rt_next_start if bfqq is in interactive ++ * weight raising (see the comments in bfq_bfqq_expire() for ++ * an explanation). We schedule this delayed update when bfqq ++ * expires, if it still has in-flight requests. ++ */ ++ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && ++ RB_EMPTY_ROOT(&bfqq->sort_list) && ++ bfqq->wr_coeff != bfqd->bfq_wr_coeff) ++ bfqq->soft_rt_next_start = ++ bfq_bfqq_softrt_next_start(bfqd, bfqq); ++ ++ /* ++ * If this is the in-service queue, check if it needs to be expired, ++ * or if we want to idle in case it has no pending requests. ++ */ ++ if (bfqd->in_service_queue == bfqq) { ++ if (bfq_bfqq_must_idle(bfqq)) { ++ if (bfqq->dispatched == 0) ++ bfq_arm_slice_timer(bfqd); ++ /* ++ * If we get here, we do not expire bfqq, even ++ * if bfqq was in budget timeout or had no ++ * more requests (as controlled in the next ++ * conditional instructions). The reason for ++ * not expiring bfqq is as follows. ++ * ++ * Here bfqq->dispatched > 0 holds, but ++ * bfq_bfqq_must_idle() returned true. This ++ * implies that, even if no request arrives ++ * for bfqq before bfqq->dispatched reaches 0, ++ * bfqq will, however, not be expired on the ++ * completion event that causes bfqq->dispatch ++ * to reach zero. In contrast, on this event, ++ * bfqq will start enjoying device idling ++ * (I/O-dispatch plugging). ++ * ++ * But, if we expired bfqq here, bfqq would ++ * not have the chance to enjoy device idling ++ * when bfqq->dispatched finally reaches ++ * zero. This would expose bfqq to violation ++ * of its reserved service guarantees. ++ */ ++ goto out; ++ } else if (bfq_may_expire_for_budg_timeout(bfqq)) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_BUDGET_TIMEOUT); ++ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && ++ (bfqq->dispatched == 0 || ++ !bfq_better_to_idle(bfqq))) ++ bfq_bfqq_expire(bfqd, bfqq, false, ++ BFQ_BFQQ_NO_MORE_REQUESTS); ++ } ++ ++ if (!bfqd->rq_in_driver) ++ bfq_schedule_dispatch(bfqd); ++ ++out: ++ return; ++} ++ ++static int __bfq_may_queue(struct bfq_queue *bfqq) ++{ ++ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { ++ bfq_clear_bfqq_must_alloc(bfqq); ++ return ELV_MQUEUE_MUST; ++ } ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++static int bfq_may_queue(struct request_queue *q, unsigned int op) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct task_struct *tsk = current; ++ struct bfq_io_cq *bic; ++ struct bfq_queue *bfqq; ++ ++ /* ++ * Don't force setup of a queue from here, as a call to may_queue ++ * does not necessarily imply that a request actually will be ++ * queued. So just lookup a possibly existing queue, or return ++ * 'may queue' if that fails. ++ */ ++ bic = bfq_bic_lookup(bfqd, tsk->io_context); ++ if (!bic) ++ return ELV_MQUEUE_MAY; ++ ++ bfqq = bic_to_bfqq(bic, op_is_sync(op)); ++ if (bfqq) ++ return __bfq_may_queue(bfqq); ++ ++ return ELV_MQUEUE_MAY; ++} ++ ++/* ++ * Queue lock held here. ++ */ ++static void bfq_put_request(struct request *rq) ++{ ++ struct bfq_queue *bfqq = RQ_BFQQ(rq); ++ ++ if (bfqq) { ++ const int rw = rq_data_dir(rq); ++ ++ BUG_ON(!bfqq->allocated[rw]); ++ bfqq->allocated[rw]--; ++ ++ rq->elv.priv[0] = NULL; ++ rq->elv.priv[1] = NULL; ++ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "%p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ } ++} ++ ++/* ++ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this ++ * was the last process referring to that bfqq. ++ */ ++static struct bfq_queue * ++bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) ++{ ++ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); ++ ++ put_io_context(bic->icq.ioc); ++ ++ if (bfqq_process_refs(bfqq) == 1) { ++ bfqq->pid = current->pid; ++ bfq_clear_bfqq_coop(bfqq); ++ bfq_clear_bfqq_split_coop(bfqq); ++ return bfqq; ++ } ++ ++ bic_set_bfqq(bic, NULL, 1); ++ ++ bfq_put_cooperator(bfqq); ++ ++ bfq_put_queue(bfqq); ++ return NULL; ++} ++ ++/* ++ * Allocate bfq data structures associated with this request. ++ */ ++static int bfq_set_request(struct request_queue *q, struct request *rq, ++ struct bio *bio, gfp_t gfp_mask) ++{ ++ struct bfq_data *bfqd = q->elevator->elevator_data; ++ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); ++ const int rw = rq_data_dir(rq); ++ const int is_sync = rq_is_sync(rq); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ bool bfqq_already_existing = false, split = false; ++ ++ spin_lock_irqsave(q->queue_lock, flags); ++ ++ if (!bic) ++ goto queue_fail; ++ ++ bfq_check_ioprio_change(bic, bio); ++ ++ bfq_bic_update_cgroup(bic, bio); ++ ++new_queue: ++ bfqq = bic_to_bfqq(bic, is_sync); ++ if (!bfqq || bfqq == &bfqd->oom_bfqq) { ++ if (bfqq) ++ bfq_put_queue(bfqq); ++ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); ++ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); ++ ++ bic_set_bfqq(bic, bfqq, is_sync); ++ if (split && is_sync) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "was_in_list %d " ++ "was_in_large_burst %d " ++ "large burst in progress %d", ++ bic->was_in_burst_list, ++ bic->saved_in_large_burst, ++ bfqd->large_burst); ++ ++ if ((bic->was_in_burst_list && bfqd->large_burst) || ++ bic->saved_in_large_burst) { ++ bfq_log_bfqq(bfqd, bfqq, ++ "marking in " ++ "large burst"); ++ bfq_mark_bfqq_in_large_burst(bfqq); ++ } else { ++ bfq_log_bfqq(bfqd, bfqq, ++ "clearing in " ++ "large burst"); ++ bfq_clear_bfqq_in_large_burst(bfqq); ++ if (bic->was_in_burst_list) ++ /* ++ * If bfqq was in the current ++ * burst list before being ++ * merged, then we have to add ++ * it back. And we do not need ++ * to increase burst_size, as ++ * we did not decrement ++ * burst_size when we removed ++ * bfqq from the burst list as ++ * a consequence of a merge ++ * (see comments in ++ * bfq_put_queue). In this ++ * respect, it would be rather ++ * costly to know whether the ++ * current burst list is still ++ * the same burst list from ++ * which bfqq was removed on ++ * the merge. To avoid this ++ * cost, if bfqq was in a ++ * burst list, then we add ++ * bfqq to the current burst ++ * list without any further ++ * check. This can cause ++ * inappropriate insertions, ++ * but rarely enough to not ++ * harm the detection of large ++ * bursts significantly. ++ */ ++ hlist_add_head(&bfqq->burst_list_node, ++ &bfqd->burst_list); ++ } ++ bfqq->split_time = jiffies; ++ } ++ } else { ++ /* If the queue was seeky for too long, break it apart. */ ++ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { ++ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); ++ ++ /* Update bic before losing reference to bfqq */ ++ if (bfq_bfqq_in_large_burst(bfqq)) ++ bic->saved_in_large_burst = true; ++ ++ bfqq = bfq_split_bfqq(bic, bfqq); ++ split = true; ++ if (!bfqq) ++ goto new_queue; ++ else ++ bfqq_already_existing = true; ++ } ++ } ++ ++ bfqq->allocated[rw]++; ++ bfqq->ref++; ++ bfq_log_bfqq(bfqd, bfqq, "bfqq %p, %d", bfqq, bfqq->ref); ++ ++ rq->elv.priv[0] = bic; ++ rq->elv.priv[1] = bfqq; ++ ++ /* ++ * If a bfq_queue has only one process reference, it is owned ++ * by only one bfq_io_cq: we can set the bic field of the ++ * bfq_queue to the address of that structure. Also, if the ++ * queue has just been split, mark a flag so that the ++ * information is available to the other scheduler hooks. ++ */ ++ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { ++ bfqq->bic = bic; ++ if (split) { ++ /* ++ * If the queue has just been split from a shared ++ * queue, restore the idle window and the possible ++ * weight raising period. ++ */ ++ bfq_bfqq_resume_state(bfqq, bfqd, bic, ++ bfqq_already_existing); ++ } ++ } ++ ++ if (unlikely(bfq_bfqq_just_created(bfqq))) ++ bfq_handle_burst(bfqd, bfqq); ++ ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 0; ++ ++queue_fail: ++ bfq_schedule_dispatch(bfqd); ++ spin_unlock_irqrestore(q->queue_lock, flags); ++ ++ return 1; ++} ++ ++static void bfq_kick_queue(struct work_struct *work) ++{ ++ struct bfq_data *bfqd = ++ container_of(work, struct bfq_data, unplug_work); ++ struct request_queue *q = bfqd->queue; ++ ++ spin_lock_irq(q->queue_lock); ++ __blk_run_queue(q); ++ spin_unlock_irq(q->queue_lock); ++} ++ ++/* ++ * Handler of the expiration of the timer running if the in-service queue ++ * is idling inside its time slice. ++ */ ++static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) ++{ ++ struct bfq_data *bfqd = container_of(timer, struct bfq_data, ++ idle_slice_timer); ++ struct bfq_queue *bfqq; ++ unsigned long flags; ++ enum bfqq_expiration reason; ++ ++ spin_lock_irqsave(bfqd->queue->queue_lock, flags); ++ ++ bfqq = bfqd->in_service_queue; ++ /* ++ * Theoretical race here: the in-service queue can be NULL or ++ * different from the queue that was idling if the timer handler ++ * spins on the queue_lock and a new request arrives for the ++ * current queue and there is a full dispatch cycle that changes ++ * the in-service queue. This can hardly happen, but in the worst ++ * case we just expire a queue too early. ++ */ ++ if (bfqq) { ++ bfq_log_bfqq(bfqd, bfqq, "expired"); ++ bfq_clear_bfqq_wait_request(bfqq); ++ ++ if (bfq_bfqq_budget_timeout(bfqq)) ++ /* ++ * Also here the queue can be safely expired ++ * for budget timeout without wasting ++ * guarantees ++ */ ++ reason = BFQ_BFQQ_BUDGET_TIMEOUT; ++ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) ++ /* ++ * The queue may not be empty upon timer expiration, ++ * because we may not disable the timer when the ++ * first request of the in-service queue arrives ++ * during disk idling. ++ */ ++ reason = BFQ_BFQQ_TOO_IDLE; ++ else ++ goto schedule_dispatch; ++ ++ bfq_bfqq_expire(bfqd, bfqq, true, reason); ++ } ++ ++schedule_dispatch: ++ bfq_schedule_dispatch(bfqd); ++ ++ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); ++ return HRTIMER_NORESTART; ++} ++ ++static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) ++{ ++ hrtimer_cancel(&bfqd->idle_slice_timer); ++ cancel_work_sync(&bfqd->unplug_work); ++} ++ ++static void __bfq_put_async_bfqq(struct bfq_data *bfqd, ++ struct bfq_queue **bfqq_ptr) ++{ ++ struct bfq_group *root_group = bfqd->root_group; ++ struct bfq_queue *bfqq = *bfqq_ptr; ++ ++ bfq_log(bfqd, "%p", bfqq); ++ if (bfqq) { ++ bfq_bfqq_move(bfqd, bfqq, root_group); ++ bfq_log_bfqq(bfqd, bfqq, "putting %p, %d", ++ bfqq, bfqq->ref); ++ bfq_put_queue(bfqq); ++ *bfqq_ptr = NULL; ++ } ++} ++ ++/* ++ * Release all the bfqg references to its async queues. If we are ++ * deallocating the group these queues may still contain requests, so ++ * we reparent them to the root cgroup (i.e., the only one that will ++ * exist for sure until all the requests on a device are gone). ++ */ ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) ++{ ++ int i, j; ++ ++ for (i = 0; i < 2; i++) ++ for (j = 0; j < IOPRIO_BE_NR; j++) ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); ++ ++ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); ++} ++ ++static void bfq_exit_queue(struct elevator_queue *e) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ struct request_queue *q = bfqd->queue; ++ struct bfq_queue *bfqq, *n; ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ spin_lock_irq(q->queue_lock); ++ ++ BUG_ON(bfqd->in_service_queue); ++ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) ++ bfq_deactivate_bfqq(bfqd, bfqq, false, false); ++ ++ spin_unlock_irq(q->queue_lock); ++ ++ bfq_shutdown_timer_wq(bfqd); ++ ++ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ /* release oom-queue reference to root group */ ++ bfqg_put(bfqd->root_group); ++ ++ blkcg_deactivate_policy(q, &blkcg_policy_bfq); ++#else ++ bfq_put_async_queues(bfqd, bfqd->root_group); ++ kfree(bfqd->root_group); ++#endif ++ ++ kfree(bfqd); ++} ++ ++static void bfq_init_root_group(struct bfq_group *root_group, ++ struct bfq_data *bfqd) ++{ ++ int i; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ root_group->entity.parent = NULL; ++ root_group->my_entity = NULL; ++ root_group->bfqd = bfqd; ++#endif ++ root_group->rq_pos_tree = RB_ROOT; ++ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) ++ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; ++ root_group->sched_data.bfq_class_idle_last_service = jiffies; ++} ++ ++static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) ++{ ++ struct bfq_data *bfqd; ++ struct elevator_queue *eq; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) ++ return -ENOMEM; ++ ++ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); ++ if (!bfqd) { ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++ } ++ eq->elevator_data = bfqd; ++ ++ /* ++ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. ++ * Grab a permanent reference to it, so that the normal code flow ++ * will not attempt to free it. ++ */ ++ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); ++ bfqd->oom_bfqq.ref++; ++ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; ++ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; ++ bfqd->oom_bfqq.entity.new_weight = ++ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); ++ ++ /* oom_bfqq does not participate to bursts */ ++ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); ++ /* ++ * Trigger weight initialization, according to ioprio, at the ++ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio ++ * class won't be changed any more. ++ */ ++ bfqd->oom_bfqq.entity.prio_changed = 1; ++ ++ bfqd->queue = q; ++ ++ spin_lock_irq(q->queue_lock); ++ q->elevator = eq; ++ spin_unlock_irq(q->queue_lock); ++ ++ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); ++ if (!bfqd->root_group) ++ goto out_free; ++ bfq_init_root_group(bfqd->root_group, bfqd); ++ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); ++ ++ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, ++ HRTIMER_MODE_REL); ++ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; ++ ++ bfqd->queue_weights_tree = RB_ROOT; ++ bfqd->num_groups_with_pending_reqs = 0; ++ ++ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); ++ ++ INIT_LIST_HEAD(&bfqd->active_list); ++ INIT_LIST_HEAD(&bfqd->idle_list); ++ INIT_HLIST_HEAD(&bfqd->burst_list); ++ ++ bfqd->hw_tag = -1; ++ ++ bfqd->bfq_max_budget = bfq_default_max_budget; ++ ++ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; ++ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; ++ bfqd->bfq_back_max = bfq_back_max; ++ bfqd->bfq_back_penalty = bfq_back_penalty; ++ bfqd->bfq_slice_idle = bfq_slice_idle; ++ bfqd->bfq_timeout = bfq_timeout; ++ ++ bfqd->bfq_requests_within_timer = 120; ++ ++ bfqd->bfq_large_burst_thresh = 8; ++ bfqd->bfq_burst_interval = msecs_to_jiffies(180); ++ ++ bfqd->low_latency = true; ++ ++ /* ++ * Trade-off between responsiveness and fairness. ++ */ ++ bfqd->bfq_wr_coeff = 30; ++ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); ++ bfqd->bfq_wr_max_time = 0; ++ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); ++ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); ++ bfqd->bfq_wr_max_softrt_rate = 7000; /* ++ * Approximate rate required ++ * to playback or record a ++ * high-definition compressed ++ * video. ++ */ ++ bfqd->wr_busy_queues = 0; ++ ++ /* ++ * Begin by assuming, optimistically, that the device peak ++ * rate is equal to 2/3 of the highest reference rate. ++ */ ++ bfqd->rate_dur_prod = ref_rate[blk_queue_nonrot(bfqd->queue)] * ++ ref_wr_duration[blk_queue_nonrot(bfqd->queue)]; ++ bfqd->peak_rate = ref_rate[blk_queue_nonrot(bfqd->queue)] * 2 / 3; ++ ++ return 0; ++ ++out_free: ++ kfree(bfqd); ++ kobject_put(&eq->kobj); ++ return -ENOMEM; ++} ++ ++static void bfq_registered_queue(struct request_queue *q) ++{ ++ wbt_disable_default(q); ++} ++ ++static void bfq_slab_kill(void) ++{ ++ kmem_cache_destroy(bfq_pool); ++} ++ ++static int __init bfq_slab_setup(void) ++{ ++ bfq_pool = KMEM_CACHE(bfq_queue, 0); ++ if (!bfq_pool) ++ return -ENOMEM; ++ return 0; ++} ++ ++static ssize_t bfq_var_show(unsigned int var, char *page) ++{ ++ return sprintf(page, "%u\n", var); ++} ++ ++static ssize_t bfq_var_store(unsigned long *var, const char *page, ++ size_t count) ++{ ++ unsigned long new_val; ++ int ret = kstrtoul(page, 10, &new_val); ++ ++ if (ret == 0) ++ *var = new_val; ++ ++ return count; ++} ++ ++static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ ++ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? ++ jiffies_to_msecs(bfqd->bfq_wr_max_time) : ++ jiffies_to_msecs(bfq_wr_duration(bfqd))); ++} ++ ++static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) ++{ ++ struct bfq_queue *bfqq; ++ struct bfq_data *bfqd = e->elevator_data; ++ ssize_t num_char = 0; ++ ++ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", ++ bfqd->queued); ++ ++ spin_lock_irq(bfqd->queue->queue_lock); ++ ++ num_char += sprintf(page + num_char, "Active:\n"); ++ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, nr_queued %d %d, ", ++ bfqq->pid, ++ bfqq->entity.weight, ++ bfqq->queued[0], ++ bfqq->queued[1]); ++ num_char += sprintf(page + num_char, ++ "dur %d/%u\n", ++ jiffies_to_msecs( ++ jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ num_char += sprintf(page + num_char, "Idle:\n"); ++ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { ++ num_char += sprintf(page + num_char, ++ "pid%d: weight %hu, dur %d/%u\n", ++ bfqq->pid, ++ bfqq->entity.weight, ++ jiffies_to_msecs(jiffies - ++ bfqq->last_wr_start_finish), ++ jiffies_to_msecs(bfqq->wr_cur_max_time)); ++ } ++ ++ spin_unlock_irq(bfqd->queue->queue_lock); ++ ++ return num_char; ++} ++ ++#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ if (__CONV == 1) \ ++ __data = jiffies_to_msecs(__data); \ ++ else if (__CONV == 2) \ ++ __data = div_u64(__data, NSEC_PER_MSEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); ++SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); ++SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); ++SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); ++SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); ++SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); ++SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); ++SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); ++SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); ++SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); ++SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); ++SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); ++SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, ++ 1); ++SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); ++#undef SHOW_FUNCTION ++ ++#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ ++static ssize_t __FUNC(struct elevator_queue *e, char *page) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ u64 __data = __VAR; \ ++ __data = div_u64(__data, NSEC_PER_USEC); \ ++ return bfq_var_show(__data, (page)); \ ++} ++USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); ++#undef USEC_SHOW_FUNCTION ++ ++#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ ++static ssize_t \ ++__FUNC(struct elevator_queue *e, const char *page, size_t count) \ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ if (__CONV == 1) \ ++ *(__PTR) = msecs_to_jiffies(__data); \ ++ else if (__CONV == 2) \ ++ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ ++ else \ ++ *(__PTR) = __data; \ ++ return ret; \ ++} ++STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, ++ INT_MAX, 2); ++STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); ++STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, ++ INT_MAX, 0); ++STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); ++STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); ++STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, ++ 1); ++STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, ++ INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, ++ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); ++STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, ++ INT_MAX, 0); ++#undef STORE_FUNCTION ++ ++#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ ++static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ ++{ \ ++ struct bfq_data *bfqd = e->elevator_data; \ ++ unsigned long uninitialized_var(__data); \ ++ int ret = bfq_var_store(&__data, (page), count); \ ++ if (__data < (MIN)) \ ++ __data = (MIN); \ ++ else if (__data > (MAX)) \ ++ __data = (MAX); \ ++ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ ++ return ret; \ ++} ++USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, ++ UINT_MAX); ++#undef USEC_STORE_FUNCTION ++ ++/* do nothing for the moment */ ++static ssize_t bfq_weights_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ return count; ++} ++ ++static ssize_t bfq_max_budget_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ else { ++ if (__data > INT_MAX) ++ __data = INT_MAX; ++ bfqd->bfq_max_budget = __data; ++ } ++ ++ bfqd->bfq_user_max_budget = __data; ++ ++ return ret; ++} ++ ++/* ++ * Leaving this name to preserve name compatibility with cfq ++ * parameters, but this timeout is used for both sync and async. ++ */ ++static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data < 1) ++ __data = 1; ++ else if (__data > INT_MAX) ++ __data = INT_MAX; ++ ++ bfqd->bfq_timeout = msecs_to_jiffies(__data); ++ if (bfqd->bfq_user_max_budget == 0) ++ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); ++ ++ return ret; ++} ++ ++static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (!bfqd->strict_guarantees && __data == 1 ++ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) ++ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; ++ ++ bfqd->strict_guarantees = __data; ++ ++ return ret; ++} ++ ++static ssize_t bfq_low_latency_store(struct elevator_queue *e, ++ const char *page, size_t count) ++{ ++ struct bfq_data *bfqd = e->elevator_data; ++ unsigned long uninitialized_var(__data); ++ int ret = bfq_var_store(&__data, (page), count); ++ ++ if (__data > 1) ++ __data = 1; ++ if (__data == 0 && bfqd->low_latency != 0) ++ bfq_end_wr(bfqd); ++ bfqd->low_latency = __data; ++ ++ return ret; ++} ++ ++#define BFQ_ATTR(name) \ ++ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) ++ ++static struct elv_fs_entry bfq_attrs[] = { ++ BFQ_ATTR(fifo_expire_sync), ++ BFQ_ATTR(fifo_expire_async), ++ BFQ_ATTR(back_seek_max), ++ BFQ_ATTR(back_seek_penalty), ++ BFQ_ATTR(slice_idle), ++ BFQ_ATTR(slice_idle_us), ++ BFQ_ATTR(max_budget), ++ BFQ_ATTR(timeout_sync), ++ BFQ_ATTR(strict_guarantees), ++ BFQ_ATTR(low_latency), ++ BFQ_ATTR(wr_coeff), ++ BFQ_ATTR(wr_max_time), ++ BFQ_ATTR(wr_rt_max_time), ++ BFQ_ATTR(wr_min_idle_time), ++ BFQ_ATTR(wr_min_inter_arr_async), ++ BFQ_ATTR(wr_max_softrt_rate), ++ BFQ_ATTR(weights), ++ __ATTR_NULL ++}; ++ ++static struct elevator_type iosched_bfq = { ++ .ops.sq = { ++ .elevator_merge_fn = bfq_merge, ++ .elevator_merged_fn = bfq_merged_request, ++ .elevator_merge_req_fn = bfq_merged_requests, ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ .elevator_bio_merged_fn = bfq_bio_merged, ++#endif ++ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, ++ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, ++ .elevator_dispatch_fn = bfq_dispatch_requests, ++ .elevator_add_req_fn = bfq_insert_request, ++ .elevator_activate_req_fn = bfq_activate_request, ++ .elevator_deactivate_req_fn = bfq_deactivate_request, ++ .elevator_completed_req_fn = bfq_completed_request, ++ .elevator_former_req_fn = elv_rb_former_request, ++ .elevator_latter_req_fn = elv_rb_latter_request, ++ .elevator_init_icq_fn = bfq_init_icq, ++ .elevator_exit_icq_fn = bfq_exit_icq, ++ .elevator_set_req_fn = bfq_set_request, ++ .elevator_put_req_fn = bfq_put_request, ++ .elevator_may_queue_fn = bfq_may_queue, ++ .elevator_init_fn = bfq_init_queue, ++ .elevator_exit_fn = bfq_exit_queue, ++ .elevator_registered_fn = bfq_registered_queue, ++ }, ++ .icq_size = sizeof(struct bfq_io_cq), ++ .icq_align = __alignof__(struct bfq_io_cq), ++ .elevator_attrs = bfq_attrs, ++ .elevator_name = "bfq-sq", ++ .elevator_owner = THIS_MODULE, ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct blkcg_policy blkcg_policy_bfq = { ++ .dfl_cftypes = bfq_blkg_files, ++ .legacy_cftypes = bfq_blkcg_legacy_files, ++ ++ .cpd_alloc_fn = bfq_cpd_alloc, ++ .cpd_init_fn = bfq_cpd_init, ++ .cpd_bind_fn = bfq_cpd_init, ++ .cpd_free_fn = bfq_cpd_free, ++ ++ .pd_alloc_fn = bfq_pd_alloc, ++ .pd_init_fn = bfq_pd_init, ++ .pd_offline_fn = bfq_pd_offline, ++ .pd_free_fn = bfq_pd_free, ++ .pd_reset_stats_fn = bfq_pd_reset_stats, ++}; ++#endif ++ ++static int __init bfq_init(void) ++{ ++ int ret; ++ char msg[60] = "BFQ I/O-scheduler: v9"; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ret = blkcg_policy_register(&blkcg_policy_bfq); ++ if (ret) ++ return ret; ++#endif ++ ++ ret = -ENOMEM; ++ if (bfq_slab_setup()) ++ goto err_pol_unreg; ++ ++ /* ++ * Times to load large popular applications for the typical ++ * systems installed on the reference devices (see the ++ * comments before the definition of the next ++ * array). Actually, we use slightly lower values, as the ++ * estimated peak rate tends to be smaller than the actual ++ * peak rate. The reason for this last fact is that estimates ++ * are computed over much shorter time intervals than the long ++ * intervals typically used for benchmarking. Why? First, to ++ * adapt more quickly to variations. Second, because an I/O ++ * scheduler cannot rely on a peak-rate-evaluation workload to ++ * be run for a long time. ++ */ ++ ref_wr_duration[0] = msecs_to_jiffies(7000); /* actually 8 sec */ ++ ref_wr_duration[1] = msecs_to_jiffies(2500); /* actually 3 sec */ ++ ++ ret = elv_register(&iosched_bfq); ++ if (ret) ++ goto slab_kill; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ strcat(msg, " (with cgroups support)"); ++#endif ++ pr_info("%s", msg); ++ ++ return 0; ++ ++slab_kill: ++ bfq_slab_kill(); ++err_pol_unreg: ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ return ret; ++} ++ ++static void __exit bfq_exit(void) ++{ ++ elv_unregister(&iosched_bfq); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ blkcg_policy_unregister(&blkcg_policy_bfq); ++#endif ++ bfq_slab_kill(); ++} ++ ++module_init(bfq_init); ++module_exit(bfq_exit); ++ ++MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); ++MODULE_LICENSE("GPL"); +diff --git a/block/bfq.h b/block/bfq.h +new file mode 100644 +index 000000000000..0177fc7205d7 +--- /dev/null ++++ b/block/bfq.h +@@ -0,0 +1,1074 @@ ++/* ++ * BFQ v9: data structures and common functions prototypes. ++ * ++ * Based on ideas and code from CFQ: ++ * Copyright (C) 2003 Jens Axboe <axboe@kernel.dk> ++ * ++ * Copyright (C) 2008 Fabio Checconi <fabio@gandalf.sssup.it> ++ * Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2015 Paolo Valente <paolo.valente@unimore.it> ++ * ++ * Copyright (C) 2017 Paolo Valente <paolo.valente@linaro.org> ++ */ ++ ++#ifndef _BFQ_H ++#define _BFQ_H ++ ++#include <linux/hrtimer.h> ++#include <linux/blk-cgroup.h> ++ ++/* ++ * Define an alternative macro to compile cgroups support. This is one ++ * of the steps needed to let bfq-mq share the files bfq-sched.c and ++ * bfq-cgroup.c with bfq-sq. For bfq-mq, the macro ++ * BFQ_GROUP_IOSCHED_ENABLED will be defined as a function of whether ++ * the configuration option CONFIG_BFQ_MQ_GROUP_IOSCHED, and not ++ * CONFIG_BFQ_GROUP_IOSCHED, is defined. ++ */ ++#ifdef CONFIG_BFQ_SQ_GROUP_IOSCHED ++#define BFQ_GROUP_IOSCHED_ENABLED ++#endif ++ ++#define BFQ_IOPRIO_CLASSES 3 ++#define BFQ_CL_IDLE_TIMEOUT (HZ/5) ++ ++#define BFQ_MIN_WEIGHT 1 ++#define BFQ_MAX_WEIGHT 1000 ++#define BFQ_WEIGHT_CONVERSION_COEFF 10 ++ ++#define BFQ_DEFAULT_QUEUE_IOPRIO 4 ++ ++#define BFQ_WEIGHT_LEGACY_DFL 100 ++#define BFQ_DEFAULT_GRP_IOPRIO 0 ++#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE ++ ++/* ++ * Soft real-time applications are extremely more latency sensitive ++ * than interactive ones. Over-raise the weight of the former to ++ * privilege them against the latter. ++ */ ++#define BFQ_SOFTRT_WEIGHT_FACTOR 100 ++ ++struct bfq_entity; ++ ++/** ++ * struct bfq_service_tree - per ioprio_class service tree. ++ * ++ * Each service tree represents a B-WF2Q+ scheduler on its own. Each ++ * ioprio_class has its own independent scheduler, and so its own ++ * bfq_service_tree. All the fields are protected by the queue lock ++ * of the containing bfqd. ++ */ ++struct bfq_service_tree { ++ /* tree for active entities (i.e., those backlogged) */ ++ struct rb_root active; ++ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ ++ struct rb_root idle; ++ ++ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ ++ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ ++ ++ u64 vtime; /* scheduler virtual time */ ++ /* scheduler weight sum; active and idle entities contribute to it */ ++ unsigned long wsum; ++}; ++ ++/** ++ * struct bfq_sched_data - multi-class scheduler. ++ * ++ * bfq_sched_data is the basic scheduler queue. It supports three ++ * ioprio_classes, and can be used either as a toplevel queue or as an ++ * intermediate queue in a hierarchical setup. ++ * ++ * The supported ioprio_classes are the same as in CFQ, in descending ++ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. ++ * Requests from higher priority queues are served before all the ++ * requests from lower priority queues; among requests of the same ++ * queue requests are served according to B-WF2Q+. ++ * ++ * The schedule is implemented by the service trees, plus the field ++ * @next_in_service, which points to the entity on the active trees ++ * that will be served next, if 1) no changes in the schedule occurs ++ * before the current in-service entity is expired, 2) the in-service ++ * queue becomes idle when it expires, and 3) if the entity pointed by ++ * in_service_entity is not a queue, then the in-service child entity ++ * of the entity pointed by in_service_entity becomes idle on ++ * expiration. This peculiar definition allows for the following ++ * optimization, not yet exploited: while a given entity is still in ++ * service, we already know which is the best candidate for next ++ * service among the other active entitities in the same parent ++ * entity. We can then quickly compare the timestamps of the ++ * in-service entity with those of such best candidate. ++ * ++ * All the fields are protected by the queue lock of the containing ++ * bfqd. ++ */ ++struct bfq_sched_data { ++ struct bfq_entity *in_service_entity; /* entity in service */ ++ /* head-of-the-line entity in the scheduler (see comments above) */ ++ struct bfq_entity *next_in_service; ++ /* array of service trees, one per ioprio_class */ ++ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; ++ /* last time CLASS_IDLE was served */ ++ unsigned long bfq_class_idle_last_service; ++ ++}; ++ ++/** ++ * struct bfq_weight_counter - counter of the number of all active queues ++ * with a given weight. ++ */ ++struct bfq_weight_counter { ++ unsigned int weight; /* weight of the queues this counter refers to */ ++ unsigned int num_active; /* nr of active queues with this weight */ ++ /* ++ * Weights tree member (see bfq_data's @queue_weights_tree) ++ */ ++ struct rb_node weights_node; ++}; ++ ++/** ++ * struct bfq_entity - schedulable entity. ++ * ++ * A bfq_entity is used to represent either a bfq_queue (leaf node in the ++ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each ++ * entity belongs to the sched_data of the parent group in the cgroup ++ * hierarchy. Non-leaf entities have also their own sched_data, stored ++ * in @my_sched_data. ++ * ++ * Each entity stores independently its priority values; this would ++ * allow different weights on different devices, but this ++ * functionality is not exported to userspace by now. Priorities and ++ * weights are updated lazily, first storing the new values into the ++ * new_* fields, then setting the @prio_changed flag. As soon as ++ * there is a transition in the entity state that allows the priority ++ * update to take place the effective and the requested priority ++ * values are synchronized. ++ * ++ * Unless cgroups are used, the weight value is calculated from the ++ * ioprio to export the same interface as CFQ. When dealing with ++ * ``well-behaved'' queues (i.e., queues that do not spend too much ++ * time to consume their budget and have true sequential behavior, and ++ * when there are no external factors breaking anticipation) the ++ * relative weights at each level of the cgroups hierarchy should be ++ * guaranteed. All the fields are protected by the queue lock of the ++ * containing bfqd. ++ */ ++struct bfq_entity { ++ struct rb_node rb_node; /* service_tree member */ ++ ++ /* ++ * Flag, true if the entity is on a tree (either the active or ++ * the idle one of its service_tree) or is in service. ++ */ ++ bool on_st; ++ ++ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ ++ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ ++ ++ /* tree the entity is enqueued into; %NULL if not on a tree */ ++ struct rb_root *tree; ++ ++ /* ++ * minimum start time of the (active) subtree rooted at this ++ * entity; used for O(log N) lookups into active trees ++ */ ++ u64 min_start; ++ ++ /* amount of service received during the last service slot */ ++ int service; ++ ++ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ ++ int budget; ++ ++ unsigned int weight; /* weight of the queue */ ++ unsigned int new_weight; /* next weight if a change is in progress */ ++ ++ /* original weight, used to implement weight boosting */ ++ unsigned int orig_weight; ++ ++ /* parent entity, for hierarchical scheduling */ ++ struct bfq_entity *parent; ++ ++ /* ++ * For non-leaf nodes in the hierarchy, the associated ++ * scheduler queue, %NULL on leaf nodes. ++ */ ++ struct bfq_sched_data *my_sched_data; ++ /* the scheduler queue this entity belongs to */ ++ struct bfq_sched_data *sched_data; ++ ++ /* flag, set to request a weight, ioprio or ioprio_class change */ ++ int prio_changed; ++ ++ /* flag, set if the entity is counted in groups_with_pending_reqs */ ++ bool in_groups_with_pending_reqs; ++}; ++ ++struct bfq_group; ++ ++/** ++ * struct bfq_queue - leaf schedulable entity. ++ * ++ * A bfq_queue is a leaf request queue; it can be associated with an ++ * io_context or more, if it is async or shared between cooperating ++ * processes. @cgroup holds a reference to the cgroup, to be sure that it ++ * does not disappear while a bfqq still references it (mostly to avoid ++ * races between request issuing and task migration followed by cgroup ++ * destruction). ++ * All the fields are protected by the queue lock of the containing bfqd. ++ */ ++struct bfq_queue { ++ /* reference counter */ ++ int ref; ++ /* parent bfq_data */ ++ struct bfq_data *bfqd; ++ ++ /* current ioprio and ioprio class */ ++ unsigned short ioprio, ioprio_class; ++ /* next ioprio and ioprio class if a change is in progress */ ++ unsigned short new_ioprio, new_ioprio_class; ++ ++ /* ++ * Shared bfq_queue if queue is cooperating with one or more ++ * other queues. ++ */ ++ struct bfq_queue *new_bfqq; ++ /* request-position tree member (see bfq_group's @rq_pos_tree) */ ++ struct rb_node pos_node; ++ /* request-position tree root (see bfq_group's @rq_pos_tree) */ ++ struct rb_root *pos_root; ++ ++ /* sorted list of pending requests */ ++ struct rb_root sort_list; ++ /* if fifo isn't expired, next request to serve */ ++ struct request *next_rq; ++ /* number of sync and async requests queued */ ++ int queued[2]; ++ /* number of sync and async requests currently allocated */ ++ int allocated[2]; ++ /* number of pending metadata requests */ ++ int meta_pending; ++ /* fifo list of requests in sort_list */ ++ struct list_head fifo; ++ ++ /* entity representing this queue in the scheduler */ ++ struct bfq_entity entity; ++ ++ /* pointer to the weight counter associated with this queue */ ++ struct bfq_weight_counter *weight_counter; ++ ++ /* maximum budget allowed from the feedback mechanism */ ++ int max_budget; ++ /* budget expiration (in jiffies) */ ++ unsigned long budget_timeout; ++ ++ /* number of requests on the dispatch list or inside driver */ ++ int dispatched; ++ ++ unsigned int flags; /* status flags.*/ ++ ++ /* node for active/idle bfqq list inside parent bfqd */ ++ struct list_head bfqq_list; ++ ++ /* bit vector: a 1 for each seeky requests in history */ ++ u32 seek_history; ++ ++ /* node for the device's burst list */ ++ struct hlist_node burst_list_node; ++ ++ /* position of the last request enqueued */ ++ sector_t last_request_pos; ++ ++ /* Number of consecutive pairs of request completion and ++ * arrival, such that the queue becomes idle after the ++ * completion, but the next request arrives within an idle ++ * time slice; used only if the queue's IO_bound flag has been ++ * cleared. ++ */ ++ unsigned int requests_within_timer; ++ ++ /* pid of the process owning the queue, used for logging purposes */ ++ pid_t pid; ++ ++ /* ++ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL ++ * if the queue is shared. ++ */ ++ struct bfq_io_cq *bic; ++ ++ /* current maximum weight-raising time for this queue */ ++ unsigned long wr_cur_max_time; ++ /* ++ * Minimum time instant such that, only if a new request is ++ * enqueued after this time instant in an idle @bfq_queue with ++ * no outstanding requests, then the task associated with the ++ * queue it is deemed as soft real-time (see the comments on ++ * the function bfq_bfqq_softrt_next_start()) ++ */ ++ unsigned long soft_rt_next_start; ++ /* ++ * Start time of the current weight-raising period if ++ * the @bfq-queue is being weight-raised, otherwise ++ * finish time of the last weight-raising period. ++ */ ++ unsigned long last_wr_start_finish; ++ /* factor by which the weight of this queue is multiplied */ ++ unsigned int wr_coeff; ++ /* ++ * Time of the last transition of the @bfq_queue from idle to ++ * backlogged. ++ */ ++ unsigned long last_idle_bklogged; ++ /* ++ * Cumulative service received from the @bfq_queue since the ++ * last transition from idle to backlogged. ++ */ ++ unsigned long service_from_backlogged; ++ /* ++ * Cumulative service received from the @bfq_queue since its ++ * last transition to weight-raised state. ++ */ ++ unsigned long service_from_wr; ++ /* ++ * Value of wr start time when switching to soft rt ++ */ ++ unsigned long wr_start_at_switch_to_srt; ++ ++ unsigned long split_time; /* time of last split */ ++ ++ unsigned long first_IO_time; /* time of first I/O for this queue */ ++ ++ /* max service rate measured so far */ ++ u32 max_service_rate; ++ /* ++ * Ratio between the service received by bfqq while it is in ++ * service, and the cumulative service (of requests of other ++ * queues) that may be injected while bfqq is empty but still ++ * in service. To increase precision, the coefficient is ++ * measured in tenths of unit. Here are some example of (1) ++ * ratios, (2) resulting percentages of service injected ++ * w.r.t. to the total service dispatched while bfqq is in ++ * service, and (3) corresponding values of the coefficient: ++ * 1 (50%) -> 10 ++ * 2 (33%) -> 20 ++ * 10 (9%) -> 100 ++ * 9.9 (9%) -> 99 ++ * 1.5 (40%) -> 15 ++ * 0.5 (66%) -> 5 ++ * 0.1 (90%) -> 1 ++ * ++ * So, if the coefficient is lower than 10, then ++ * injected service is more than bfqq service. ++ */ ++ unsigned int inject_coeff; ++ /* amount of service injected in current service slot */ ++ unsigned int injected_service; ++}; ++ ++/** ++ * struct bfq_ttime - per process thinktime stats. ++ */ ++struct bfq_ttime { ++ u64 last_end_request; /* completion time of last request */ ++ ++ u64 ttime_total; /* total process thinktime */ ++ unsigned long ttime_samples; /* number of thinktime samples */ ++ u64 ttime_mean; /* average process thinktime */ ++ ++}; ++ ++/** ++ * struct bfq_io_cq - per (request_queue, io_context) structure. ++ */ ++struct bfq_io_cq { ++ /* associated io_cq structure */ ++ struct io_cq icq; /* must be the first member */ ++ /* array of two process queues, the sync and the async */ ++ struct bfq_queue *bfqq[2]; ++ /* associated @bfq_ttime struct */ ++ struct bfq_ttime ttime; ++ /* per (request_queue, blkcg) ioprio */ ++ int ioprio; ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ uint64_t blkcg_serial_nr; /* the current blkcg serial */ ++#endif ++ ++ /* ++ * Snapshot of the has_short_time flag before merging; taken ++ * to remember its value while the queue is merged, so as to ++ * be able to restore it in case of split. ++ */ ++ bool saved_has_short_ttime; ++ /* ++ * Same purpose as the previous two fields for the I/O bound ++ * classification of a queue. ++ */ ++ bool saved_IO_bound; ++ ++ /* ++ * Same purpose as the previous fields for the value of the ++ * field keeping the queue's belonging to a large burst ++ */ ++ bool saved_in_large_burst; ++ /* ++ * True if the queue belonged to a burst list before its merge ++ * with another cooperating queue. ++ */ ++ bool was_in_burst_list; ++ ++ /* ++ * Similar to previous fields: save wr information. ++ */ ++ unsigned long saved_wr_coeff; ++ unsigned long saved_last_wr_start_finish; ++ unsigned long saved_wr_start_at_switch_to_srt; ++ unsigned int saved_wr_cur_max_time; ++}; ++ ++/** ++ * struct bfq_data - per-device data structure. ++ * ++ * All the fields are protected by the @queue lock. ++ */ ++struct bfq_data { ++ /* request queue for the device */ ++ struct request_queue *queue; ++ ++ /* root bfq_group for the device */ ++ struct bfq_group *root_group; ++ ++ /* ++ * rbtree of weight counters of @bfq_queues, sorted by ++ * weight. Used to keep track of whether all @bfq_queues have ++ * the same weight. The tree contains one counter for each ++ * distinct weight associated to some active and not ++ * weight-raised @bfq_queue (see the comments to the functions ++ * bfq_weights_tree_[add|remove] for further details). ++ */ ++ struct rb_root queue_weights_tree; ++ ++ /* ++ * Number of groups with at least one descendant process that ++ * has at least one request waiting for completion. Note that ++ * this accounts for also requests already dispatched, but not ++ * yet completed. Therefore this number of groups may differ ++ * (be larger) than the number of active groups, as a group is ++ * considered active only if its corresponding entity has ++ * descendant queues with at least one request queued. This ++ * number is used to decide whether a scenario is symmetric. ++ * For a detailed explanation see comments on the computation ++ * of the variable asymmetric_scenario in the function ++ * bfq_better_to_idle(). ++ * ++ * However, it is hard to compute this number exactly, for ++ * groups with multiple descendant processes. Consider a group ++ * that is inactive, i.e., that has no descendant process with ++ * pending I/O inside BFQ queues. Then suppose that ++ * num_groups_with_pending_reqs is still accounting for this ++ * group, because the group has descendant processes with some ++ * I/O request still in flight. num_groups_with_pending_reqs ++ * should be decremented when the in-flight request of the ++ * last descendant process is finally completed (assuming that ++ * nothing else has changed for the group in the meantime, in ++ * terms of composition of the group and active/inactive state of child ++ * groups and processes). To accomplish this, an additional ++ * pending-request counter must be added to entities, and must ++ * be updated correctly. To avoid this additional field and operations, ++ * we resort to the following tradeoff between simplicity and ++ * accuracy: for an inactive group that is still counted in ++ * num_groups_with_pending_reqs, we decrement ++ * num_groups_with_pending_reqs when the first descendant ++ * process of the group remains with no request waiting for ++ * completion. ++ * ++ * Even this simpler decrement strategy requires a little ++ * carefulness: to avoid multiple decrements, we flag a group, ++ * more precisely an entity representing a group, as still ++ * counted in num_groups_with_pending_reqs when it becomes ++ * inactive. Then, when the first descendant queue of the ++ * entity remains with no request waiting for completion, ++ * num_groups_with_pending_reqs is decremented, and this flag ++ * is reset. After this flag is reset for the entity, ++ * num_groups_with_pending_reqs won't be decremented any ++ * longer in case a new descendant queue of the entity remains ++ * with no request waiting for completion. ++ */ ++ unsigned int num_groups_with_pending_reqs; ++ ++ /* ++ * Per-class (RT, BE, IDLE) number of bfq_queues containing ++ * requests (including the queue in service, even if it is ++ * idling). ++ */ ++ unsigned int busy_queues[3]; ++ /* number of weight-raised busy @bfq_queues */ ++ int wr_busy_queues; ++ /* number of queued requests */ ++ int queued; ++ /* number of requests dispatched and waiting for completion */ ++ int rq_in_driver; ++ ++ /* ++ * Maximum number of requests in driver in the last ++ * @hw_tag_samples completed requests. ++ */ ++ int max_rq_in_driver; ++ /* number of samples used to calculate hw_tag */ ++ int hw_tag_samples; ++ /* flag set to one if the driver is showing a queueing behavior */ ++ int hw_tag; ++ ++ /* number of budgets assigned */ ++ int budgets_assigned; ++ ++ /* ++ * Timer set when idling (waiting) for the next request from ++ * the queue in service. ++ */ ++ struct hrtimer idle_slice_timer; ++ /* delayed work to restart dispatching on the request queue */ ++ struct work_struct unplug_work; ++ ++ /* bfq_queue in service */ ++ struct bfq_queue *in_service_queue; ++ /* bfq_io_cq (bic) associated with the @in_service_queue */ ++ struct bfq_io_cq *in_service_bic; ++ ++ /* on-disk position of the last served request */ ++ sector_t last_position; ++ ++ /* position of the last served request for the in-service queue */ ++ sector_t in_serv_last_pos; ++ ++ /* time of last request completion (ns) */ ++ u64 last_completion; ++ ++ /* time of first rq dispatch in current observation interval (ns) */ ++ u64 first_dispatch; ++ /* time of last rq dispatch in current observation interval (ns) */ ++ u64 last_dispatch; ++ ++ /* beginning of the last budget */ ++ ktime_t last_budget_start; ++ /* beginning of the last idle slice */ ++ ktime_t last_idling_start; ++ ++ /* number of samples in current observation interval */ ++ int peak_rate_samples; ++ /* num of samples of seq dispatches in current observation interval */ ++ u32 sequential_samples; ++ /* total num of sectors transferred in current observation interval */ ++ u64 tot_sectors_dispatched; ++ /* max rq size seen during current observation interval (sectors) */ ++ u32 last_rq_max_size; ++ /* time elapsed from first dispatch in current observ. interval (us) */ ++ u64 delta_from_first; ++ /* ++ * Current estimate of the device peak rate, measured in ++ * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by ++ * BFQ_RATE_SHIFT is performed to increase precision in ++ * fixed-point calculations. ++ */ ++ u32 peak_rate; ++ ++ /* maximum budget allotted to a bfq_queue before rescheduling */ ++ int bfq_max_budget; ++ ++ /* list of all the bfq_queues active on the device */ ++ struct list_head active_list; ++ /* list of all the bfq_queues idle on the device */ ++ struct list_head idle_list; ++ ++ /* ++ * Timeout for async/sync requests; when it fires, requests ++ * are served in fifo order. ++ */ ++ u64 bfq_fifo_expire[2]; ++ /* weight of backward seeks wrt forward ones */ ++ unsigned int bfq_back_penalty; ++ /* maximum allowed backward seek */ ++ unsigned int bfq_back_max; ++ /* maximum idling time */ ++ u32 bfq_slice_idle; ++ ++ /* user-configured max budget value (0 for auto-tuning) */ ++ int bfq_user_max_budget; ++ /* ++ * Timeout for bfq_queues to consume their budget; used to ++ * prevent seeky queues from imposing long latencies to ++ * sequential or quasi-sequential ones (this also implies that ++ * seeky queues cannot receive guarantees in the service ++ * domain; after a timeout they are charged for the time they ++ * have been in service, to preserve fairness among them, but ++ * without service-domain guarantees). ++ */ ++ unsigned int bfq_timeout; ++ ++ /* ++ * Number of consecutive requests that must be issued within ++ * the idle time slice to set again idling to a queue which ++ * was marked as non-I/O-bound (see the definition of the ++ * IO_bound flag for further details). ++ */ ++ unsigned int bfq_requests_within_timer; ++ ++ /* ++ * Force device idling whenever needed to provide accurate ++ * service guarantees, without caring about throughput ++ * issues. CAVEAT: this may even increase latencies, in case ++ * of useless idling for processes that did stop doing I/O. ++ */ ++ bool strict_guarantees; ++ ++ /* ++ * Last time at which a queue entered the current burst of ++ * queues being activated shortly after each other; for more ++ * details about this and the following parameters related to ++ * a burst of activations, see the comments on the function ++ * bfq_handle_burst. ++ */ ++ unsigned long last_ins_in_burst; ++ /* ++ * Reference time interval used to decide whether a queue has ++ * been activated shortly after @last_ins_in_burst. ++ */ ++ unsigned long bfq_burst_interval; ++ /* number of queues in the current burst of queue activations */ ++ int burst_size; ++ ++ /* common parent entity for the queues in the burst */ ++ struct bfq_entity *burst_parent_entity; ++ /* Maximum burst size above which the current queue-activation ++ * burst is deemed as 'large'. ++ */ ++ unsigned long bfq_large_burst_thresh; ++ /* true if a large queue-activation burst is in progress */ ++ bool large_burst; ++ /* ++ * Head of the burst list (as for the above fields, more ++ * details in the comments on the function bfq_handle_burst). ++ */ ++ struct hlist_head burst_list; ++ ++ /* if set to true, low-latency heuristics are enabled */ ++ bool low_latency; ++ /* ++ * Maximum factor by which the weight of a weight-raised queue ++ * is multiplied. ++ */ ++ unsigned int bfq_wr_coeff; ++ /* maximum duration of a weight-raising period (jiffies) */ ++ unsigned int bfq_wr_max_time; ++ ++ /* Maximum weight-raising duration for soft real-time processes */ ++ unsigned int bfq_wr_rt_max_time; ++ /* ++ * Minimum idle period after which weight-raising may be ++ * reactivated for a queue (in jiffies). ++ */ ++ unsigned int bfq_wr_min_idle_time; ++ /* ++ * Minimum period between request arrivals after which ++ * weight-raising may be reactivated for an already busy async ++ * queue (in jiffies). ++ */ ++ unsigned long bfq_wr_min_inter_arr_async; ++ ++ /* Max service-rate for a soft real-time queue, in sectors/sec */ ++ unsigned int bfq_wr_max_softrt_rate; ++ /* ++ * Cached value of the product ref_rate*ref_wr_duration, used ++ * for computing the maximum duration of weight raising ++ * automatically. ++ */ ++ u64 rate_dur_prod; ++ ++ /* fallback dummy bfqq for extreme OOM conditions */ ++ struct bfq_queue oom_bfqq; ++}; ++ ++enum bfqq_state_flags { ++ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ ++ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ ++ BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ ++ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* ++ * waiting for a request ++ * without idling the device ++ */ ++ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ ++ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ ++ BFQ_BFQQ_FLAG_has_short_ttime, /* queue has a short think time */ ++ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ ++ BFQ_BFQQ_FLAG_IO_bound, /* ++ * bfqq has timed-out at least once ++ * having consumed at most 2/10 of ++ * its budget ++ */ ++ BFQ_BFQQ_FLAG_in_large_burst, /* ++ * bfqq activated in a large burst, ++ * see comments to bfq_handle_burst. ++ */ ++ BFQ_BFQQ_FLAG_softrt_update, /* ++ * may need softrt-next-start ++ * update ++ */ ++ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ ++ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ ++}; ++ ++#define BFQ_BFQQ_FNS(name) \ ++static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ ++{ \ ++ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ ++} \ ++static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ ++{ \ ++ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ ++} ++ ++BFQ_BFQQ_FNS(just_created); ++BFQ_BFQQ_FNS(busy); ++BFQ_BFQQ_FNS(wait_request); ++BFQ_BFQQ_FNS(non_blocking_wait_rq); ++BFQ_BFQQ_FNS(must_alloc); ++BFQ_BFQQ_FNS(fifo_expire); ++BFQ_BFQQ_FNS(has_short_ttime); ++BFQ_BFQQ_FNS(sync); ++BFQ_BFQQ_FNS(IO_bound); ++BFQ_BFQQ_FNS(in_large_burst); ++BFQ_BFQQ_FNS(coop); ++BFQ_BFQQ_FNS(split_coop); ++BFQ_BFQQ_FNS(softrt_update); ++#undef BFQ_BFQQ_FNS ++ ++/* Logging facilities. */ ++#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE ++ ++static const char *checked_dev_name(const struct device *dev) ++{ ++ static const char nodev[] = "nodev"; ++ ++ if (dev) ++ return dev_name(dev); ++ ++ return nodev; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ pr_crit("%s bfq%d%c %s [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ pr_crit("%s %s [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ __pbuf, __func__, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ pr_crit("%s bfq%d%c [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ (bfqq)->pid, bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ pr_crit("%s bfq [%s] " fmt "\n", \ ++ checked_dev_name((bfqd)->queue->backing_dev_info->dev), \ ++ __func__, ##args) ++ ++#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++#if !defined(CONFIG_BLK_DEV_IO_TRACE) ++ ++/* Avoid possible "unused-variable" warning. See commit message. */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) ((void) (bfqq)) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) ((void) (bfqg)) ++ ++#define bfq_log(bfqd, fmt, args...) do {} while (0) ++ ++#else /* CONFIG_BLK_DEV_IO_TRACE */ ++ ++#include <linux/blktrace_api.h> ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); ++static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ assert_spin_locked((bfqd)->queue->queue_lock); \ ++ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s [%s] " fmt, \ ++ (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __pbuf, __func__, ##args); \ ++} while (0) ++ ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ ++ char __pbuf[128]; \ ++ \ ++ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ ++ blk_add_trace_msg((bfqd)->queue, "%s [%s] " fmt, __pbuf, \ ++ __func__, ##args); \ ++} while (0) ++ ++#else /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq%d%c [%s] " fmt, (bfqq)->pid, \ ++ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ ++ __func__, ##args) ++#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) ++ ++#endif /* BFQ_GROUP_IOSCHED_ENABLED */ ++ ++#define bfq_log(bfqd, fmt, args...) \ ++ blk_add_trace_msg((bfqd)->queue, "bfq [%s] " fmt, __func__, ##args) ++ ++#endif /* CONFIG_BLK_DEV_IO_TRACE */ ++#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ ++ ++/* Expiration reasons. */ ++enum bfqq_expiration { ++ BFQ_BFQQ_TOO_IDLE = 0, /* ++ * queue has been idling for ++ * too long ++ */ ++ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ ++ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ ++ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ ++ BFQ_BFQQ_PREEMPTED /* preemption in progress */ ++}; ++ ++ ++struct bfqg_stats { ++#if defined(BFQ_GROUP_IOSCHED_ENABLED) && defined(CONFIG_DEBUG_BLK_CGROUP) ++ /* number of ios merged */ ++ struct blkg_rwstat merged; ++ /* total time spent on device in ns, may not be accurate w/ queueing */ ++ struct blkg_rwstat service_time; ++ /* total time spent waiting in scheduler queue in ns */ ++ struct blkg_rwstat wait_time; ++ /* number of IOs queued up */ ++ struct blkg_rwstat queued; ++ /* total disk time and nr sectors dispatched by this group */ ++ struct blkg_stat time; ++ /* sum of number of ios queued across all samples */ ++ struct blkg_stat avg_queue_size_sum; ++ /* count of samples taken for average */ ++ struct blkg_stat avg_queue_size_samples; ++ /* how many times this group has been removed from service tree */ ++ struct blkg_stat dequeue; ++ /* total time spent waiting for it to be assigned a timeslice. */ ++ struct blkg_stat group_wait_time; ++ /* time spent idling for this blkcg_gq */ ++ struct blkg_stat idle_time; ++ /* total time with empty current active q with other requests queued */ ++ struct blkg_stat empty_time; ++ /* fields after this shouldn't be cleared on stat reset */ ++ uint64_t start_group_wait_time; ++ uint64_t start_idle_time; ++ uint64_t start_empty_time; ++ uint16_t flags; ++#endif /* BFQ_GROUP_IOSCHED_ENABLED && CONFIG_DEBUG_BLK_CGROUP */ ++}; ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++/* ++ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. ++ * ++ * @ps: @blkcg_policy_storage that this structure inherits ++ * @weight: weight of the bfq_group ++ */ ++struct bfq_group_data { ++ /* must be the first member */ ++ struct blkcg_policy_data pd; ++ ++ unsigned int weight; ++}; ++ ++/** ++ * struct bfq_group - per (device, cgroup) data structure. ++ * @entity: schedulable entity to insert into the parent group sched_data. ++ * @sched_data: own sched_data, to contain child entities (they may be ++ * both bfq_queues and bfq_groups). ++ * @bfqd: the bfq_data for the device this group acts upon. ++ * @async_bfqq: array of async queues for all the tasks belonging to ++ * the group, one queue per ioprio value per ioprio_class, ++ * except for the idle class that has only one queue. ++ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). ++ * @my_entity: pointer to @entity, %NULL for the toplevel group; used ++ * to avoid too many special cases during group creation/ ++ * migration. ++ * @active_entities: number of active entities belonging to the group; ++ * unused for the root group. Used to know whether there ++ * are groups with more than one active @bfq_entity ++ * (see the comments to the function ++ * bfq_bfqq_may_idle()). ++ * @rq_pos_tree: rbtree sorted by next_request position, used when ++ * determining if two or more queues have interleaving ++ * requests (see bfq_find_close_cooperator()). ++ * ++ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup ++ * there is a set of bfq_groups, each one collecting the lower-level ++ * entities belonging to the group that are acting on the same device. ++ * ++ * Locking works as follows: ++ * o @bfqd is protected by the queue lock, RCU is used to access it ++ * from the readers. ++ * o All the other fields are protected by the @bfqd queue lock. ++ */ ++struct bfq_group { ++ /* must be the first member */ ++ struct blkg_policy_data pd; ++ ++ struct bfq_entity entity; ++ struct bfq_sched_data sched_data; ++ ++ void *bfqd; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct bfq_entity *my_entity; ++ ++ int active_entities; ++ ++ struct rb_root rq_pos_tree; ++ ++ struct bfqg_stats stats; ++}; ++ ++#else ++struct bfq_group { ++ struct bfq_sched_data sched_data; ++ ++ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; ++ struct bfq_queue *async_idle_bfqq; ++ ++ struct rb_root rq_pos_tree; ++}; ++#endif ++ ++static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); ++ ++static unsigned int bfq_class_idx(struct bfq_entity *entity) ++{ ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ ++ return bfqq ? bfqq->ioprio_class - 1 : ++ BFQ_DEFAULT_GRP_CLASS - 1; ++} ++ ++static unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd) ++{ ++ return bfqd->busy_queues[0] + bfqd->busy_queues[1] + ++ bfqd->busy_queues[2]; ++} ++ ++static struct bfq_service_tree * ++bfq_entity_service_tree(struct bfq_entity *entity) ++{ ++ struct bfq_sched_data *sched_data = entity->sched_data; ++ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); ++ unsigned int idx = bfq_class_idx(entity); ++ ++ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); ++ BUG_ON(sched_data == NULL); ++ ++ if (bfqq) ++ bfq_log_bfqq(bfqq->bfqd, bfqq, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ else { ++ struct bfq_group *bfqg = ++ container_of(entity, struct bfq_group, entity); ++ ++ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, ++ "%p %d", ++ sched_data->service_tree + idx, idx); ++ } ++#endif ++ return sched_data->service_tree + idx; ++} ++ ++static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) ++{ ++ return bic->bfqq[is_sync]; ++} ++ ++static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, ++ bool is_sync) ++{ ++ bic->bfqq[is_sync] = bfqq; ++} ++ ++static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) ++{ ++ return bic->icq.q->elevator->elevator_data; ++} ++ ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ struct bfq_entity *group_entity = bfqq->entity.parent; ++ ++ if (!group_entity) ++ group_entity = &bfqq->bfqd->root_group->entity; ++ ++ return container_of(group_entity, struct bfq_group, entity); ++} ++ ++#else ++ ++static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) ++{ ++ return bfqq->bfqd->root_group; ++} ++ ++#endif ++ ++static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); ++static void bfq_put_queue(struct bfq_queue *bfqq); ++static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); ++static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, ++ struct bio *bio, bool is_sync, ++ struct bfq_io_cq *bic); ++static void bfq_end_wr_async_queues(struct bfq_data *bfqd, ++ struct bfq_group *bfqg); ++#ifdef BFQ_GROUP_IOSCHED_ENABLED ++static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); ++#endif ++static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); ++ ++#endif /* _BFQ_H */ +diff --git a/block/blk-mq.c b/block/blk-mq.c +index e3c39ea8e17b..7a57368841f6 100644 +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -2878,6 +2878,8 @@ int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) + } + if (ret) + break; ++ if (q->elevator && q->elevator->type->ops.mq.depth_updated) ++ q->elevator->type->ops.mq.depth_updated(hctx); + } + + if (!ret) +diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h +index 6980014357d4..8c4568ea6884 100644 +--- a/include/linux/blkdev.h ++++ b/include/linux/blkdev.h +@@ -54,7 +54,7 @@ struct blk_stat_callback; + * Maximum number of blkcg policies allowed to be registered concurrently. + * Defined here to simplify include dependency. + */ +-#define BLKCG_MAX_POLS 5 ++#define BLKCG_MAX_POLS 7 + + typedef void (rq_end_io_fn)(struct request *, blk_status_t); + +@@ -127,6 +127,10 @@ typedef __u32 __bitwise req_flags_t; + #define RQF_MQ_POLL_SLEPT ((__force req_flags_t)(1 << 20)) + /* ->timeout has been called, don't expire again */ + #define RQF_TIMED_OUT ((__force req_flags_t)(1 << 21)) ++/* DEBUG: rq in bfq-mq dispatch list */ ++#define RQF_DISP_LIST ((__force req_flags_t)(1 << 22)) ++/* DEBUG: rq had get_rq_private executed on it */ ++#define RQF_GOT ((__force req_flags_t)(1 << 23)) + + /* flags that prevent us from merging requests: */ + #define RQF_NOMERGE_FLAGS \ +diff --git a/include/linux/elevator.h b/include/linux/elevator.h +index a02deea30185..a2bf4a6b9316 100644 +--- a/include/linux/elevator.h ++++ b/include/linux/elevator.h +@@ -99,6 +99,7 @@ struct elevator_mq_ops { + void (*exit_sched)(struct elevator_queue *); + int (*init_hctx)(struct blk_mq_hw_ctx *, unsigned int); + void (*exit_hctx)(struct blk_mq_hw_ctx *, unsigned int); ++ void (*depth_updated)(struct blk_mq_hw_ctx *); + + bool (*allow_merge)(struct request_queue *, struct request *, struct bio *); + bool (*bio_merge)(struct blk_mq_hw_ctx *, struct bio *); diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch new file mode 100644 index 00000000..f184b08e --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-drop_ancient-and-wrong-msg.patch @@ -0,0 +1,29 @@ +diff -Naur linux-4.15.1/drivers/edac/amd64_edac.c linux-4.15.1-p/drivers/edac/amd64_edac.c +--- linux-4.15.1/drivers/edac/amd64_edac.c 2018-02-03 17:58:44.000000000 +0100 ++++ linux-4.15.1-p/drivers/edac/amd64_edac.c 2018-02-12 01:52:10.411149240 +0100 +@@ -3020,17 +3020,6 @@ + amd64_warn("Error restoring NB MCGCTL settings!\n"); + } + +-/* +- * EDAC requires that the BIOS have ECC enabled before +- * taking over the processing of ECC errors. A command line +- * option allows to force-enable hardware ECC later in +- * enable_ecc_error_reporting(). +- */ +-static const char *ecc_msg = +- "ECC disabled in the BIOS or no ECC capability, module will not load.\n" +- " Either enable ECC checking or force module loading by setting " +- "'ecc_enable_override'.\n" +- " (Note that use of the override may cause unknown side effects.)\n"; + + static bool ecc_enabled(struct pci_dev *F3, u16 nid) + { +@@ -3083,7 +3072,6 @@ + nid, (ecc_en ? "enabled" : "disabled")); + + if (!ecc_en || !nb_mce_en) { +- amd64_info("%s", ecc_msg); + return false; + } + return true; diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-enable_alx_wol.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-enable_alx_wol.patch new file mode 100644 index 00000000..1b7f6e13 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-enable_alx_wol.patch @@ -0,0 +1,485 @@ +diff --git a/drivers/net/ethernet/atheros/alx/ethtool.c b/drivers/net/ethernet/atheros/alx/ethtool.c +index 2f4eabf652e8..859e27236ce4 100644 +--- a/drivers/net/ethernet/atheros/alx/ethtool.c ++++ b/drivers/net/ethernet/atheros/alx/ethtool.c +@@ -310,11 +310,47 @@ static int alx_get_sset_count(struct net_device *netdev, int sset) + } + } + ++static void alx_get_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) ++{ ++ struct alx_priv *alx = netdev_priv(netdev); ++ struct alx_hw *hw = &alx->hw; ++ ++ wol->supported = WAKE_MAGIC | WAKE_PHY; ++ wol->wolopts = 0; ++ ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC) ++ wol->wolopts |= WAKE_MAGIC; ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY) ++ wol->wolopts |= WAKE_PHY; ++} ++ ++static int alx_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol) ++{ ++ struct alx_priv *alx = netdev_priv(netdev); ++ struct alx_hw *hw = &alx->hw; ++ ++ if (wol->wolopts & ~(WAKE_MAGIC | WAKE_PHY)) ++ return -EOPNOTSUPP; ++ ++ hw->sleep_ctrl = 0; ++ ++ if (wol->wolopts & WAKE_MAGIC) ++ hw->sleep_ctrl |= ALX_SLEEP_WOL_MAGIC; ++ if (wol->wolopts & WAKE_PHY) ++ hw->sleep_ctrl |= ALX_SLEEP_WOL_PHY; ++ ++ device_set_wakeup_enable(&alx->hw.pdev->dev, hw->sleep_ctrl); ++ ++ return 0; ++} ++ + const struct ethtool_ops alx_ethtool_ops = { + .get_pauseparam = alx_get_pauseparam, + .set_pauseparam = alx_set_pauseparam, + .get_msglevel = alx_get_msglevel, + .set_msglevel = alx_set_msglevel, ++ .get_wol = alx_get_wol, ++ .set_wol = alx_set_wol, + .get_link = ethtool_op_get_link, + .get_strings = alx_get_strings, + .get_sset_count = alx_get_sset_count, +diff --git a/drivers/net/ethernet/atheros/alx/hw.c b/drivers/net/ethernet/atheros/alx/hw.c +index 6ac40b0003a3..4791b9dbbe26 100644 +--- a/drivers/net/ethernet/atheros/alx/hw.c ++++ b/drivers/net/ethernet/atheros/alx/hw.c +@@ -332,6 +332,16 @@ void alx_set_macaddr(struct alx_hw *hw, const u8 *addr) + alx_write_mem32(hw, ALX_STAD1, val); + } + ++static void alx_enable_osc(struct alx_hw *hw) ++{ ++ u32 val; ++ ++ /* rising edge */ ++ val = alx_read_mem32(hw, ALX_MISC); ++ alx_write_mem32(hw, ALX_MISC, val & ~ALX_MISC_INTNLOSC_OPEN); ++ alx_write_mem32(hw, ALX_MISC, val | ALX_MISC_INTNLOSC_OPEN); ++} ++ + static void alx_reset_osc(struct alx_hw *hw, u8 rev) + { + u32 val, val2; +@@ -848,6 +858,66 @@ void alx_post_phy_link(struct alx_hw *hw) + } + } + ++ ++/* NOTE: ++ * 1. phy link must be established before calling this function ++ * 2. wol option (pattern,magic,link,etc.) is configed before call it. ++ */ ++int alx_pre_suspend(struct alx_hw *hw, int speed, u8 duplex) ++{ ++ u32 master, mac, phy, val; ++ int err = 0; ++ ++ master = alx_read_mem32(hw, ALX_MASTER); ++ master &= ~ALX_MASTER_PCLKSEL_SRDS; ++ mac = hw->rx_ctrl; ++ /* 10/100 half */ ++ ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED, ALX_MAC_CTRL_SPEED_10_100); ++ mac &= ~(ALX_MAC_CTRL_FULLD | ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_TX_EN); ++ ++ phy = alx_read_mem32(hw, ALX_PHY_CTRL); ++ phy &= ~(ALX_PHY_CTRL_DSPRST_OUT | ALX_PHY_CTRL_CLS); ++ phy |= ALX_PHY_CTRL_RST_ANALOG | ALX_PHY_CTRL_HIB_PULSE | ++ ALX_PHY_CTRL_HIB_EN; ++ ++ /* without any activity */ ++ if (!(hw->sleep_ctrl & ALX_SLEEP_ACTIVE)) { ++ err = alx_write_phy_reg(hw, ALX_MII_IER, 0); ++ if (err) ++ return err; ++ phy |= ALX_PHY_CTRL_IDDQ | ALX_PHY_CTRL_POWER_DOWN; ++ } else { ++ if (hw->sleep_ctrl & (ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_CIFS)) ++ mac |= ALX_MAC_CTRL_RX_EN | ALX_MAC_CTRL_BRD_EN; ++ if (hw->sleep_ctrl & ALX_SLEEP_CIFS) ++ mac |= ALX_MAC_CTRL_TX_EN; ++ if (duplex == DUPLEX_FULL) ++ mac |= ALX_MAC_CTRL_FULLD; ++ if (speed == SPEED_1000) ++ ALX_SET_FIELD(mac, ALX_MAC_CTRL_SPEED, ++ ALX_MAC_CTRL_SPEED_1000); ++ phy |= ALX_PHY_CTRL_DSPRST_OUT; ++ err = alx_write_phy_ext(hw, ALX_MIIEXT_ANEG, ++ ALX_MIIEXT_S3DIG10, ++ ALX_MIIEXT_S3DIG10_SL); ++ if (err) ++ return err; ++ } ++ ++ alx_enable_osc(hw); ++ hw->rx_ctrl = mac; ++ alx_write_mem32(hw, ALX_MASTER, master); ++ alx_write_mem32(hw, ALX_MAC_CTRL, mac); ++ alx_write_mem32(hw, ALX_PHY_CTRL, phy); ++ ++ /* set val of PDLL D3PLLOFF */ ++ val = alx_read_mem32(hw, ALX_PDLL_TRNS1); ++ val |= ALX_PDLL_TRNS1_D3PLLOFF_EN; ++ alx_write_mem32(hw, ALX_PDLL_TRNS1, val); ++ ++ return 0; ++} ++ + bool alx_phy_configured(struct alx_hw *hw) + { + u32 cfg, hw_cfg; +@@ -920,6 +990,26 @@ int alx_clear_phy_intr(struct alx_hw *hw) + return alx_read_phy_reg(hw, ALX_MII_ISR, &isr); + } + ++int alx_config_wol(struct alx_hw *hw) ++{ ++ u32 wol = 0; ++ int err = 0; ++ ++ /* turn on magic packet event */ ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_MAGIC) ++ wol |= ALX_WOL0_MAGIC_EN | ALX_WOL0_PME_MAGIC_EN; ++ ++ /* turn on link up event */ ++ if (hw->sleep_ctrl & ALX_SLEEP_WOL_PHY) { ++ wol |= ALX_WOL0_LINK_EN | ALX_WOL0_PME_LINK; ++ /* only link up can wake up */ ++ err = alx_write_phy_reg(hw, ALX_MII_IER, ALX_IER_LINK_UP); ++ } ++ alx_write_mem32(hw, ALX_WOL0, wol); ++ ++ return err; ++} ++ + void alx_disable_rss(struct alx_hw *hw) + { + u32 ctrl = alx_read_mem32(hw, ALX_RXQ0); +@@ -1045,6 +1135,71 @@ void alx_mask_msix(struct alx_hw *hw, int index, bool mask) + } + + ++int alx_select_powersaving_speed(struct alx_hw *hw, int *speed, u8 *duplex) ++{ ++ int i, err; ++ u16 lpa; ++ ++ err = alx_read_phy_link(hw); ++ if (err) ++ return err; ++ ++ if (hw->link_speed == SPEED_UNKNOWN) { ++ *speed = SPEED_UNKNOWN; ++ *duplex = DUPLEX_UNKNOWN; ++ return 0; ++ } ++ ++ err = alx_read_phy_reg(hw, MII_LPA, &lpa); ++ if (err) ++ return err; ++ ++ if (!(lpa & LPA_LPACK)) { ++ *speed = hw->link_speed; ++ return 0; ++ } ++ ++ if (lpa & LPA_10FULL) { ++ *speed = SPEED_10; ++ *duplex = DUPLEX_FULL; ++ } else if (lpa & LPA_10HALF) { ++ *speed = SPEED_10; ++ *duplex = DUPLEX_HALF; ++ } else if (lpa & LPA_100FULL) { ++ *speed = SPEED_100; ++ *duplex = DUPLEX_FULL; ++ } else { ++ *speed = SPEED_100; ++ *duplex = DUPLEX_HALF; ++ } ++ ++ if (*speed == hw->link_speed && *duplex == hw->duplex) ++ return 0; ++ err = alx_write_phy_reg(hw, ALX_MII_IER, 0); ++ if (err) ++ return err; ++ err = alx_setup_speed_duplex(hw, alx_speed_to_ethadv(*speed, *duplex) | ++ ADVERTISED_Autoneg, ALX_FC_ANEG | ++ ALX_FC_RX | ALX_FC_TX); ++ if (err) ++ return err; ++ ++ /* wait for linkup */ ++ for (i = 0; i < ALX_MAX_SETUP_LNK_CYCLE; i++) { ++ msleep(100); ++ ++ err = alx_read_phy_link(hw); ++ if (err < 0) ++ return err; ++ if (hw->link_speed != SPEED_UNKNOWN) ++ break; ++ } ++ if (i == ALX_MAX_SETUP_LNK_CYCLE) ++ return -ETIMEDOUT; ++ ++ return 0; ++} ++ + bool alx_get_phy_info(struct alx_hw *hw) + { + u16 devs1, devs2; +diff --git a/drivers/net/ethernet/atheros/alx/hw.h b/drivers/net/ethernet/atheros/alx/hw.h +index e42d7e0947eb..a7fb6c8d846a 100644 +--- a/drivers/net/ethernet/atheros/alx/hw.h ++++ b/drivers/net/ethernet/atheros/alx/hw.h +@@ -487,6 +487,8 @@ struct alx_hw { + u8 flowctrl; + u32 adv_cfg; + ++ u32 sleep_ctrl; ++ + spinlock_t mdio_lock; + struct mdio_if_info mdio; + u16 phy_id[2]; +@@ -549,12 +551,14 @@ void alx_reset_pcie(struct alx_hw *hw); + void alx_enable_aspm(struct alx_hw *hw, bool l0s_en, bool l1_en); + int alx_setup_speed_duplex(struct alx_hw *hw, u32 ethadv, u8 flowctrl); + void alx_post_phy_link(struct alx_hw *hw); ++int alx_pre_suspend(struct alx_hw *hw, int speed, u8 duplex); + int alx_read_phy_reg(struct alx_hw *hw, u16 reg, u16 *phy_data); + int alx_write_phy_reg(struct alx_hw *hw, u16 reg, u16 phy_data); + int alx_read_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 *pdata); + int alx_write_phy_ext(struct alx_hw *hw, u8 dev, u16 reg, u16 data); + int alx_read_phy_link(struct alx_hw *hw); + int alx_clear_phy_intr(struct alx_hw *hw); ++int alx_config_wol(struct alx_hw *hw); + void alx_cfg_mac_flowcontrol(struct alx_hw *hw, u8 fc); + void alx_start_mac(struct alx_hw *hw); + int alx_reset_mac(struct alx_hw *hw); +@@ -563,6 +567,7 @@ bool alx_phy_configured(struct alx_hw *hw); + void alx_configure_basic(struct alx_hw *hw); + void alx_mask_msix(struct alx_hw *hw, int index, bool mask); + void alx_disable_rss(struct alx_hw *hw); ++int alx_select_powersaving_speed(struct alx_hw *hw, int *speed, u8 *duplex); + bool alx_get_phy_info(struct alx_hw *hw); + void alx_update_hw_stats(struct alx_hw *hw); + +diff --git a/drivers/net/ethernet/atheros/alx/main.c b/drivers/net/ethernet/atheros/alx/main.c +index 5e5022fa1d04..7adaf10a1929 100644 +--- a/drivers/net/ethernet/atheros/alx/main.c ++++ b/drivers/net/ethernet/atheros/alx/main.c +@@ -1070,6 +1070,7 @@ static int alx_init_sw(struct alx_priv *alx) + alx->dev->max_mtu = ALX_MAX_FRAME_LEN(ALX_MAX_FRAME_SIZE); + alx->tx_ringsz = 256; + alx->rx_ringsz = 512; ++ hw->sleep_ctrl = ALX_SLEEP_WOL_MAGIC | ALX_SLEEP_WOL_PHY; + hw->imt = 200; + alx->int_mask = ALX_ISR_MISC; + hw->dma_chnl = hw->max_dma_chnl; +@@ -1345,6 +1346,65 @@ static int alx_stop(struct net_device *netdev) + __alx_stop(netdev_priv(netdev)); + return 0; + } ++static int __alx_shutdown(struct pci_dev *pdev, bool *wol_en) ++{ ++ struct alx_priv *alx = pci_get_drvdata(pdev); ++ struct net_device *netdev = alx->dev; ++ struct alx_hw *hw = &alx->hw; ++ int err, speed; ++ u8 duplex; ++ ++ netif_device_detach(netdev); ++ ++ if (netif_running(netdev)) ++ __alx_stop(alx); ++ ++#ifdef CONFIG_PM_SLEEP ++ err = pci_save_state(pdev); ++ if (err) ++ return err; ++#endif ++ ++ err = alx_select_powersaving_speed(hw, &speed, &duplex); ++ if (err) ++ return err; ++ err = alx_clear_phy_intr(hw); ++ if (err) ++ return err; ++ err = alx_pre_suspend(hw, speed, duplex); ++ if (err) ++ return err; ++ err = alx_config_wol(hw); ++ if (err) ++ return err; ++ ++ *wol_en = false; ++ if (hw->sleep_ctrl & ALX_SLEEP_ACTIVE) { ++ netif_info(alx, wol, netdev, ++ "wol: ctrl=%X, speed=%X\n", ++ hw->sleep_ctrl, speed); ++ device_set_wakeup_enable(&pdev->dev, true); ++ *wol_en = true; ++ } ++ ++ pci_disable_device(pdev); ++ ++ return 0; ++} ++ ++static void alx_shutdown(struct pci_dev *pdev) ++{ ++ int err; ++ bool wol_en; ++ ++ err = __alx_shutdown(pdev, &wol_en); ++ if (!err) { ++ pci_wake_from_d3(pdev, wol_en); ++ pci_set_power_state(pdev, PCI_D3hot); ++ } else { ++ dev_err(&pdev->dev, "shutdown fail %d\n", err); ++ } ++} + + static void alx_link_check(struct work_struct *work) + { +@@ -1841,6 +1901,8 @@ static int alx_probe(struct pci_dev *pdev, const struct pci_device_id *ent) + goto out_unmap; + } + ++ device_set_wakeup_enable(&pdev->dev, hw->sleep_ctrl); ++ + netdev_info(netdev, + "Qualcomm Atheros AR816x/AR817x Ethernet [%pM]\n", + netdev->dev_addr); +@@ -1883,12 +1945,21 @@ static void alx_remove(struct pci_dev *pdev) + static int alx_suspend(struct device *dev) + { + struct pci_dev *pdev = to_pci_dev(dev); +- struct alx_priv *alx = pci_get_drvdata(pdev); ++ int err; ++ bool wol_en; + +- if (!netif_running(alx->dev)) +- return 0; +- netif_device_detach(alx->dev); +- __alx_stop(alx); ++ err = __alx_shutdown(pdev, &wol_en); ++ if (err) { ++ dev_err(&pdev->dev, "shutdown fail in suspend %d\n", err); ++ return err; ++ } ++ ++ if (wol_en) { ++ pci_prepare_to_sleep(pdev); ++ } else { ++ pci_wake_from_d3(pdev, false); ++ pci_set_power_state(pdev, PCI_D3hot); ++ } + return 0; + } + +@@ -1896,26 +1967,49 @@ static int alx_resume(struct device *dev) + { + struct pci_dev *pdev = to_pci_dev(dev); + struct alx_priv *alx = pci_get_drvdata(pdev); +- struct alx_hw *hw = &alx->hw; +- int err; +- +- alx_reset_phy(hw); +- +- if (!netif_running(alx->dev)) +- return 0; +- netif_device_attach(alx->dev); +- +- rtnl_lock(); +- err = __alx_open(alx, true); +- rtnl_unlock(); +- ++ struct net_device *netdev = alx->dev; ++ struct alx_hw *hw = &alx->hw; ++ int err; ++ ++ pci_set_power_state(pdev, PCI_D0); ++ pci_restore_state(pdev); ++ pci_save_state(pdev); ++ ++ pci_enable_wake(pdev, PCI_D3hot, 0); ++ pci_enable_wake(pdev, PCI_D3cold, 0); ++ ++ hw->link_speed = SPEED_UNKNOWN; ++ alx->int_mask = ALX_ISR_MISC; ++ ++ alx_reset_pcie(hw); ++ alx_reset_phy(hw); ++ ++ err = alx_reset_mac(hw); ++ if (err) { ++ netif_err(alx, hw, alx->dev, ++ "resume:reset_mac fail %d\n", err); ++ return -EIO; ++ } ++ ++ err = alx_setup_speed_duplex(hw, hw->adv_cfg, hw->flowctrl); ++ if (err) { ++ netif_err(alx, hw, alx->dev, ++ "resume:setup_speed_duplex fail %d\n", err); ++ return -EIO; ++ } ++ ++ if (netif_running(netdev)) { ++ rtnl_lock(); ++ err = __alx_open(alx, true); ++ rtnl_unlock(); ++ if (err) ++ return err; ++ } ++ ++ netif_device_attach(netdev); + return err; + } + +-static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume); +-#define ALX_PM_OPS (&alx_pm_ops) +-#else +-#define ALX_PM_OPS NULL + #endif + + +@@ -1961,6 +2055,8 @@ static pci_ers_result_t alx_pci_error_slot_reset(struct pci_dev *pdev) + } + + pci_set_master(pdev); ++ pci_enable_wake(pdev, PCI_D3hot, 0); ++ pci_enable_wake(pdev, PCI_D3cold, 0); + + alx_reset_pcie(hw); + if (!alx_reset_mac(hw)) +@@ -2012,11 +2108,19 @@ static const struct pci_device_id alx_pci_tbl[] = { + {} + }; + ++#ifdef CONFIG_PM_SLEEP ++static SIMPLE_DEV_PM_OPS(alx_pm_ops, alx_suspend, alx_resume); ++#define ALX_PM_OPS (&alx_pm_ops) ++#else ++#define ALX_PM_OPS NULL ++#endif ++ + static struct pci_driver alx_driver = { + .name = alx_drv_name, + .id_table = alx_pci_tbl, + .probe = alx_probe, + .remove = alx_remove, ++ .shutdown = alx_shutdown, + .err_handler = &alx_err_handlers, + .driver.pm = ALX_PM_OPS, + }; diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-linux-hardened.patch new file mode 100644 index 00000000..42ba2084 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-linux-hardened.patch @@ -0,0 +1,2732 @@ +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index f5acf35c712f..191e7eb6b9ce 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -496,16 +496,6 @@ + nosocket -- Disable socket memory accounting. + nokmem -- Disable kernel memory accounting. + +- checkreqprot [SELINUX] Set initial checkreqprot flag value. +- Format: { "0" | "1" } +- See security/selinux/Kconfig help text. +- 0 -- check protection applied by kernel (includes +- any implied execute protection). +- 1 -- check protection requested by application. +- Default value is set via a kernel config option. +- Value can be changed at runtime via +- /selinux/checkreqprot. +- + cio_ignore= [S390] + See Documentation/s390/CommonIO for details. + clk_ignore_unused +@@ -3105,6 +3095,11 @@ + the specified number of seconds. This is to be used if + your oopses keep scrolling off the screen. + ++ extra_latent_entropy ++ Enable a very simple form of latent entropy extraction ++ from the first 4GB of memory as the bootmem allocator ++ passes the memory pages to the buddy allocator. ++ + pcbit= [HW,ISDN] + + pcd. [PARIDE] +diff --git a/Documentation/sysctl/kernel.txt b/Documentation/sysctl/kernel.txt +index 37a679501ddc..59b747920f4d 100644 +--- a/Documentation/sysctl/kernel.txt ++++ b/Documentation/sysctl/kernel.txt +@@ -94,6 +94,7 @@ show up in /proc/sys/kernel: + - sysctl_writes_strict + - tainted + - threads-max ++- tiocsti_restrict + - unknown_nmi_panic + - watchdog + - watchdog_thresh +@@ -1041,6 +1042,26 @@ available RAM pages threads-max is reduced accordingly. + + ============================================================== + ++tiocsti_restrict: ++ ++This toggle indicates whether unprivileged users are prevented ++from using the TIOCSTI ioctl to inject commands into other processes ++which share a tty session. ++ ++When tiocsti_restrict is set to (0) there are no restrictions(accept ++the default restriction of only being able to injection commands into ++one's own tty). When tiocsti_restrict is set to (1), users must ++have CAP_SYS_ADMIN to use the TIOCSTI ioctl. ++ ++When user namespaces are in use, the check for the capability ++CAP_SYS_ADMIN is done against the user namespace that originally ++opened the tty. ++ ++The kernel config option CONFIG_SECURITY_TIOCSTI_RESTRICT sets the ++default value of tiocsti_restrict. ++ ++============================================================== ++ + unknown_nmi_panic: + + The value in this file affects behavior of handling NMI. When the +diff --git a/Makefile b/Makefile +index f1859811dca1..432040e2d299 100644 +--- a/Makefile ++++ b/Makefile +@@ -698,6 +698,9 @@ stackp-flags-$(CONFIG_STACKPROTECTOR_STRONG) := -fstack-protector-strong + KBUILD_CFLAGS += $(stackp-flags-y) + + ifeq ($(cc-name),clang) ++ifdef CONFIG_LOCAL_INIT ++KBUILD_CFLAGS += -fsanitize=local-init ++endif + KBUILD_CPPFLAGS += $(call cc-option,-Qunused-arguments,) + KBUILD_CFLAGS += $(call cc-disable-warning, format-invalid-specifier) + KBUILD_CFLAGS += $(call cc-disable-warning, gnu) +diff --git a/arch/Kconfig b/arch/Kconfig +index 6801123932a5..d331769f18cd 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -598,7 +598,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -632,7 +632,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +@@ -837,6 +837,7 @@ config ARCH_HAS_REFCOUNT + + config REFCOUNT_FULL + bool "Perform full reference count validation at the expense of speed" ++ default y + help + Enabling this switches the refcounting infrastructure from a fast + unchecked atomic_t implementation to a fully state checked +diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig +index 1b1a0e95c751..2397d505747f 100644 +--- a/arch/arm64/Kconfig ++++ b/arch/arm64/Kconfig +@@ -1013,6 +1013,7 @@ endif + + config ARM64_SW_TTBR0_PAN + bool "Emulate Privileged Access Never using TTBR0_EL1 switching" ++ default y + help + Enabling this option prevents the kernel from accessing + user-space memory directly by pointing TTBR0_EL1 to a reserved +@@ -1188,6 +1189,7 @@ config RANDOMIZE_BASE + bool "Randomize the address of the kernel image" + select ARM64_MODULE_PLTS if MODULES + select RELOCATABLE ++ default y + help + Randomizes the virtual address at which the kernel image is + loaded, as a security feature that deters exploit attempts +diff --git a/arch/arm64/Kconfig.debug b/arch/arm64/Kconfig.debug +index 69c9170bdd24..a786227db0e3 100644 +--- a/arch/arm64/Kconfig.debug ++++ b/arch/arm64/Kconfig.debug +@@ -42,6 +42,7 @@ config ARM64_RANDOMIZE_TEXT_OFFSET + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select ARM64_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/arm64/configs/defconfig b/arch/arm64/configs/defconfig +index db8d364f8476..67441db36c07 100644 +--- a/arch/arm64/configs/defconfig ++++ b/arch/arm64/configs/defconfig +@@ -1,4 +1,3 @@ +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_AUDIT=y + CONFIG_NO_HZ_IDLE=y +diff --git a/arch/arm64/include/asm/elf.h b/arch/arm64/include/asm/elf.h +index 433b9554c6a1..1f4b06317c9f 100644 +--- a/arch/arm64/include/asm/elf.h ++++ b/arch/arm64/include/asm/elf.h +@@ -114,10 +114,10 @@ + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +-#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) ++#define ELF_ET_DYN_BASE 0x100000000UL + + #ifndef __ASSEMBLY__ + +@@ -171,10 +171,10 @@ extern int arch_setup_additional_pages(struct linux_binprm *bprm, + /* 1GB of VA */ + #ifdef CONFIG_COMPAT + #define STACK_RND_MASK (test_thread_flag(TIF_32BIT) ? \ +- 0x7ff >> (PAGE_SHIFT - 12) : \ +- 0x3ffff >> (PAGE_SHIFT - 12)) ++ ((1UL << mmap_rnd_compat_bits) - 1) >> (PAGE_SHIFT - 12) : \ ++ ((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #else +-#define STACK_RND_MASK (0x3ffff >> (PAGE_SHIFT - 12)) ++#define STACK_RND_MASK (((1UL << mmap_rnd_bits) - 1) >> (PAGE_SHIFT - 12)) + #endif + + #ifdef __AARCH64EB__ +diff --git a/arch/arm64/kernel/process.c b/arch/arm64/kernel/process.c +index 7f1628effe6d..38bd2f95a961 100644 +--- a/arch/arm64/kernel/process.c ++++ b/arch/arm64/kernel/process.c +@@ -481,9 +481,9 @@ unsigned long arch_align_stack(unsigned long sp) + unsigned long arch_randomize_brk(struct mm_struct *mm) + { + if (is_compat_task()) +- return randomize_page(mm->brk, SZ_32M); ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; + else +- return randomize_page(mm->brk, SZ_1G); ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 44c6a82b7ce5..62aba195aae8 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -1189,8 +1189,7 @@ config VM86 + default X86_LEGACY_VM86 + + config X86_16BIT +- bool "Enable support for 16-bit segments" if EXPERT +- default y ++ bool "Enable support for 16-bit segments" + depends on MODIFY_LDT_SYSCALL + ---help--- + This option is required by programs like Wine to run 16-bit +@@ -2280,7 +2279,7 @@ config COMPAT_VDSO + choice + prompt "vsyscall table for legacy applications" + depends on X86_64 +- default LEGACY_VSYSCALL_EMULATE ++ default LEGACY_VSYSCALL_NONE + help + Legacy user code that does not know how to find the vDSO expects + to be able to issue three syscalls by calling fixed addresses in +@@ -2361,8 +2360,7 @@ config CMDLINE_OVERRIDE + be set to 'N' under normal conditions. + + config MODIFY_LDT_SYSCALL +- bool "Enable the LDT (local descriptor table)" if EXPERT +- default y ++ bool "Enable the LDT (local descriptor table)" + ---help--- + Linux can allow user programs to install a per-process x86 + Local Descriptor Table (LDT) using the modify_ldt(2) system +diff --git a/arch/x86/Kconfig.debug b/arch/x86/Kconfig.debug +index 7d68f0c7cfb1..85f04bbeadd8 100644 +--- a/arch/x86/Kconfig.debug ++++ b/arch/x86/Kconfig.debug +@@ -101,6 +101,7 @@ config EFI_PGT_DUMP + config DEBUG_WX + bool "Warn on W+X mappings at boot" + select X86_PTDUMP_CORE ++ default y + ---help--- + Generate a warning if any W+X mappings are found at boot. + +diff --git a/arch/x86/configs/x86_64_defconfig b/arch/x86/configs/x86_64_defconfig +index e32fc1f274d8..d08acc76502a 100644 +--- a/arch/x86/configs/x86_64_defconfig ++++ b/arch/x86/configs/x86_64_defconfig +@@ -1,5 +1,4 @@ + # CONFIG_LOCALVERSION_AUTO is not set +-CONFIG_SYSVIPC=y + CONFIG_POSIX_MQUEUE=y + CONFIG_BSD_PROCESS_ACCT=y + CONFIG_TASKSTATS=y +diff --git a/arch/x86/entry/vdso/vma.c b/arch/x86/entry/vdso/vma.c +index 5b8b556dbb12..a569f08b4478 100644 +--- a/arch/x86/entry/vdso/vma.c ++++ b/arch/x86/entry/vdso/vma.c +@@ -204,55 +204,9 @@ static int map_vdso(const struct vdso_image *image, unsigned long addr) + } + + #ifdef CONFIG_X86_64 +-/* +- * Put the vdso above the (randomized) stack with another randomized +- * offset. This way there is no hole in the middle of address space. +- * To save memory make sure it is still in the same PTE as the stack +- * top. This doesn't give that many random bits. +- * +- * Note that this algorithm is imperfect: the distribution of the vdso +- * start address within a PMD is biased toward the end. +- * +- * Only used for the 64-bit and x32 vdsos. +- */ +-static unsigned long vdso_addr(unsigned long start, unsigned len) +-{ +- unsigned long addr, end; +- unsigned offset; +- +- /* +- * Round up the start address. It can start out unaligned as a result +- * of stack start randomization. +- */ +- start = PAGE_ALIGN(start); +- +- /* Round the lowest possible end address up to a PMD boundary. */ +- end = (start + len + PMD_SIZE - 1) & PMD_MASK; +- if (end >= TASK_SIZE_MAX) +- end = TASK_SIZE_MAX; +- end -= len; +- +- if (end > start) { +- offset = get_random_int() % (((end - start) >> PAGE_SHIFT) + 1); +- addr = start + (offset << PAGE_SHIFT); +- } else { +- addr = start; +- } +- +- /* +- * Forcibly align the final address in case we have a hardware +- * issue that requires alignment for performance reasons. +- */ +- addr = align_vdso_addr(addr); +- +- return addr; +-} +- + static int map_vdso_randomized(const struct vdso_image *image) + { +- unsigned long addr = vdso_addr(current->mm->start_stack, image->size-image->sym_vvar_start); +- +- return map_vdso(image, addr); ++ return map_vdso(image, 0); + } + #endif + +diff --git a/arch/x86/include/asm/elf.h b/arch/x86/include/asm/elf.h +index 0d157d2a1e2a..770c8ae97f92 100644 +--- a/arch/x86/include/asm/elf.h ++++ b/arch/x86/include/asm/elf.h +@@ -249,11 +249,11 @@ extern int force_personality32; + + /* + * This is the base location for PIE (ET_DYN with INTERP) loads. On +- * 64-bit, this is above 4GB to leave the entire 32-bit address ++ * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ + #define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ +- (DEFAULT_MAP_WINDOW / 3 * 2)) ++ 0x100000000UL) + + /* This yields a mask that user programs can use to figure out what + instruction set this CPU supports. This could be done in user space, +@@ -313,8 +313,8 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + + #ifdef CONFIG_X86_32 + +-#define __STACK_RND_MASK(is32bit) (0x7ff) +-#define STACK_RND_MASK (0x7ff) ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#define STACK_RND_MASK ((1UL << mmap_rnd_bits) - 1) + + #define ARCH_DLINFO ARCH_DLINFO_IA32 + +@@ -323,7 +323,11 @@ extern bool mmap_address_hint_valid(unsigned long addr, unsigned long len); + #else /* CONFIG_X86_32 */ + + /* 1GB for 64bit, 8MB for 32bit */ +-#define __STACK_RND_MASK(is32bit) ((is32bit) ? 0x7ff : 0x3fffff) ++#ifdef CONFIG_COMPAT ++#define __STACK_RND_MASK(is32bit) ((is32bit) ? (1UL << mmap_rnd_compat_bits) - 1 : (1UL << mmap_rnd_bits) - 1) ++#else ++#define __STACK_RND_MASK(is32bit) ((1UL << mmap_rnd_bits) - 1) ++#endif + #define STACK_RND_MASK __STACK_RND_MASK(mmap_is_ia32()) + + #define ARCH_DLINFO \ +@@ -381,5 +385,4 @@ struct va_alignment { + } ____cacheline_aligned; + + extern struct va_alignment va_align; +-extern unsigned long align_vdso_addr(unsigned long); + #endif /* _ASM_X86_ELF_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 79ec7add5f98..2950448e00ac 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -310,6 +310,7 @@ static inline void cr4_set_bits(unsigned long mask) + + local_irq_save(flags); + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 | mask) != cr4) + __cr4_set(cr4 | mask); + local_irq_restore(flags); +@@ -322,6 +323,7 @@ static inline void cr4_clear_bits(unsigned long mask) + + local_irq_save(flags); + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + if ((cr4 & ~mask) != cr4) + __cr4_set(cr4 & ~mask); + local_irq_restore(flags); +@@ -332,6 +334,7 @@ static inline void cr4_toggle_bits_irqsoff(unsigned long mask) + unsigned long cr4; + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + __cr4_set(cr4 ^ mask); + } + +@@ -438,6 +441,7 @@ static inline void __native_flush_tlb_global(void) + raw_local_irq_save(flags); + + cr4 = this_cpu_read(cpu_tlbstate.cr4); ++ BUG_ON(cr4 != __read_cr4()); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ +diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c +index 44c4ef3d989b..05943ca7b59a 100644 +--- a/arch/x86/kernel/cpu/common.c ++++ b/arch/x86/kernel/cpu/common.c +@@ -1730,7 +1730,6 @@ void cpu_init(void) + wrmsrl(MSR_KERNEL_GS_BASE, 0); + barrier(); + +- x86_configure_nx(); + x2apic_setup(); + + /* +diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c +index 7d31192296a8..4f87550d814c 100644 +--- a/arch/x86/kernel/process.c ++++ b/arch/x86/kernel/process.c +@@ -39,6 +39,8 @@ + #include <asm/desc.h> + #include <asm/prctl.h> + #include <asm/spec-ctrl.h> ++#include <asm/elf.h> ++#include <linux/sizes.h> + + #include "process.h" + +@@ -779,7 +781,10 @@ unsigned long arch_align_stack(unsigned long sp) + + unsigned long arch_randomize_brk(struct mm_struct *mm) + { +- return randomize_page(mm->brk, 0x02000000); ++ if (mmap_is_ia32()) ++ return mm->brk + get_random_long() % SZ_32M + PAGE_SIZE; ++ else ++ return mm->brk + get_random_long() % SZ_1G + PAGE_SIZE; + } + + /* +diff --git a/arch/x86/kernel/sys_x86_64.c b/arch/x86/kernel/sys_x86_64.c +index 6a78d4b36a79..715009f7a96c 100644 +--- a/arch/x86/kernel/sys_x86_64.c ++++ b/arch/x86/kernel/sys_x86_64.c +@@ -54,13 +54,6 @@ static unsigned long get_align_bits(void) + return va_align.bits & get_align_mask(); + } + +-unsigned long align_vdso_addr(unsigned long addr) +-{ +- unsigned long align_mask = get_align_mask(); +- addr = (addr + align_mask) & ~align_mask; +- return addr | get_align_bits(); +-} +- + static int __init control_va_addr_alignment(char *str) + { + /* guard against enabling this on other CPU families */ +@@ -122,10 +115,7 @@ static void find_start_end(unsigned long addr, unsigned long flags, + } + + *begin = get_mmap_base(1); +- if (in_compat_syscall()) +- *end = task_size_32bit(); +- else +- *end = task_size_64bit(addr > DEFAULT_MAP_WINDOW); ++ *end = get_mmap_base(0); + } + + unsigned long +@@ -210,7 +200,7 @@ arch_get_unmapped_area_topdown(struct file *filp, const unsigned long addr0, + + info.flags = VM_UNMAPPED_AREA_TOPDOWN; + info.length = len; +- info.low_limit = PAGE_SIZE; ++ info.low_limit = get_mmap_base(1); + info.high_limit = get_mmap_base(0); + + /* +diff --git a/arch/x86/mm/init_32.c b/arch/x86/mm/init_32.c +index 979e0a02cbe1..d6ab882a0091 100644 +--- a/arch/x86/mm/init_32.c ++++ b/arch/x86/mm/init_32.c +@@ -560,9 +560,9 @@ static void __init pagetable_init(void) + + #define DEFAULT_PTE_MASK ~(_PAGE_NX | _PAGE_GLOBAL) + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __supported_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = DEFAULT_PTE_MASK; ++pteval_t __default_kernel_pte_mask __ro_after_init = DEFAULT_PTE_MASK; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +@@ -873,7 +873,7 @@ int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap) + #endif + #endif + +-int kernel_set_to_readonly __read_mostly; ++int kernel_set_to_readonly __ro_after_init; + + void set_kernel_text_rw(void) + { +@@ -925,12 +925,11 @@ void mark_rodata_ro(void) + unsigned long start = PFN_ALIGN(_text); + unsigned long size = PFN_ALIGN(_etext) - start; + ++ kernel_set_to_readonly = 1; + set_pages_ro(virt_to_page(start), size >> PAGE_SHIFT); + printk(KERN_INFO "Write protecting the kernel text: %luk\n", + size >> 10); + +- kernel_set_to_readonly = 1; +- + #ifdef CONFIG_CPA_DEBUG + printk(KERN_INFO "Testing CPA: Reverting %lx-%lx\n", + start, start+size); +diff --git a/arch/x86/mm/init_64.c b/arch/x86/mm/init_64.c +index a3e9c6ee3cf2..40bbcd978b0a 100644 +--- a/arch/x86/mm/init_64.c ++++ b/arch/x86/mm/init_64.c +@@ -66,9 +66,9 @@ + */ + + /* Bits supported by the hardware: */ +-pteval_t __supported_pte_mask __read_mostly = ~0; ++pteval_t __supported_pte_mask __ro_after_init = ~0; + /* Bits allowed in normal kernel mappings: */ +-pteval_t __default_kernel_pte_mask __read_mostly = ~0; ++pteval_t __default_kernel_pte_mask __ro_after_init = ~0; + EXPORT_SYMBOL_GPL(__supported_pte_mask); + /* Used in PAGE_KERNEL_* macros which are reasonably used out-of-tree: */ + EXPORT_SYMBOL(__default_kernel_pte_mask); +@@ -1201,7 +1201,7 @@ void __init mem_init(void) + mem_init_print_info(NULL); + } + +-int kernel_set_to_readonly; ++int kernel_set_to_readonly __ro_after_init; + + void set_kernel_text_rw(void) + { +@@ -1250,9 +1250,8 @@ void mark_rodata_ro(void) + + printk(KERN_INFO "Write protecting the kernel read-only data: %luk\n", + (end - start) >> 10); +- set_memory_ro(start, (end - start) >> PAGE_SHIFT); +- + kernel_set_to_readonly = 1; ++ set_memory_ro(start, (end - start) >> PAGE_SHIFT); + + /* + * The rodata/data/bss/brk section (but not the kernel text!) +diff --git a/block/blk-softirq.c b/block/blk-softirq.c +index 15c1f5e12eb8..ff72cccec5b8 100644 +--- a/block/blk-softirq.c ++++ b/block/blk-softirq.c +@@ -20,7 +20,7 @@ static DEFINE_PER_CPU(struct list_head, blk_cpu_done); + * Softirq action handler - move entries to local list and loop over them + * while passing them to the queue registered handler. + */ +-static __latent_entropy void blk_done_softirq(struct softirq_action *h) ++static __latent_entropy void blk_done_softirq(void) + { + struct list_head *cpu_list, local_list; + +diff --git a/drivers/ata/libata-core.c b/drivers/ata/libata-core.c +index b8c3f9e6af89..bf65bc091cb6 100644 +--- a/drivers/ata/libata-core.c ++++ b/drivers/ata/libata-core.c +@@ -5157,7 +5157,7 @@ void ata_qc_free(struct ata_queued_cmd *qc) + struct ata_port *ap; + unsigned int tag; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + ap = qc->ap; + + qc->flags = 0; +@@ -5174,7 +5174,7 @@ void __ata_qc_complete(struct ata_queued_cmd *qc) + struct ata_port *ap; + struct ata_link *link; + +- WARN_ON_ONCE(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ ++ BUG_ON(qc == NULL); /* ata_qc_from_tag _might_ return NULL */ + WARN_ON_ONCE(!(qc->flags & ATA_QCFLAG_ACTIVE)); + ap = qc->ap; + link = qc->dev->link; +diff --git a/drivers/char/Kconfig b/drivers/char/Kconfig +index 40728491f37b..b4f3ccfa2993 100644 +--- a/drivers/char/Kconfig ++++ b/drivers/char/Kconfig +@@ -9,7 +9,6 @@ source "drivers/tty/Kconfig" + + config DEVMEM + bool "/dev/mem virtual device support" +- default y + help + Say Y here if you want to support the /dev/mem device. + The /dev/mem device is used to access areas of physical +@@ -531,7 +530,6 @@ config TELCLOCK + config DEVPORT + bool "/dev/port character device" + depends on ISA || PCI +- default y + help + Say Y here if you want to support the /dev/port device. The /dev/port + device is similar to /dev/mem, but for I/O ports. +diff --git a/drivers/tty/Kconfig b/drivers/tty/Kconfig +index 0840d27381ea..ae292fcedaca 100644 +--- a/drivers/tty/Kconfig ++++ b/drivers/tty/Kconfig +@@ -122,7 +122,6 @@ config UNIX98_PTYS + + config LEGACY_PTYS + bool "Legacy (BSD) PTY support" +- default y + ---help--- + A pseudo terminal (PTY) is a software device consisting of two + halves: a master and a slave. The slave device behaves identical to +diff --git a/drivers/tty/tty_io.c b/drivers/tty/tty_io.c +index e7d192ebecd7..1c682abd31ca 100644 +--- a/drivers/tty/tty_io.c ++++ b/drivers/tty/tty_io.c +@@ -172,6 +172,7 @@ static void free_tty_struct(struct tty_struct *tty) + put_device(tty->dev); + kfree(tty->write_buf); + tty->magic = 0xDEADDEAD; ++ put_user_ns(tty->owner_user_ns); + kfree(tty); + } + +@@ -2175,11 +2176,19 @@ static int tty_fasync(int fd, struct file *filp, int on) + * FIXME: may race normal receive processing + */ + ++int tiocsti_restrict = IS_ENABLED(CONFIG_SECURITY_TIOCSTI_RESTRICT); ++ + static int tiocsti(struct tty_struct *tty, char __user *p) + { + char ch, mbz = 0; + struct tty_ldisc *ld; + ++ if (tiocsti_restrict && ++ !ns_capable(tty->owner_user_ns, CAP_SYS_ADMIN)) { ++ dev_warn_ratelimited(tty->dev, ++ "Denied TIOCSTI ioctl for non-privileged process\n"); ++ return -EPERM; ++ } + if ((current->signal->tty != tty) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + if (get_user(ch, p)) +@@ -2863,6 +2872,7 @@ struct tty_struct *alloc_tty_struct(struct tty_driver *driver, int idx) + tty->index = idx; + tty_line_name(driver, idx, tty->name); + tty->dev = tty_get_device(tty); ++ tty->owner_user_ns = get_user_ns(current_user_ns()); + + return tty; + } +diff --git a/drivers/usb/core/hub.c b/drivers/usb/core/hub.c +index cc62707c0251..21d78ae4b4ae 100644 +--- a/drivers/usb/core/hub.c ++++ b/drivers/usb/core/hub.c +@@ -41,6 +41,8 @@ + #define USB_TP_TRANSMISSION_DELAY 40 /* ns */ + #define USB_TP_TRANSMISSION_DELAY_MAX 65535 /* ns */ + ++extern int deny_new_usb; ++ + /* Protect struct usb_device->state and ->children members + * Note: Both are also protected by ->dev.sem, except that ->state can + * change to USB_STATE_NOTATTACHED even when the semaphore isn't held. */ +@@ -4933,6 +4935,12 @@ static void hub_port_connect(struct usb_hub *hub, int port1, u16 portstatus, + goto done; + return; + } ++ ++ if (deny_new_usb) { ++ dev_err(&port_dev->dev, "denied insert of USB device on port %d\n", port1); ++ goto done; ++ } ++ + if (hub_is_superspeed(hub->hdev)) + unit_load = 150; + else +diff --git a/fs/exec.c b/fs/exec.c +index 1ebf6e5a521d..73b8d839927c 100644 +--- a/fs/exec.c ++++ b/fs/exec.c +@@ -62,6 +62,7 @@ + #include <linux/oom.h> + #include <linux/compat.h> + #include <linux/vmalloc.h> ++#include <linux/random.h> + + #include <linux/uaccess.h> + #include <asm/mmu_context.h> +@@ -320,6 +321,8 @@ static int __bprm_mm_init(struct linux_binprm *bprm) + arch_bprm_mm_init(mm, vma); + up_write(&mm->mmap_sem); + bprm->p = vma->vm_end - sizeof(void *); ++ if (randomize_va_space) ++ bprm->p ^= get_random_int() & ~PAGE_MASK; + return 0; + err: + up_write(&mm->mmap_sem); +diff --git a/fs/namei.c b/fs/namei.c +index 914178cdbe94..7422b5ce077a 100644 +--- a/fs/namei.c ++++ b/fs/namei.c +@@ -885,10 +885,10 @@ static inline void put_link(struct nameidata *nd) + path_put(&last->link); + } + +-int sysctl_protected_symlinks __read_mostly = 0; +-int sysctl_protected_hardlinks __read_mostly = 0; +-int sysctl_protected_fifos __read_mostly; +-int sysctl_protected_regular __read_mostly; ++int sysctl_protected_symlinks __read_mostly = 1; ++int sysctl_protected_hardlinks __read_mostly = 1; ++int sysctl_protected_fifos __read_mostly = 2; ++int sysctl_protected_regular __read_mostly = 2; + + /** + * may_follow_link - Check symlink following for unsafe situations +diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig +index 5f93cfacb3d1..cea0d7d3b23e 100644 +--- a/fs/nfs/Kconfig ++++ b/fs/nfs/Kconfig +@@ -195,4 +195,3 @@ config NFS_DEBUG + bool + depends on NFS_FS && SUNRPC_DEBUG + select CRC32 +- default y +diff --git a/fs/proc/Kconfig b/fs/proc/Kconfig +index 817c02b13b1d..b8cd62b5cbc3 100644 +--- a/fs/proc/Kconfig ++++ b/fs/proc/Kconfig +@@ -40,7 +40,6 @@ config PROC_KCORE + config PROC_VMCORE + bool "/proc/vmcore support" + depends on PROC_FS && CRASH_DUMP +- default y + help + Exports the dump image of crashed kernel in ELF format. + +diff --git a/fs/stat.c b/fs/stat.c +index f8e6fb2c3657..240c1432e18f 100644 +--- a/fs/stat.c ++++ b/fs/stat.c +@@ -40,8 +40,13 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) + stat->gid = inode->i_gid; + stat->rdev = inode->i_rdev; + stat->size = i_size_read(inode); +- stat->atime = inode->i_atime; +- stat->mtime = inode->i_mtime; ++ if (is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = inode->i_ctime; ++ stat->mtime = inode->i_ctime; ++ } else { ++ stat->atime = inode->i_atime; ++ stat->mtime = inode->i_mtime; ++ } + stat->ctime = inode->i_ctime; + stat->blksize = i_blocksize(inode); + stat->blocks = inode->i_blocks; +@@ -75,9 +80,14 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, + stat->result_mask |= STATX_BASIC_STATS; + request_mask &= STATX_ALL; + query_flags &= KSTAT_QUERY_FLAGS; +- if (inode->i_op->getattr) +- return inode->i_op->getattr(path, stat, request_mask, +- query_flags); ++ if (inode->i_op->getattr) { ++ int retval = inode->i_op->getattr(path, stat, request_mask, query_flags); ++ if (!retval && is_sidechannel_device(inode) && !capable_noaudit(CAP_MKNOD)) { ++ stat->atime = stat->ctime; ++ stat->mtime = stat->ctime; ++ } ++ return retval; ++ } + + generic_fillattr(inode, stat); + return 0; +diff --git a/include/linux/cache.h b/include/linux/cache.h +index 750621e41d1c..e7157c18c62c 100644 +--- a/include/linux/cache.h ++++ b/include/linux/cache.h +@@ -31,6 +31,8 @@ + #define __ro_after_init __attribute__((__section__(".data..ro_after_init"))) + #endif + ++#define __read_only __ro_after_init ++ + #ifndef ____cacheline_aligned + #define ____cacheline_aligned __attribute__((__aligned__(SMP_CACHE_BYTES))) + #endif +diff --git a/include/linux/capability.h b/include/linux/capability.h +index f640dcbc880c..2b4f5d651f19 100644 +--- a/include/linux/capability.h ++++ b/include/linux/capability.h +@@ -207,6 +207,7 @@ extern bool has_capability_noaudit(struct task_struct *t, int cap); + extern bool has_ns_capability_noaudit(struct task_struct *t, + struct user_namespace *ns, int cap); + extern bool capable(int cap); ++extern bool capable_noaudit(int cap); + extern bool ns_capable(struct user_namespace *ns, int cap); + extern bool ns_capable_noaudit(struct user_namespace *ns, int cap); + #else +@@ -232,6 +233,10 @@ static inline bool capable(int cap) + { + return true; + } ++static inline bool capable_noaudit(int cap) ++{ ++ return true; ++} + static inline bool ns_capable(struct user_namespace *ns, int cap) + { + return true; +diff --git a/include/linux/fs.h b/include/linux/fs.h +index 7b6084854bfe..cee4467da4a7 100644 +--- a/include/linux/fs.h ++++ b/include/linux/fs.h +@@ -3456,4 +3456,15 @@ extern void inode_nohighmem(struct inode *inode); + extern int vfs_fadvise(struct file *file, loff_t offset, loff_t len, + int advice); + ++extern int device_sidechannel_restrict; ++ ++static inline bool is_sidechannel_device(const struct inode *inode) ++{ ++ umode_t mode; ++ if (!device_sidechannel_restrict) ++ return false; ++ mode = inode->i_mode; ++ return ((S_ISCHR(mode) || S_ISBLK(mode)) && (mode & (S_IROTH | S_IWOTH))); ++} ++ + #endif /* _LINUX_FS_H */ +diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h +index fd1ce10553bf..1905d2476d32 100644 +--- a/include/linux/fsnotify.h ++++ b/include/linux/fsnotify.h +@@ -177,6 +177,9 @@ static inline void fsnotify_access(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_ACCESS; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +@@ -195,6 +198,9 @@ static inline void fsnotify_modify(struct file *file) + struct inode *inode = file_inode(file); + __u32 mask = FS_MODIFY; + ++ if (is_sidechannel_device(inode)) ++ return; ++ + if (S_ISDIR(inode->i_mode)) + mask |= FS_ISDIR; + +diff --git a/include/linux/gfp.h b/include/linux/gfp.h +index 24bcc5eec6b4..b1cdfc350596 100644 +--- a/include/linux/gfp.h ++++ b/include/linux/gfp.h +@@ -530,9 +530,9 @@ extern struct page *alloc_pages_vma(gfp_t gfp_mask, int order, + extern unsigned long __get_free_pages(gfp_t gfp_mask, unsigned int order); + extern unsigned long get_zeroed_page(gfp_t gfp_mask); + +-void *alloc_pages_exact(size_t size, gfp_t gfp_mask); ++void *alloc_pages_exact(size_t size, gfp_t gfp_mask) __attribute__((alloc_size(1))); + void free_pages_exact(void *virt, size_t size); +-void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask); ++void * __meminit alloc_pages_exact_nid(int nid, size_t size, gfp_t gfp_mask) __attribute__((alloc_size(2))); + + #define __get_free_page(gfp_mask) \ + __get_free_pages((gfp_mask), 0) +diff --git a/include/linux/highmem.h b/include/linux/highmem.h +index 0690679832d4..b9394bc86fad 100644 +--- a/include/linux/highmem.h ++++ b/include/linux/highmem.h +@@ -191,6 +191,13 @@ static inline void clear_highpage(struct page *page) + kunmap_atomic(kaddr); + } + ++static inline void verify_zero_highpage(struct page *page) ++{ ++ void *kaddr = kmap_atomic(page); ++ BUG_ON(memchr_inv(kaddr, 0, PAGE_SIZE)); ++ kunmap_atomic(kaddr); ++} ++ + static inline void zero_user_segments(struct page *page, + unsigned start1, unsigned end1, + unsigned start2, unsigned end2) +diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h +index eeceac3376fc..78ad558bce5f 100644 +--- a/include/linux/interrupt.h ++++ b/include/linux/interrupt.h +@@ -490,7 +490,7 @@ extern const char * const softirq_to_name[NR_SOFTIRQS]; + + struct softirq_action + { +- void (*action)(struct softirq_action *); ++ void (*action)(void); + }; + + asmlinkage void do_softirq(void); +@@ -505,7 +505,7 @@ static inline void do_softirq_own_stack(void) + } + #endif + +-extern void open_softirq(int nr, void (*action)(struct softirq_action *)); ++extern void __init open_softirq(int nr, void (*action)(void)); + extern void softirq_init(void); + extern void __raise_softirq_irqoff(unsigned int nr); + +diff --git a/include/linux/kobject_ns.h b/include/linux/kobject_ns.h +index 069aa2ebef90..cb9e3637a620 100644 +--- a/include/linux/kobject_ns.h ++++ b/include/linux/kobject_ns.h +@@ -45,7 +45,7 @@ struct kobj_ns_type_operations { + void (*drop_ns)(void *); + }; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops); ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops); + int kobj_ns_type_registered(enum kobj_ns_type type); + const struct kobj_ns_type_operations *kobj_child_ns_ops(struct kobject *parent); + const struct kobj_ns_type_operations *kobj_ns_ops(struct kobject *kobj); +diff --git a/include/linux/mm.h b/include/linux/mm.h +index e899460f1bc5..bca0cbed3269 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -571,7 +571,7 @@ static inline int is_vmalloc_or_module_addr(const void *x) + } + #endif + +-extern void *kvmalloc_node(size_t size, gfp_t flags, int node); ++extern void *kvmalloc_node(size_t size, gfp_t flags, int node) __attribute__((alloc_size(1))); + static inline void *kvmalloc(size_t size, gfp_t flags) + { + return kvmalloc_node(size, flags, NUMA_NO_NODE); +diff --git a/include/linux/percpu.h b/include/linux/percpu.h +index 70b7123f38c7..09f3019489b2 100644 +--- a/include/linux/percpu.h ++++ b/include/linux/percpu.h +@@ -129,7 +129,7 @@ extern int __init pcpu_page_first_chunk(size_t reserved_size, + pcpu_fc_populate_pte_fn_t populate_pte_fn); + #endif + +-extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_reserved_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern bool __is_kernel_percpu_address(unsigned long addr, unsigned long *can_addr); + extern bool is_kernel_percpu_address(unsigned long addr); + +@@ -137,8 +137,8 @@ extern bool is_kernel_percpu_address(unsigned long addr); + extern void __init setup_per_cpu_areas(void); + #endif + +-extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp); +-extern void __percpu *__alloc_percpu(size_t size, size_t align); ++extern void __percpu *__alloc_percpu_gfp(size_t size, size_t align, gfp_t gfp) __attribute__((alloc_size(1))); ++extern void __percpu *__alloc_percpu(size_t size, size_t align) __attribute__((alloc_size(1))); + extern void free_percpu(void __percpu *__pdata); + extern phys_addr_t per_cpu_ptr_to_phys(void *addr); + +diff --git a/include/linux/perf_event.h b/include/linux/perf_event.h +index 53c500f0ca79..15c236b8aba3 100644 +--- a/include/linux/perf_event.h ++++ b/include/linux/perf_event.h +@@ -1179,6 +1179,11 @@ extern int perf_cpu_time_max_percent_handler(struct ctl_table *table, int write, + int perf_event_max_stack_handler(struct ctl_table *table, int write, + void __user *buffer, size_t *lenp, loff_t *ppos); + ++static inline bool perf_paranoid_any(void) ++{ ++ return sysctl_perf_event_paranoid > 2; ++} ++ + static inline bool perf_paranoid_tracepoint_raw(void) + { + return sysctl_perf_event_paranoid > -1; +diff --git a/include/linux/slab.h b/include/linux/slab.h +index ed9cbddeb4a6..e76e18c7165f 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -178,8 +178,8 @@ void memcg_destroy_kmem_caches(struct mem_cgroup *); + /* + * Common kmalloc functions provided by all allocators + */ +-void * __must_check __krealloc(const void *, size_t, gfp_t); +-void * __must_check krealloc(const void *, size_t, gfp_t); ++void * __must_check __krealloc(const void *, size_t, gfp_t) __attribute__((alloc_size(2))); ++void * __must_check krealloc(const void *, size_t, gfp_t) __attribute((alloc_size(2))); + void kfree(const void *); + void kzfree(const void *); + size_t ksize(const void *); +@@ -352,7 +352,7 @@ static __always_inline unsigned int kmalloc_index(size_t size) + } + #endif /* !CONFIG_SLOB */ + +-void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc; ++void *__kmalloc(size_t size, gfp_t flags) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc(struct kmem_cache *, gfp_t flags) __assume_slab_alignment __malloc; + void kmem_cache_free(struct kmem_cache *, void *); + +@@ -376,7 +376,7 @@ static __always_inline void kfree_bulk(size_t size, void **p) + } + + #ifdef CONFIG_NUMA +-void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc; ++void *__kmalloc_node(size_t size, gfp_t flags, int node) __assume_kmalloc_alignment __malloc __attribute__((alloc_size(1))); + void *kmem_cache_alloc_node(struct kmem_cache *, gfp_t flags, int node) __assume_slab_alignment __malloc; + #else + static __always_inline void *__kmalloc_node(size_t size, gfp_t flags, int node) +@@ -498,7 +498,7 @@ static __always_inline void *kmalloc_large(size_t size, gfp_t flags) + * for general use, and so are not documented here. For a full list of + * potential flags, always refer to linux/gfp.h. + */ +-static __always_inline void *kmalloc(size_t size, gfp_t flags) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc(size_t size, gfp_t flags) + { + if (__builtin_constant_p(size)) { + if (size > KMALLOC_MAX_CACHE_SIZE) +@@ -538,7 +538,7 @@ static __always_inline unsigned int kmalloc_size(unsigned int n) + return 0; + } + +-static __always_inline void *kmalloc_node(size_t size, gfp_t flags, int node) ++static __always_inline __attribute__((alloc_size(1))) void *kmalloc_node(size_t size, gfp_t flags, int node) + { + #ifndef CONFIG_SLOB + if (__builtin_constant_p(size) && +diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h +index 3a1a1dbc6f49..ff38fec9eb76 100644 +--- a/include/linux/slub_def.h ++++ b/include/linux/slub_def.h +@@ -121,6 +121,11 @@ struct kmem_cache { + unsigned long random; + #endif + ++#ifdef CONFIG_SLAB_CANARY ++ unsigned long random_active; ++ unsigned long random_inactive; ++#endif ++ + #ifdef CONFIG_NUMA + /* + * Defragmentation by allocating from a remote node. +diff --git a/include/linux/string.h b/include/linux/string.h +index 4a5a0eb7df51..be86cf21d0ce 100644 +--- a/include/linux/string.h ++++ b/include/linux/string.h +@@ -235,10 +235,16 @@ void __read_overflow2(void) __compiletime_error("detected read beyond size of ob + void __read_overflow3(void) __compiletime_error("detected read beyond size of object passed as 3rd parameter"); + void __write_overflow(void) __compiletime_error("detected write beyond size of object passed as 1st parameter"); + ++#ifdef CONFIG_FORTIFY_SOURCE_STRICT_STRING ++#define __string_size(p) __builtin_object_size(p, 1) ++#else ++#define __string_size(p) __builtin_object_size(p, 0) ++#endif ++ + #if !defined(__NO_FORTIFY) && defined(__OPTIMIZE__) && defined(CONFIG_FORTIFY_SOURCE) + __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (__builtin_constant_p(size) && p_size < size) + __write_overflow(); + if (p_size < size) +@@ -248,7 +254,7 @@ __FORTIFY_INLINE char *strncpy(char *p, const char *q, __kernel_size_t size) + + __FORTIFY_INLINE char *strcat(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + if (p_size == (size_t)-1) + return __builtin_strcat(p, q); + if (strlcat(p, q, p_size) >= p_size) +@@ -259,7 +265,7 @@ __FORTIFY_INLINE char *strcat(char *p, const char *q) + __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + { + __kernel_size_t ret; +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + + /* Work around gcc excess stack consumption issue */ + if (p_size == (size_t)-1 || +@@ -274,7 +280,7 @@ __FORTIFY_INLINE __kernel_size_t strlen(const char *p) + extern __kernel_size_t __real_strnlen(const char *, __kernel_size_t) __RENAME(strnlen); + __FORTIFY_INLINE __kernel_size_t strnlen(const char *p, __kernel_size_t maxlen) + { +- size_t p_size = __builtin_object_size(p, 0); ++ size_t p_size = __string_size(p); + __kernel_size_t ret = __real_strnlen(p, maxlen < p_size ? maxlen : p_size); + if (p_size <= ret && maxlen != ret) + fortify_panic(__func__); +@@ -286,8 +292,8 @@ extern size_t __real_strlcpy(char *, const char *, size_t) __RENAME(strlcpy); + __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + { + size_t ret; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __real_strlcpy(p, q, size); + ret = strlen(q); +@@ -307,8 +313,8 @@ __FORTIFY_INLINE size_t strlcpy(char *p, const char *q, size_t size) + __FORTIFY_INLINE char *strncat(char *p, const char *q, __kernel_size_t count) + { + size_t p_len, copy_len; +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strncat(p, q, count); + p_len = strlen(p); +@@ -421,8 +427,8 @@ __FORTIFY_INLINE void *kmemdup(const void *p, size_t size, gfp_t gfp) + /* defined after fortified strlen and memcpy to reuse them */ + __FORTIFY_INLINE char *strcpy(char *p, const char *q) + { +- size_t p_size = __builtin_object_size(p, 0); +- size_t q_size = __builtin_object_size(q, 0); ++ size_t p_size = __string_size(p); ++ size_t q_size = __string_size(q); + if (p_size == (size_t)-1 && q_size == (size_t)-1) + return __builtin_strcpy(p, q); + memcpy(p, q, strlen(q) + 1); +diff --git a/include/linux/tty.h b/include/linux/tty.h +index 808fbfe86f85..e4429b7d6e8e 100644 +--- a/include/linux/tty.h ++++ b/include/linux/tty.h +@@ -14,6 +14,7 @@ + #include <uapi/linux/tty.h> + #include <linux/rwsem.h> + #include <linux/llist.h> ++#include <linux/user_namespace.h> + + + /* +@@ -336,6 +337,7 @@ struct tty_struct { + /* If the tty has a pending do_SAK, queue it here - akpm */ + struct work_struct SAK_work; + struct tty_port *port; ++ struct user_namespace *owner_user_ns; + } __randomize_layout; + + /* Each of a tty's open files has private_data pointing to tty_file_private */ +@@ -345,6 +347,8 @@ struct tty_file_private { + struct list_head list; + }; + ++extern int tiocsti_restrict; ++ + /* tty magic number */ + #define TTY_MAGIC 0x5401 + +diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h +index 398e9c95cd61..baab7195306a 100644 +--- a/include/linux/vmalloc.h ++++ b/include/linux/vmalloc.h +@@ -69,19 +69,19 @@ static inline void vmalloc_init(void) + } + #endif + +-extern void *vmalloc(unsigned long size); +-extern void *vzalloc(unsigned long size); +-extern void *vmalloc_user(unsigned long size); +-extern void *vmalloc_node(unsigned long size, int node); +-extern void *vzalloc_node(unsigned long size, int node); +-extern void *vmalloc_exec(unsigned long size); +-extern void *vmalloc_32(unsigned long size); +-extern void *vmalloc_32_user(unsigned long size); +-extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot); ++extern void *vmalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vzalloc(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vzalloc_node(unsigned long size, int node) __attribute__((alloc_size(1))); ++extern void *vmalloc_exec(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32(unsigned long size) __attribute__((alloc_size(1))); ++extern void *vmalloc_32_user(unsigned long size) __attribute__((alloc_size(1))); ++extern void *__vmalloc(unsigned long size, gfp_t gfp_mask, pgprot_t prot) __attribute__((alloc_size(1))); + extern void *__vmalloc_node_range(unsigned long size, unsigned long align, + unsigned long start, unsigned long end, gfp_t gfp_mask, + pgprot_t prot, unsigned long vm_flags, int node, +- const void *caller); ++ const void *caller) __attribute__((alloc_size(1))); + #ifndef CONFIG_MMU + extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); + static inline void *__vmalloc_node_flags_caller(unsigned long size, int node, +diff --git a/init/Kconfig b/init/Kconfig +index 864af10bb1b9..643bb9448bb9 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -323,6 +323,7 @@ config USELIB + config AUDIT + bool "Auditing support" + depends on NET ++ default y + help + Enable auditing infrastructure that can be used with another + kernel subsystem, such as SELinux (which requires this for +@@ -1088,6 +1089,12 @@ config CC_OPTIMIZE_FOR_SIZE + + endchoice + ++config LOCAL_INIT ++ bool "Zero uninitialized locals" ++ help ++ Zero-fill uninitialized local variables, other than variable-length ++ arrays. Requires compiler support. ++ + config HAVE_LD_DEAD_CODE_DATA_ELIMINATION + bool + help +@@ -1374,8 +1381,7 @@ config SHMEM + which may be appropriate on small systems without swap. + + config AIO +- bool "Enable AIO support" if EXPERT +- default y ++ bool "Enable AIO support" + help + This option enables POSIX asynchronous I/O which may by used + by some high performance threaded applications. Disabling +@@ -1592,7 +1598,7 @@ config VM_EVENT_COUNTERS + + config SLUB_DEBUG + default y +- bool "Enable SLUB debugging support" if EXPERT ++ bool "Enable SLUB debugging support" + depends on SLUB && SYSFS + help + SLUB has extensive debug support features. Disabling these can +@@ -1616,7 +1622,6 @@ config SLUB_MEMCG_SYSFS_ON + + config COMPAT_BRK + bool "Disable heap randomization" +- default y + help + Randomizing heap placement makes heap exploits harder, but it + also breaks ancient binaries (including anything libc5 based). +@@ -1663,7 +1668,6 @@ endchoice + + config SLAB_MERGE_DEFAULT + bool "Allow slab caches to be merged" +- default y + help + For reduced kernel memory fragmentation, slab caches can be + merged when they share the same size and other characteristics. +@@ -1676,9 +1680,9 @@ config SLAB_MERGE_DEFAULT + command line. + + config SLAB_FREELIST_RANDOM +- default n + depends on SLAB || SLUB + bool "SLAB freelist randomization" ++ default y + help + Randomizes the freelist order used on creating new pages. This + security feature reduces the predictability of the kernel slab +@@ -1687,12 +1691,56 @@ config SLAB_FREELIST_RANDOM + config SLAB_FREELIST_HARDENED + bool "Harden slab freelist metadata" + depends on SLUB ++ default y + help + Many kernel heap attacks try to target slab cache metadata and + other infrastructure. This options makes minor performance + sacrifies to harden the kernel slab allocator against common + freelist exploit methods. + ++config SLAB_HARDENED ++ default y ++ depends on SLUB ++ bool "Hardened SLAB infrastructure" ++ help ++ Make minor performance sacrifices to harden the kernel slab ++ allocator. ++ ++config SLAB_CANARY ++ depends on SLUB ++ depends on !SLAB_MERGE_DEFAULT ++ bool "SLAB canaries" ++ default y ++ help ++ Place canaries at the end of kernel slab allocations, sacrificing ++ some performance and memory usage for security. ++ ++ Canaries can detect some forms of heap corruption when allocations ++ are freed and as part of the HARDENED_USERCOPY feature. It provides ++ basic use-after-free detection for HARDENED_USERCOPY. ++ ++ Canaries absorb small overflows (rendering them harmless), mitigate ++ non-NUL terminated C string overflows on 64-bit via a guaranteed zero ++ byte and provide basic double-free detection. ++ ++config SLAB_SANITIZE ++ bool "Sanitize SLAB allocations" ++ depends on SLUB ++ default y ++ help ++ Zero fill slab allocations on free, reducing the lifetime of ++ sensitive data and helping to mitigate use-after-free bugs. ++ ++ For slabs with debug poisoning enabling, this has no impact. ++ ++config SLAB_SANITIZE_VERIFY ++ depends on SLAB_SANITIZE && PAGE_SANITIZE ++ default y ++ bool "Verify sanitized SLAB allocations" ++ help ++ Verify that newly allocated slab allocations are zeroed to detect ++ write-after-free bugs. ++ + config SLUB_CPU_PARTIAL + default y + depends on SLUB && SMP +diff --git a/kernel/audit.c b/kernel/audit.c +index 2a8058764aa6..14e7a763db43 100644 +--- a/kernel/audit.c ++++ b/kernel/audit.c +@@ -1628,6 +1628,9 @@ static int __init audit_enable(char *str) + + if (audit_default == AUDIT_OFF) + audit_initialized = AUDIT_DISABLED; ++ else if (!audit_ever_enabled) ++ audit_initialized = AUDIT_UNINITIALIZED; ++ + if (audit_set_enabled(audit_default)) + pr_err("audit: error setting audit state (%d)\n", + audit_default); +diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c +index 474525e3a9db..644a87f6ad28 100644 +--- a/kernel/bpf/core.c ++++ b/kernel/bpf/core.c +@@ -368,7 +368,7 @@ void bpf_prog_kallsyms_del_all(struct bpf_prog *fp) + #ifdef CONFIG_BPF_JIT + /* All BPF JIT sysctl knobs here. */ + int bpf_jit_enable __read_mostly = IS_BUILTIN(CONFIG_BPF_JIT_ALWAYS_ON); +-int bpf_jit_harden __read_mostly; ++int bpf_jit_harden __read_mostly = 2; + int bpf_jit_kallsyms __read_mostly; + + static __always_inline void +diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c +index 382c09dddf93..11f436e79170 100644 +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -48,7 +48,7 @@ static DEFINE_SPINLOCK(prog_idr_lock); + static DEFINE_IDR(map_idr); + static DEFINE_SPINLOCK(map_idr_lock); + +-int sysctl_unprivileged_bpf_disabled __read_mostly; ++int sysctl_unprivileged_bpf_disabled __read_mostly = 1; + + static const struct bpf_map_ops * const bpf_map_types[] = { + #define BPF_PROG_TYPE(_id, _ops) +diff --git a/kernel/capability.c b/kernel/capability.c +index 1e1c0236f55b..452062fe45ce 100644 +--- a/kernel/capability.c ++++ b/kernel/capability.c +@@ -431,6 +431,12 @@ bool capable(int cap) + return ns_capable(&init_user_ns, cap); + } + EXPORT_SYMBOL(capable); ++ ++bool capable_noaudit(int cap) ++{ ++ return ns_capable_noaudit(&init_user_ns, cap); ++} ++EXPORT_SYMBOL(capable_noaudit); + #endif /* CONFIG_MULTIUSER */ + + /** +diff --git a/kernel/events/core.c b/kernel/events/core.c +index 5a97f34bc14c..a4a4fc1e1586 100644 +--- a/kernel/events/core.c ++++ b/kernel/events/core.c +@@ -397,8 +397,13 @@ static cpumask_var_t perf_online_mask; + * 0 - disallow raw tracepoint access for unpriv + * 1 - disallow cpu events for unpriv + * 2 - disallow kernel profiling for unpriv ++ * 3 - disallow all unpriv perf event use + */ ++#ifdef CONFIG_SECURITY_PERF_EVENTS_RESTRICT ++int sysctl_perf_event_paranoid __read_mostly = 3; ++#else + int sysctl_perf_event_paranoid __read_mostly = 2; ++#endif + + /* Minimum for 512 kiB + 1 user control page */ + int sysctl_perf_event_mlock __read_mostly = 512 + (PAGE_SIZE / 1024); /* 'free' kiB per user */ +@@ -10410,6 +10415,9 @@ SYSCALL_DEFINE5(perf_event_open, + if (flags & ~PERF_FLAG_ALL) + return -EINVAL; + ++ if (perf_paranoid_any() && !capable(CAP_SYS_ADMIN)) ++ return -EACCES; ++ + err = perf_copy_attr(attr_uptr, &attr); + if (err) + return err; +diff --git a/kernel/fork.c b/kernel/fork.c +index 64ef113e387e..42d257e43e04 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -103,6 +103,11 @@ + + #define CREATE_TRACE_POINTS + #include <trace/events/task.h> ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#else ++#define unprivileged_userns_clone 0 ++#endif + + /* + * Minimum number of threads to boot the kernel +@@ -1649,6 +1654,10 @@ static __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. +@@ -2476,6 +2485,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/power/snapshot.c b/kernel/power/snapshot.c +index 3d37c279c090..0789ca413f09 100644 +--- a/kernel/power/snapshot.c ++++ b/kernel/power/snapshot.c +@@ -1138,7 +1138,7 @@ void free_basic_memory_bitmaps(void) + + void clear_free_pages(void) + { +-#ifdef CONFIG_PAGE_POISONING_ZERO ++#if defined(CONFIG_PAGE_POISONING_ZERO) || defined(CONFIG_PAGE_SANITIZE) + struct memory_bitmap *bm = free_pages_map; + unsigned long pfn; + +@@ -1155,7 +1155,7 @@ void clear_free_pages(void) + } + memory_bm_position_reset(bm); + pr_info("free pages cleared after restore\n"); +-#endif /* PAGE_POISONING_ZERO */ ++#endif /* PAGE_POISONING_ZERO || PAGE_SANITIZE */ + } + + /** +diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c +index befc9321a89c..61e19256560c 100644 +--- a/kernel/rcu/tiny.c ++++ b/kernel/rcu/tiny.c +@@ -162,7 +162,7 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp) + } + } + +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + __rcu_process_callbacks(&rcu_sched_ctrlblk); + __rcu_process_callbacks(&rcu_bh_ctrlblk); +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 15301ed19da6..2a799dea7016 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -2862,7 +2862,7 @@ __rcu_process_callbacks(struct rcu_state *rsp) + /* + * Do RCU core processing for the current CPU. + */ +-static __latent_entropy void rcu_process_callbacks(struct softirq_action *unused) ++static __latent_entropy void rcu_process_callbacks(void) + { + struct rcu_state *rsp; + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7137bc343b4a..104e0855a018 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9593,7 +9593,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) + * run_rebalance_domains is triggered when needed from the scheduler tick. + * Also triggered for nohz idle balancing (with nohz_balancing_kick set). + */ +-static __latent_entropy void run_rebalance_domains(struct softirq_action *h) ++static __latent_entropy void run_rebalance_domains(void) + { + struct rq *this_rq = this_rq(); + enum cpu_idle_type idle = this_rq->idle_balance ? +diff --git a/kernel/softirq.c b/kernel/softirq.c +index 6f584861d329..1943fe60f3b9 100644 +--- a/kernel/softirq.c ++++ b/kernel/softirq.c +@@ -53,7 +53,7 @@ DEFINE_PER_CPU_ALIGNED(irq_cpustat_t, irq_stat); + EXPORT_PER_CPU_SYMBOL(irq_stat); + #endif + +-static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp; ++static struct softirq_action softirq_vec[NR_SOFTIRQS] __ro_after_init __aligned(PAGE_SIZE); + + DEFINE_PER_CPU(struct task_struct *, ksoftirqd); + +@@ -289,7 +289,7 @@ asmlinkage __visible void __softirq_entry __do_softirq(void) + kstat_incr_softirqs_this_cpu(vec_nr); + + trace_softirq_entry(vec_nr); +- h->action(h); ++ h->action(); + trace_softirq_exit(vec_nr); + if (unlikely(prev_count != preempt_count())) { + pr_err("huh, entered softirq %u %s %p with preempt_count %08x, exited with %08x?\n", +@@ -451,7 +451,7 @@ void __raise_softirq_irqoff(unsigned int nr) + or_softirq_pending(1UL << nr); + } + +-void open_softirq(int nr, void (*action)(struct softirq_action *)) ++void __init open_softirq(int nr, void (*action)(void)) + { + softirq_vec[nr].action = action; + } +@@ -497,8 +497,7 @@ void __tasklet_hi_schedule(struct tasklet_struct *t) + } + EXPORT_SYMBOL(__tasklet_hi_schedule); + +-static void tasklet_action_common(struct softirq_action *a, +- struct tasklet_head *tl_head, ++static void tasklet_action_common(struct tasklet_head *tl_head, + unsigned int softirq_nr) + { + struct tasklet_struct *list; +@@ -535,14 +534,14 @@ static void tasklet_action_common(struct softirq_action *a, + } + } + +-static __latent_entropy void tasklet_action(struct softirq_action *a) ++static __latent_entropy void tasklet_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_vec), TASKLET_SOFTIRQ); + } + +-static __latent_entropy void tasklet_hi_action(struct softirq_action *a) ++static __latent_entropy void tasklet_hi_action(void) + { +- tasklet_action_common(a, this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); ++ tasklet_action_common(this_cpu_ptr(&tasklet_hi_vec), HI_SOFTIRQ); + } + + void tasklet_init(struct tasklet_struct *t, +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index cc02050fd0c4..cca161854186 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -67,6 +67,7 @@ + #include <linux/bpf.h> + #include <linux/mount.h> + #include <linux/pipe_fs_i.h> ++#include <linux/tty.h> + + #include <linux/uaccess.h> + #include <asm/processor.h> +@@ -99,12 +100,19 @@ + #if defined(CONFIG_SYSCTL) + + /* External variables not in a header file. */ ++#if IS_ENABLED(CONFIG_USB) ++int deny_new_usb __read_mostly = 0; ++EXPORT_SYMBOL(deny_new_usb); ++#endif + extern int suid_dumpable; + #ifdef CONFIG_COREDUMP + extern int core_uses_pid; + extern char core_pattern[]; + extern unsigned int core_pipe_limit; + #endif ++#ifdef CONFIG_USER_NS ++extern int unprivileged_userns_clone; ++#endif + extern int pid_max; + extern int pid_max_min, pid_max_max; + extern int percpu_pagelist_fraction; +@@ -116,33 +124,33 @@ extern int sysctl_nr_trim_pages; + + /* Constants used for minimum and maximum */ + #ifdef CONFIG_LOCKUP_DETECTOR +-static int sixty = 60; ++static int sixty __read_only = 60; + #endif + +-static int __maybe_unused neg_one = -1; ++static int __maybe_unused neg_one __read_only = -1; + + static int zero; +-static int __maybe_unused one = 1; +-static int __maybe_unused two = 2; +-static int __maybe_unused four = 4; +-static unsigned long one_ul = 1; +-static int one_hundred = 100; +-static int one_thousand = 1000; ++static int __maybe_unused one __read_only = 1; ++static int __maybe_unused two __read_only = 2; ++static int __maybe_unused four __read_only = 4; ++static unsigned long one_ul __read_only = 1; ++static int one_hundred __read_only = 100; ++static int one_thousand __read_only = 1000; + #ifdef CONFIG_PRINTK +-static int ten_thousand = 10000; ++static int ten_thousand __read_only = 10000; + #endif + #ifdef CONFIG_PERF_EVENTS +-static int six_hundred_forty_kb = 640 * 1024; ++static int six_hundred_forty_kb __read_only = 640 * 1024; + #endif + + /* this is needed for the proc_doulongvec_minmax of vm_dirty_bytes */ +-static unsigned long dirty_bytes_min = 2 * PAGE_SIZE; ++static unsigned long dirty_bytes_min __read_only = 2 * PAGE_SIZE; + + /* this is needed for the proc_dointvec_minmax for [fs_]overflow UID and GID */ +-static int maxolduid = 65535; +-static int minolduid; ++static int maxolduid __read_only = 65535; ++static int minolduid __read_only; + +-static int ngroups_max = NGROUPS_MAX; ++static int ngroups_max __read_only = NGROUPS_MAX; + static const int cap_last_cap = CAP_LAST_CAP; + + /* +@@ -150,9 +158,12 @@ static const int cap_last_cap = CAP_LAST_CAP; + * and hung_task_check_interval_secs + */ + #ifdef CONFIG_DETECT_HUNG_TASK +-static unsigned long hung_task_timeout_max = (LONG_MAX/HZ); ++static unsigned long hung_task_timeout_max __read_only = (LONG_MAX/HZ); + #endif + ++int device_sidechannel_restrict __read_mostly = 1; ++EXPORT_SYMBOL(device_sidechannel_restrict); ++ + #ifdef CONFIG_INOTIFY_USER + #include <linux/inotify.h> + #endif +@@ -296,19 +307,19 @@ static struct ctl_table sysctl_base_table[] = { + }; + + #ifdef CONFIG_SCHED_DEBUG +-static int min_sched_granularity_ns = 100000; /* 100 usecs */ +-static int max_sched_granularity_ns = NSEC_PER_SEC; /* 1 second */ +-static int min_wakeup_granularity_ns; /* 0 usecs */ +-static int max_wakeup_granularity_ns = NSEC_PER_SEC; /* 1 second */ ++static int min_sched_granularity_ns __read_only = 100000; /* 100 usecs */ ++static int max_sched_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ ++static int min_wakeup_granularity_ns __read_only; /* 0 usecs */ ++static int max_wakeup_granularity_ns __read_only = NSEC_PER_SEC; /* 1 second */ + #ifdef CONFIG_SMP +-static int min_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; +-static int max_sched_tunable_scaling = SCHED_TUNABLESCALING_END-1; ++static int min_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_NONE; ++static int max_sched_tunable_scaling __read_only = SCHED_TUNABLESCALING_END-1; + #endif /* CONFIG_SMP */ + #endif /* CONFIG_SCHED_DEBUG */ + + #ifdef CONFIG_COMPACTION +-static int min_extfrag_threshold; +-static int max_extfrag_threshold = 1000; ++static int min_extfrag_threshold __read_only; ++static int max_extfrag_threshold __read_only = 1000; + #endif + + static struct ctl_table kern_table[] = { +@@ -514,6 +525,15 @@ static struct ctl_table kern_table[] = { + .proc_handler = proc_dointvec, + }, + #endif ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "tainted", +@@ -862,6 +882,37 @@ static struct ctl_table kern_table[] = { + .extra1 = &zero, + .extra2 = &two, + }, ++#endif ++#if defined CONFIG_TTY ++ { ++ .procname = "tiocsti_restrict", ++ .data = &tiocsti_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++#endif ++ { ++ .procname = "device_sidechannel_restrict", ++ .data = &device_sidechannel_restrict, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, ++#if IS_ENABLED(CONFIG_USB) ++ { ++ .procname = "deny_new_usb", ++ .data = &deny_new_usb, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec_minmax_sysadmin, ++ .extra1 = &zero, ++ .extra2 = &one, ++ }, + #endif + { + .procname = "ngroups_max", +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index e1a549c9e399..c560063e3a8c 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1462,7 +1462,7 @@ static void __hrtimer_run_queues(struct hrtimer_cpu_base *cpu_base, ktime_t now, + } + } + +-static __latent_entropy void hrtimer_run_softirq(struct softirq_action *h) ++static __latent_entropy void hrtimer_run_softirq(void) + { + struct hrtimer_cpu_base *cpu_base = this_cpu_ptr(&hrtimer_bases); + unsigned long flags; +diff --git a/kernel/time/timer.c b/kernel/time/timer.c +index fa49cd753dea..a16f8613282e 100644 +--- a/kernel/time/timer.c ++++ b/kernel/time/timer.c +@@ -1688,7 +1688,7 @@ static inline void __run_timers(struct timer_base *base) + /* + * This function runs timers and the timer-tq in bottom half context. + */ +-static __latent_entropy void run_timer_softirq(struct softirq_action *h) ++static __latent_entropy void run_timer_softirq(void) + { + struct timer_base *base = this_cpu_ptr(&timer_bases[BASE_STD]); + +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 923414a246e9..6b9dbc257e34 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -26,6 +26,9 @@ + #include <linux/bsearch.h> + #include <linux/sort.h> + ++/* sysctl */ ++int unprivileged_userns_clone; ++ + static struct kmem_cache *user_ns_cachep __read_mostly; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug +index 4966c4fbe7f7..7a685272c155 100644 +--- a/lib/Kconfig.debug ++++ b/lib/Kconfig.debug +@@ -950,6 +950,7 @@ endmenu # "Debug lockups and hangs" + + config PANIC_ON_OOPS + bool "Panic on Oops" ++ default y + help + Say Y here to enable the kernel to panic when it oopses. This + has the same effect as setting oops=panic on the kernel command +@@ -959,7 +960,7 @@ config PANIC_ON_OOPS + anything erroneous after an oops which could result in data + corruption or other issues. + +- Say N if unsure. ++ Say Y if unsure. + + config PANIC_ON_OOPS_VALUE + int +@@ -1328,6 +1329,7 @@ config DEBUG_BUGVERBOSE + config DEBUG_LIST + bool "Debug linked list manipulation" + depends on DEBUG_KERNEL || BUG_ON_DATA_CORRUPTION ++ default y + help + Enable this to turn on extended checks in the linked-list + walking routines. +@@ -1982,6 +1984,7 @@ config MEMTEST + config BUG_ON_DATA_CORRUPTION + bool "Trigger a BUG when data corruption is detected" + select DEBUG_LIST ++ default y + help + Select this option if the kernel should BUG when it encounters + data corruption in kernel memory structures when they get checked +@@ -2021,6 +2024,7 @@ config STRICT_DEVMEM + config IO_STRICT_DEVMEM + bool "Filter I/O access to /dev/mem" + depends on STRICT_DEVMEM ++ default y + ---help--- + If this option is disabled, you allow userspace (root) access to all + io-memory regardless of whether a driver is actively using that +diff --git a/lib/irq_poll.c b/lib/irq_poll.c +index 86a709954f5a..6f15787fcb1b 100644 +--- a/lib/irq_poll.c ++++ b/lib/irq_poll.c +@@ -75,7 +75,7 @@ void irq_poll_complete(struct irq_poll *iop) + } + EXPORT_SYMBOL(irq_poll_complete); + +-static void __latent_entropy irq_poll_softirq(struct softirq_action *h) ++static void __latent_entropy irq_poll_softirq(void) + { + struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll); + int rearm = 0, budget = irq_poll_budget; +diff --git a/lib/kobject.c b/lib/kobject.c +index 97d86dc17c42..388257c2878b 100644 +--- a/lib/kobject.c ++++ b/lib/kobject.c +@@ -978,9 +978,9 @@ EXPORT_SYMBOL_GPL(kset_create_and_add); + + + static DEFINE_SPINLOCK(kobj_ns_type_lock); +-static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES]; ++static const struct kobj_ns_type_operations *kobj_ns_ops_tbl[KOBJ_NS_TYPES] __ro_after_init; + +-int kobj_ns_type_register(const struct kobj_ns_type_operations *ops) ++int __init kobj_ns_type_register(const struct kobj_ns_type_operations *ops) + { + enum kobj_ns_type type = ops->type; + int error; +diff --git a/lib/nlattr.c b/lib/nlattr.c +index e335bcafa9e4..f6334f882b1f 100644 +--- a/lib/nlattr.c ++++ b/lib/nlattr.c +@@ -364,6 +364,8 @@ int nla_memcpy(void *dest, const struct nlattr *src, int count) + { + int minlen = min_t(int, count, nla_len(src)); + ++ BUG_ON(minlen < 0); ++ + memcpy(dest, nla_data(src), minlen); + if (count > minlen) + memset(dest + minlen, 0, count - minlen); +diff --git a/lib/vsprintf.c b/lib/vsprintf.c +index 812e59e13fe6..2c2104884c81 100644 +--- a/lib/vsprintf.c ++++ b/lib/vsprintf.c +@@ -1371,7 +1371,7 @@ char *pointer_string(char *buf, char *end, const void *ptr, + return number(buf, end, (unsigned long int)ptr, spec); + } + +-int kptr_restrict __read_mostly; ++int kptr_restrict __read_mostly = 2; + + static noinline_for_stack + char *restricted_pointer(char *buf, char *end, const void *ptr, +diff --git a/mm/Kconfig b/mm/Kconfig +index de64ea658716..8bff017856eb 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -311,7 +311,8 @@ config KSM + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" + depends on MMU +- default 4096 ++ default 32768 if ARM || (ARM64 && COMPAT) ++ default 65536 + help + This is the portion of low virtual memory which should be protected + from userspace allocation. Keeping a user from writing to low pages +diff --git a/mm/mmap.c b/mm/mmap.c +index f7cd9cb966c0..fda49841f4f2 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -229,6 +229,13 @@ SYSCALL_DEFINE1(brk, unsigned long, brk) + + newbrk = PAGE_ALIGN(brk); + oldbrk = PAGE_ALIGN(mm->brk); ++ /* properly handle unaligned min_brk as an empty heap */ ++ if (min_brk & ~PAGE_MASK) { ++ if (brk == min_brk) ++ newbrk -= PAGE_SIZE; ++ if (mm->brk == min_brk) ++ oldbrk -= PAGE_SIZE; ++ } + if (oldbrk == newbrk) + goto set_brk; + +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 9e45553cabd6..f5ec01e1498c 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -66,6 +66,7 @@ + #include <linux/ftrace.h> + #include <linux/lockdep.h> + #include <linux/nmi.h> ++#include <linux/random.h> + + #include <asm/sections.h> + #include <asm/tlbflush.h> +@@ -99,6 +100,15 @@ int _node_numa_mem_[MAX_NUMNODES]; + DEFINE_MUTEX(pcpu_drain_mutex); + DEFINE_PER_CPU(struct work_struct, pcpu_drain); + ++bool __meminitdata extra_latent_entropy; ++ ++static int __init setup_extra_latent_entropy(char *str) ++{ ++ extra_latent_entropy = true; ++ return 0; ++} ++early_param("extra_latent_entropy", setup_extra_latent_entropy); ++ + #ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY + volatile unsigned long latent_entropy __latent_entropy; + EXPORT_SYMBOL(latent_entropy); +@@ -1027,6 +1037,13 @@ static __always_inline bool free_pages_prepare(struct page *page, + debug_check_no_obj_freed(page_address(page), + PAGE_SIZE << order); + } ++ ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE)) { ++ int i; ++ for (i = 0; i < (1 << order); i++) ++ clear_highpage(page + i); ++ } ++ + arch_free_page(page, order); + kernel_poison_pages(page, 1 << order, 0); + kernel_map_pages(page, 1 << order, 0); +@@ -1267,6 +1284,21 @@ static void __init __free_pages_boot_core(struct page *page, unsigned int order) + __ClearPageReserved(p); + set_page_count(p, 0); + ++ if (extra_latent_entropy && !PageHighMem(page) && page_to_pfn(page) < 0x100000) { ++ unsigned long hash = 0; ++ size_t index, end = PAGE_SIZE * nr_pages / sizeof hash; ++ const unsigned long *data = lowmem_page_address(page); ++ ++ for (index = 0; index < end; index++) ++ hash ^= hash + data[index]; ++#ifdef CONFIG_GCC_PLUGIN_LATENT_ENTROPY ++ latent_entropy ^= hash; ++ add_device_randomness((const void *)&latent_entropy, sizeof(latent_entropy)); ++#else ++ add_device_randomness((const void *)&hash, sizeof(hash)); ++#endif ++ } ++ + page_zone(page)->managed_pages += nr_pages; + set_page_refcounted(page); + __free_pages(page, order); +@@ -1855,8 +1887,8 @@ static inline int check_new_page(struct page *page) + + static inline bool free_pages_prezeroed(void) + { +- return IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && +- page_poisoning_enabled(); ++ return IS_ENABLED(CONFIG_PAGE_SANITIZE) || ++ (IS_ENABLED(CONFIG_PAGE_POISONING_ZERO) && page_poisoning_enabled()); + } + + #ifdef CONFIG_DEBUG_VM +@@ -1913,6 +1945,11 @@ static void prep_new_page(struct page *page, unsigned int order, gfp_t gfp_flags + + post_alloc_hook(page, order, gfp_flags); + ++ if (IS_ENABLED(CONFIG_PAGE_SANITIZE_VERIFY)) { ++ for (i = 0; i < (1 << order); i++) ++ verify_zero_highpage(page + i); ++ } ++ + if (!free_pages_prezeroed() && (gfp_flags & __GFP_ZERO)) + for (i = 0; i < (1 << order); i++) + clear_highpage(page + i); +diff --git a/mm/slab.h b/mm/slab.h +index 58c6c1c2a78e..86d7a6e7ad25 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -313,7 +313,11 @@ static inline bool is_root_cache(struct kmem_cache *s) + static inline bool slab_equal_or_root(struct kmem_cache *s, + struct kmem_cache *p) + { ++#ifdef CONFIG_SLAB_HARDENED ++ return p == s; ++#else + return true; ++#endif + } + + static inline const char *cache_name(struct kmem_cache *s) +@@ -365,18 +369,26 @@ static inline struct kmem_cache *cache_from_obj(struct kmem_cache *s, void *x) + * to not do even the assignment. In that case, slab_equal_or_root + * will also be a constant. + */ +- if (!memcg_kmem_enabled() && ++ if (!IS_ENABLED(CONFIG_SLAB_HARDENED) && ++ !memcg_kmem_enabled() && + !unlikely(s->flags & SLAB_CONSISTENCY_CHECKS)) + return s; + + page = virt_to_head_page(x); ++#ifdef CONFIG_SLAB_HARDENED ++ BUG_ON(!PageSlab(page)); ++#endif + cachep = page->slab_cache; + if (slab_equal_or_root(cachep, s)) + return cachep; + + pr_err("%s: Wrong slab cache. %s but object is from %s\n", + __func__, s->name, cachep->name); ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(1); ++#else + WARN_ON_ONCE(1); ++#endif + return s; + } + +@@ -401,7 +413,7 @@ static inline size_t slab_ksize(const struct kmem_cache *s) + * back there or track user information then we can + * only use the space before that information. + */ +- if (s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) ++ if ((s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_STORE_USER)) || IS_ENABLED(CONFIG_SLAB_CANARY)) + return s->inuse; + /* + * Else we can use all the padding etc for the allocation +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 3a7ac4f15194..a567cc1807ae 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -27,10 +27,10 @@ + + #include "slab.h" + +-enum slab_state slab_state; ++enum slab_state slab_state __ro_after_init; + LIST_HEAD(slab_caches); + DEFINE_MUTEX(slab_mutex); +-struct kmem_cache *kmem_cache; ++struct kmem_cache *kmem_cache __ro_after_init; + + #ifdef CONFIG_HARDENED_USERCOPY + bool usercopy_fallback __ro_after_init = +@@ -58,7 +58,7 @@ static DECLARE_WORK(slab_caches_to_rcu_destroy_work, + /* + * Merge control. If this is set then no merging of slab caches will occur. + */ +-static bool slab_nomerge = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); ++static bool slab_nomerge __ro_after_init = !IS_ENABLED(CONFIG_SLAB_MERGE_DEFAULT); + + static int __init setup_slab_nomerge(char *str) + { +diff --git a/mm/slub.c b/mm/slub.c +index 8da34a8af53d..f05bc9ca8489 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -124,6 +124,16 @@ static inline int kmem_cache_debug(struct kmem_cache *s) + #endif + } + ++static inline bool has_sanitize(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE) && !(s->flags & (SLAB_TYPESAFE_BY_RCU | SLAB_POISON)); ++} ++ ++static inline bool has_sanitize_verify(struct kmem_cache *s) ++{ ++ return IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && has_sanitize(s); ++} ++ + void *fixup_red_left(struct kmem_cache *s, void *p) + { + if (kmem_cache_debug(s) && s->flags & SLAB_RED_ZONE) +@@ -297,6 +307,35 @@ static inline void set_freepointer(struct kmem_cache *s, void *object, void *fp) + *(void **)freeptr_addr = freelist_ptr(s, fp, freeptr_addr); + } + ++#ifdef CONFIG_SLAB_CANARY ++static inline unsigned long *get_canary(struct kmem_cache *s, void *object) ++{ ++ if (s->offset) ++ return object + s->offset + sizeof(void *); ++ return object + s->inuse; ++} ++ ++static inline unsigned long get_canary_value(const void *canary, unsigned long value) ++{ ++ return (value ^ (unsigned long)canary) & CANARY_MASK; ++} ++ ++static inline void set_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ *canary = get_canary_value(canary, value); ++} ++ ++static inline void check_canary(struct kmem_cache *s, void *object, unsigned long value) ++{ ++ unsigned long *canary = get_canary(s, object); ++ BUG_ON(*canary != get_canary_value(canary, value)); ++} ++#else ++#define set_canary(s, object, value) ++#define check_canary(s, object, value) ++#endif ++ + /* Loop over all objects in a slab */ + #define for_each_object(__p, __s, __addr, __objects) \ + for (__p = fixup_red_left(__s, __addr); \ +@@ -469,13 +508,13 @@ static inline void *restore_red_left(struct kmem_cache *s, void *p) + * Debug settings: + */ + #if defined(CONFIG_SLUB_DEBUG_ON) +-static slab_flags_t slub_debug = DEBUG_DEFAULT_FLAGS; ++static slab_flags_t slub_debug __ro_after_init = DEBUG_DEFAULT_FLAGS; + #else +-static slab_flags_t slub_debug; ++static slab_flags_t slub_debug __ro_after_init; + #endif + +-static char *slub_debug_slabs; +-static int disable_higher_order_debug; ++static char *slub_debug_slabs __ro_after_init; ++static int disable_higher_order_debug __ro_after_init; + + /* + * slub is about to manipulate internal object metadata. This memory lies +@@ -535,6 +574,9 @@ static struct track *get_track(struct kmem_cache *s, void *object, + else + p = object + s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ p = (void *)p + sizeof(void *); ++ + return p + alloc; + } + +@@ -674,6 +716,9 @@ static void print_trailer(struct kmem_cache *s, struct page *page, u8 *p) + else + off = s->inuse; + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + off += 2 * sizeof(struct track); + +@@ -803,6 +848,9 @@ static int check_pad_bytes(struct kmem_cache *s, struct page *page, u8 *p) + /* Freepointer is placed after the object. */ + off += sizeof(void *); + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ off += sizeof(void *); ++ + if (s->flags & SLAB_STORE_USER) + /* We also have user information there */ + off += 2 * sizeof(struct track); +@@ -1417,8 +1465,9 @@ static void setup_object(struct kmem_cache *s, struct page *page, + void *object) + { + setup_object_debug(s, page, object); ++ set_canary(s, object, s->random_inactive); + kasan_init_slab_obj(s, object); +- if (unlikely(s->ctor)) { ++ if (unlikely(s->ctor) && !has_sanitize_verify(s)) { + kasan_unpoison_object_data(s, object); + s->ctor(object); + kasan_poison_object_data(s, object); +@@ -2700,9 +2749,21 @@ static __always_inline void *slab_alloc_node(struct kmem_cache *s, + stat(s, ALLOC_FASTPATH); + } + +- if (unlikely(gfpflags & __GFP_ZERO) && object) ++ if (has_sanitize_verify(s) && object) { ++ size_t offset = s->offset ? 0 : sizeof(void *); ++ BUG_ON(memchr_inv(object + offset, 0, s->object_size - offset)); ++ if (s->ctor) ++ s->ctor(object); ++ if (unlikely(gfpflags & __GFP_ZERO) && offset) ++ memset(object, 0, sizeof(void *)); ++ } else if (unlikely(gfpflags & __GFP_ZERO) && object) + memset(object, 0, s->object_size); + ++ if (object) { ++ check_canary(s, object, s->random_inactive); ++ set_canary(s, object, s->random_active); ++ } ++ + slab_post_alloc_hook(s, gfpflags, 1, &object); + + return object; +@@ -2909,6 +2970,27 @@ static __always_inline void do_slab_free(struct kmem_cache *s, + void *tail_obj = tail ? : head; + struct kmem_cache_cpu *c; + unsigned long tid; ++ bool sanitize = has_sanitize(s); ++ ++ if (IS_ENABLED(CONFIG_SLAB_CANARY) || sanitize) { ++ __maybe_unused int offset = s->offset ? 0 : sizeof(void *); ++ void *x = head; ++ ++ while (1) { ++ check_canary(s, x, s->random_active); ++ set_canary(s, x, s->random_inactive); ++ ++ if (sanitize) { ++ memset(x + offset, 0, s->object_size - offset); ++ if (!IS_ENABLED(CONFIG_SLAB_SANITIZE_VERIFY) && s->ctor) ++ s->ctor(x); ++ } ++ if (x == tail_obj) ++ break; ++ x = get_freepointer(s, x); ++ } ++ } ++ + redo: + /* + * Determine the currently cpus per cpu slab. +@@ -3085,7 +3167,7 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + void **p) + { + struct kmem_cache_cpu *c; +- int i; ++ int i, k; + + /* memcg and kmem_cache debug support */ + s = slab_pre_alloc_hook(s, flags); +@@ -3122,13 +3204,29 @@ int kmem_cache_alloc_bulk(struct kmem_cache *s, gfp_t flags, size_t size, + local_irq_enable(); + + /* Clear memory outside IRQ disabled fastpath loop */ +- if (unlikely(flags & __GFP_ZERO)) { ++ if (has_sanitize_verify(s)) { ++ int j; ++ ++ for (j = 0; j < i; j++) { ++ size_t offset = s->offset ? 0 : sizeof(void *); ++ BUG_ON(memchr_inv(p[j] + offset, 0, s->object_size - offset)); ++ if (s->ctor) ++ s->ctor(p[j]); ++ if (unlikely(flags & __GFP_ZERO) && offset) ++ memset(p[j], 0, sizeof(void *)); ++ } ++ } else if (unlikely(flags & __GFP_ZERO)) { + int j; + + for (j = 0; j < i; j++) + memset(p[j], 0, s->object_size); + } + ++ for (k = 0; k < i; k++) { ++ check_canary(s, p[k], s->random_inactive); ++ set_canary(s, p[k], s->random_active); ++ } ++ + /* memcg and kmem_cache debug support */ + slab_post_alloc_hook(s, flags, size, p); + return i; +@@ -3160,9 +3258,9 @@ EXPORT_SYMBOL(kmem_cache_alloc_bulk); + * and increases the number of allocations possible without having to + * take the list_lock. + */ +-static unsigned int slub_min_order; +-static unsigned int slub_max_order = PAGE_ALLOC_COSTLY_ORDER; +-static unsigned int slub_min_objects; ++static unsigned int slub_min_order __ro_after_init; ++static unsigned int slub_max_order __ro_after_init = PAGE_ALLOC_COSTLY_ORDER; ++static unsigned int slub_min_objects __ro_after_init; + + /* + * Calculate the order of allocation given an slab object size. +@@ -3334,6 +3432,7 @@ static void early_kmem_cache_node_alloc(int node) + init_object(kmem_cache_node, n, SLUB_RED_ACTIVE); + init_tracking(kmem_cache_node, n); + #endif ++ set_canary(kmem_cache_node, n, kmem_cache_node->random_active); + kasan_kmalloc(kmem_cache_node, n, sizeof(struct kmem_cache_node), + GFP_KERNEL); + init_kmem_cache_node(n); +@@ -3490,6 +3589,9 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) + size += sizeof(void *); + } + ++ if (IS_ENABLED(CONFIG_SLAB_CANARY)) ++ size += sizeof(void *); ++ + #ifdef CONFIG_SLUB_DEBUG + if (flags & SLAB_STORE_USER) + /* +@@ -3559,6 +3661,10 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) + #ifdef CONFIG_SLAB_FREELIST_HARDENED + s->random = get_random_long(); + #endif ++#ifdef CONFIG_SLAB_CANARY ++ s->random_active = get_random_long(); ++ s->random_inactive = get_random_long(); ++#endif + + if (!calculate_sizes(s, -1)) + goto error; +@@ -3835,6 +3941,8 @@ void __check_heap_object(const void *ptr, unsigned long n, struct page *page, + offset -= s->red_left_pad; + } + ++ check_canary(s, (void *)ptr - offset, s->random_active); ++ + /* Allow address range falling entirely within usercopy region. */ + if (offset >= s->useroffset && + offset - s->useroffset <= s->usersize && +@@ -3868,7 +3976,11 @@ static size_t __ksize(const void *object) + page = virt_to_head_page(object); + + if (unlikely(!PageSlab(page))) { ++#ifdef CONFIG_BUG_ON_DATA_CORRUPTION ++ BUG_ON(!PageCompound(page)); ++#else + WARN_ON(!PageCompound(page)); ++#endif + return PAGE_SIZE << compound_order(page); + } + +@@ -4728,7 +4840,7 @@ enum slab_stat_type { + #define SO_TOTAL (1 << SL_TOTAL) + + #ifdef CONFIG_MEMCG +-static bool memcg_sysfs_enabled = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); ++static bool memcg_sysfs_enabled __ro_after_init = IS_ENABLED(CONFIG_SLUB_MEMCG_SYSFS_ON); + + static int __init setup_slub_memcg_sysfs(char *str) + { +diff --git a/mm/swap.c b/mm/swap.c +index 26fc9b5f1b6c..7c9312ca8982 100644 +--- a/mm/swap.c ++++ b/mm/swap.c +@@ -93,6 +93,13 @@ static void __put_compound_page(struct page *page) + if (!PageHuge(page)) + __page_cache_release(page); + dtor = get_compound_page_dtor(page); ++ if (!PageHuge(page)) ++ BUG_ON(dtor != free_compound_page ++#ifdef CONFIG_TRANSPARENT_HUGEPAGE ++ && dtor != free_transhuge_page ++#endif ++ ); ++ + (*dtor)(page); + } + +diff --git a/net/core/dev.c b/net/core/dev.c +index af097ca9cb4f..fda1753e5b65 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -4519,7 +4519,7 @@ int netif_rx_ni(struct sk_buff *skb) + } + EXPORT_SYMBOL(netif_rx_ni); + +-static __latent_entropy void net_tx_action(struct softirq_action *h) ++static __latent_entropy void net_tx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + +@@ -6302,7 +6302,7 @@ static int napi_poll(struct napi_struct *n, struct list_head *repoll) + return work; + } + +-static __latent_entropy void net_rx_action(struct softirq_action *h) ++static __latent_entropy void net_rx_action(void) + { + struct softnet_data *sd = this_cpu_ptr(&softnet_data); + unsigned long time_limit = jiffies + +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 32cae39cdff6..9141d7ae99b2 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -266,6 +266,7 @@ config IP_PIMSM_V2 + + config SYN_COOKIES + bool "IP: TCP syncookie support" ++ default y + ---help--- + Normal TCP/IP networking is open to an attack known as "SYN + flooding". This denial-of-service attack prevents legitimate remote +diff --git a/scripts/gcc-plugins/Kconfig b/scripts/gcc-plugins/Kconfig +index cb0c889e13aa..305f52f58c1a 100644 +--- a/scripts/gcc-plugins/Kconfig ++++ b/scripts/gcc-plugins/Kconfig +@@ -59,6 +59,11 @@ config GCC_PLUGIN_LATENT_ENTROPY + is some slowdown of the boot process (about 0.5%) and fork and + irq processing. + ++ When extra_latent_entropy is passed on the kernel command line, ++ entropy will be extracted from up to the first 4GB of RAM while the ++ runtime memory allocator is being initialized. This costs even more ++ slowdown of the boot process. ++ + Note that entropy extracted this way is not cryptographically + secure! + +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 5a5b3780456f..01eac2c6e7eb 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -35,6 +35,7 @@ static int vmlinux_section_warnings = 1; + static int warn_unresolved = 0; + /* How a symbol is exported */ + static int sec_mismatch_count = 0; ++static int writable_fptr_count = 0; + static int sec_mismatch_verbose = 1; + static int sec_mismatch_fatal = 0; + /* ignore missing files */ +@@ -954,6 +955,7 @@ enum mismatch { + ANY_EXIT_TO_ANY_INIT, + EXPORT_TO_INIT_EXIT, + EXTABLE_TO_NON_TEXT, ++ DATA_TO_TEXT + }; + + /** +@@ -1080,6 +1082,12 @@ static const struct sectioncheck sectioncheck[] = { + .good_tosec = {ALL_TEXT_SECTIONS , NULL}, + .mismatch = EXTABLE_TO_NON_TEXT, + .handler = extable_mismatch_handler, ++}, ++/* Do not reference code from writable data */ ++{ ++ .fromsec = { DATA_SECTIONS, NULL }, ++ .bad_tosec = { ALL_TEXT_SECTIONS, NULL }, ++ .mismatch = DATA_TO_TEXT + } + }; + +@@ -1229,10 +1237,10 @@ static Elf_Sym *find_elf_symbol(struct elf_info *elf, Elf64_Sword addr, + continue; + if (ELF_ST_TYPE(sym->st_info) == STT_SECTION) + continue; +- if (sym->st_value == addr) +- return sym; + /* Find a symbol nearby - addr are maybe negative */ + d = sym->st_value - addr; ++ if (d == 0) ++ return sym; + if (d < 0) + d = addr - sym->st_value; + if (d < distance) { +@@ -1391,7 +1399,11 @@ static void report_sec_mismatch(const char *modname, + char *prl_from; + char *prl_to; + +- sec_mismatch_count++; ++ if (mismatch->mismatch == DATA_TO_TEXT) ++ writable_fptr_count++; ++ else ++ sec_mismatch_count++; ++ + if (!sec_mismatch_verbose) + return; + +@@ -1515,6 +1527,14 @@ static void report_sec_mismatch(const char *modname, + fatal("There's a special handler for this mismatch type, " + "we should never get here."); + break; ++ case DATA_TO_TEXT: ++#if 0 ++ fprintf(stderr, ++ "The %s %s:%s references\n" ++ "the %s %s:%s%s\n", ++ from, fromsec, fromsym, to, tosec, tosym, to_p); ++#endif ++ break; + } + fprintf(stderr, "\n"); + } +@@ -2526,6 +2546,14 @@ int main(int argc, char **argv) + } + } + free(buf.p); ++ if (writable_fptr_count) { ++ if (!sec_mismatch_verbose) { ++ warn("modpost: Found %d writable function pointer(s).\n" ++ "To see full details build your kernel with:\n" ++ "'make CONFIG_DEBUG_SECTION_MISMATCH=y'\n", ++ writable_fptr_count); ++ } ++ } + + return err; + } +diff --git a/security/Kconfig b/security/Kconfig +index d9aa521b5206..a921713b76ec 100644 +--- a/security/Kconfig ++++ b/security/Kconfig +@@ -8,7 +8,7 @@ source security/keys/Kconfig + + config SECURITY_DMESG_RESTRICT + bool "Restrict unprivileged access to the kernel syslog" +- default n ++ default y + help + This enforces restrictions on unprivileged users reading the kernel + syslog via dmesg(8). +@@ -18,10 +18,34 @@ config SECURITY_DMESG_RESTRICT + + If you are unsure how to answer this question, answer N. + ++config SECURITY_PERF_EVENTS_RESTRICT ++ bool "Restrict unprivileged use of performance events" ++ depends on PERF_EVENTS ++ default y ++ help ++ If you say Y here, the kernel.perf_event_paranoid sysctl ++ will be set to 3 by default, and no unprivileged use of the ++ perf_event_open syscall will be permitted unless it is ++ changed. ++ ++config SECURITY_TIOCSTI_RESTRICT ++ bool "Restrict unprivileged use of tiocsti command injection" ++ default y ++ help ++ This enforces restrictions on unprivileged users injecting commands ++ into other processes which share a tty session using the TIOCSTI ++ ioctl. This option makes TIOCSTI use require CAP_SYS_ADMIN. ++ ++ If this option is not selected, no restrictions will be enforced ++ unless the tiocsti_restrict sysctl is explicitly set to (1). ++ ++ If you are unsure how to answer this question, answer N. ++ + config SECURITY + bool "Enable different security models" + depends on SYSFS + depends on MULTIUSER ++ default y + help + This allows you to choose different security modules to be + configured into your kernel. +@@ -48,6 +72,7 @@ config SECURITYFS + config SECURITY_NETWORK + bool "Socket and Networking Security Hooks" + depends on SECURITY ++ default y + help + This enables the socket and networking security hooks. + If enabled, a security module can use these hooks to +@@ -154,6 +179,7 @@ config HARDENED_USERCOPY + bool "Harden memory copies between kernel and userspace" + depends on HAVE_HARDENED_USERCOPY_ALLOCATOR + imply STRICT_DEVMEM ++ default y + help + This option checks for obviously wrong memory regions when + copying memory to/from the kernel (via copy_to_user() and +@@ -166,7 +192,6 @@ config HARDENED_USERCOPY + config HARDENED_USERCOPY_FALLBACK + bool "Allow usercopy whitelist violations to fallback to object size" + depends on HARDENED_USERCOPY +- default y + help + This is a temporary option that allows missing usercopy whitelists + to be discovered via a WARN() to the kernel log, instead of +@@ -191,10 +216,36 @@ config HARDENED_USERCOPY_PAGESPAN + config FORTIFY_SOURCE + bool "Harden common str/mem functions against buffer overflows" + depends on ARCH_HAS_FORTIFY_SOURCE ++ default y + help + Detect overflows of buffers in common string and memory functions + where the compiler can determine and validate the buffer sizes. + ++config FORTIFY_SOURCE_STRICT_STRING ++ bool "Harden common functions against buffer overflows" ++ depends on FORTIFY_SOURCE ++ depends on EXPERT ++ help ++ Perform stricter overflow checks catching overflows within objects ++ for common C string functions rather than only between objects. ++ ++ This is not yet intended for production use, only bug finding. ++ ++config PAGE_SANITIZE ++ bool "Sanitize pages" ++ default y ++ help ++ Zero fill page allocations on free, reducing the lifetime of ++ sensitive data and helping to mitigate use-after-free bugs. ++ ++config PAGE_SANITIZE_VERIFY ++ bool "Verify sanitized pages" ++ depends on PAGE_SANITIZE ++ default y ++ help ++ Verify that newly allocated pages are zeroed to detect ++ write-after-free bugs. ++ + config STATIC_USERMODEHELPER + bool "Force all usermode helper calls through a single binary" + help +diff --git a/security/selinux/Kconfig b/security/selinux/Kconfig +index 8af7a690eb40..6539694b0fd3 100644 +--- a/security/selinux/Kconfig ++++ b/security/selinux/Kconfig +@@ -2,7 +2,7 @@ config SECURITY_SELINUX + bool "NSA SELinux Support" + depends on SECURITY_NETWORK && AUDIT && NET && INET + select NETWORK_SECMARK +- default n ++ default y + help + This selects NSA Security-Enhanced Linux (SELinux). + You will also need a policy configuration and a labeled filesystem. +@@ -79,23 +79,3 @@ config SECURITY_SELINUX_AVC_STATS + This option collects access vector cache statistics to + /selinux/avc/cache_stats, which may be monitored via + tools such as avcstat. +- +-config SECURITY_SELINUX_CHECKREQPROT_VALUE +- int "NSA SELinux checkreqprot default value" +- depends on SECURITY_SELINUX +- range 0 1 +- default 0 +- help +- This option sets the default value for the 'checkreqprot' flag +- that determines whether SELinux checks the protection requested +- by the application or the protection that will be applied by the +- kernel (including any implied execute for read-implies-exec) for +- mmap and mprotect calls. If this option is set to 0 (zero), +- SELinux will default to checking the protection that will be applied +- by the kernel. If this option is set to 1 (one), SELinux will +- default to checking the protection requested by the application. +- The checkreqprot flag may be changed from the default via the +- 'checkreqprot=' boot parameter. It may also be changed at runtime +- via /selinux/checkreqprot if authorized by policy. +- +- If you are unsure how to answer this question, answer 0. +diff --git a/security/selinux/hooks.c b/security/selinux/hooks.c +index 3c3878f0d2fa..553e52f19f28 100644 +--- a/security/selinux/hooks.c ++++ b/security/selinux/hooks.c +@@ -135,18 +135,7 @@ __setup("selinux=", selinux_enabled_setup); + int selinux_enabled = 1; + #endif + +-static unsigned int selinux_checkreqprot_boot = +- CONFIG_SECURITY_SELINUX_CHECKREQPROT_VALUE; +- +-static int __init checkreqprot_setup(char *str) +-{ +- unsigned long checkreqprot; +- +- if (!kstrtoul(str, 0, &checkreqprot)) +- selinux_checkreqprot_boot = checkreqprot ? 1 : 0; +- return 1; +-} +-__setup("checkreqprot=", checkreqprot_setup); ++static const unsigned int selinux_checkreqprot_boot; + + static struct kmem_cache *sel_inode_cache; + static struct kmem_cache *file_security_cache; +diff --git a/security/selinux/selinuxfs.c b/security/selinux/selinuxfs.c +index f3a5a138a096..d95f3c5fe6f0 100644 +--- a/security/selinux/selinuxfs.c ++++ b/security/selinux/selinuxfs.c +@@ -640,7 +640,6 @@ static ssize_t sel_read_checkreqprot(struct file *filp, char __user *buf, + static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + size_t count, loff_t *ppos) + { +- struct selinux_fs_info *fsi = file_inode(file)->i_sb->s_fs_info; + char *page; + ssize_t length; + unsigned int new_value; +@@ -664,10 +663,9 @@ static ssize_t sel_write_checkreqprot(struct file *file, const char __user *buf, + return PTR_ERR(page); + + length = -EINVAL; +- if (sscanf(page, "%u", &new_value) != 1) ++ if (sscanf(page, "%u", &new_value) != 1 || new_value) + goto out; + +- fsi->state->checkreqprot = new_value ? 1 : 0; + length = count; + out: + kfree(page); +diff --git a/security/yama/Kconfig b/security/yama/Kconfig +index 96b27405558a..485c1b85c325 100644 +--- a/security/yama/Kconfig ++++ b/security/yama/Kconfig +@@ -1,7 +1,7 @@ + config SECURITY_YAMA + bool "Yama support" + depends on SECURITY +- default n ++ default y + help + This selects Yama, which extends DAC support with additional + system-wide security settings beyond regular Linux discretionary diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-mute-pps_state_mismatch.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-mute-pps_state_mismatch.patch new file mode 100644 index 00000000..5bc1eff7 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-mute-pps_state_mismatch.patch @@ -0,0 +1,16 @@ +diff --git a/drivers/gpu/drm/i915/intel_dp.c b/drivers/gpu/drm/i915/intel_dp.c +index 158438bb0389..734718e45aaa 100644 +--- a/drivers/gpu/drm/i915/intel_dp.c ++++ b/drivers/gpu/drm/i915/intel_dp.c +@@ -5245,7 +5245,10 @@ intel_pps_verify_state(struct drm_i915_private *dev_priv, + + if (hw.t1_t3 != sw->t1_t3 || hw.t8 != sw->t8 || hw.t9 != sw->t9 || + hw.t10 != sw->t10 || hw.t11_t12 != sw->t11_t12) { +- DRM_ERROR("PPS state mismatch\n"); ++ /* seem buggy on 4.14.x .. mute that for now ++ * even is not a real solution .. ++ * DRM_ERROR("PPS state mismatch\n"); ++ */ + intel_pps_dump_state("sw", sw); + intel_pps_dump_state("hw", &hw); + } diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-nouveau-pascal-backlight.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-nouveau-pascal-backlight.patch new file mode 100644 index 00000000..754d982a --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-nouveau-pascal-backlight.patch @@ -0,0 +1,11 @@ +diff -up linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c.omv~ linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c +--- linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c.omv~ 2018-04-06 01:04:34.573357055 +0200 ++++ linux-4.16/drivers/gpu/drm/nouveau/nouveau_backlight.c 2018-04-06 01:05:46.985579248 +0200 +@@ -287,6 +287,7 @@ nouveau_backlight_init(struct drm_device + case NV_DEVICE_INFO_V0_FERMI: + case NV_DEVICE_INFO_V0_KEPLER: + case NV_DEVICE_INFO_V0_MAXWELL: ++ case NV_DEVICE_INFO_V0_PASCAL: + return nv50_backlight_init(connector); + default: + break; diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch new file mode 100644 index 00000000..6ffcb42c --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch @@ -0,0 +1,13 @@ +diff --git a/drivers/gpu/drm/radeon/radeon_dp_auxch.c b/drivers/gpu/drm/radeon/radeon_dp_auxch.c +index 12eac4e75542..a26b8ddd7d3f 100644 +--- a/drivers/gpu/drm/radeon/radeon_dp_auxch.c ++++ b/drivers/gpu/drm/radeon/radeon_dp_auxch.c +@@ -168,7 +168,7 @@ radeon_dp_aux_transfer_native(struct drm_dp_aux *aux, struct drm_dp_aux_msg *msg + goto done; + } + if (tmp & AUX_RX_ERROR_FLAGS) { +- DRM_DEBUG_KMS_RATELIMITED("dp_aux_ch flags not zero: %08x\n", ++ DRM_DEBUG_KMS("dp_aux_ch flags not zero: %08x\n", + tmp); + ret = -EIO; + goto done; diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-redcore-lts-amd64.config b/sys-kernel/linux-image-redcore-lts/files/4.19-redcore-lts-amd64.config new file mode 100644 index 00000000..c5bedf65 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-redcore-lts-amd64.config @@ -0,0 +1,9348 @@ +# +# Automatically generated file; DO NOT EDIT. +# Linux/x86 4.19.20-redcore-lts Kernel Configuration +# + +# +# Compiler: gcc (Gentoo Hardened 8.2.0-r1337 p1.6) 8.2.0 +# +CONFIG_CC_IS_GCC=y +CONFIG_GCC_VERSION=80200 +CONFIG_CLANG_VERSION=0 +CONFIG_IRQ_WORK=y +CONFIG_BUILDTIME_EXTABLE_SORT=y +CONFIG_THREAD_INFO_IN_TASK=y + +# +# General setup +# +CONFIG_SCHED_MUQSS=y +CONFIG_INIT_ENV_ARG_LIMIT=32 +# CONFIG_COMPILE_TEST is not set +CONFIG_LOCALVERSION="" +CONFIG_LOCALVERSION_AUTO=y +CONFIG_BUILD_SALT="" +CONFIG_HAVE_KERNEL_GZIP=y +CONFIG_HAVE_KERNEL_BZIP2=y +CONFIG_HAVE_KERNEL_LZMA=y +CONFIG_HAVE_KERNEL_XZ=y +CONFIG_HAVE_KERNEL_LZO=y +CONFIG_HAVE_KERNEL_LZ4=y +# CONFIG_KERNEL_GZIP is not set +# CONFIG_KERNEL_BZIP2 is not set +# CONFIG_KERNEL_LZMA is not set +# CONFIG_KERNEL_XZ is not set +# CONFIG_KERNEL_LZO is not set +CONFIG_KERNEL_LZ4=y +CONFIG_DEFAULT_HOSTNAME="(none)" +CONFIG_SWAP=y +CONFIG_SYSVIPC=y +CONFIG_SYSVIPC_SYSCTL=y +CONFIG_POSIX_MQUEUE=y +CONFIG_POSIX_MQUEUE_SYSCTL=y +CONFIG_CROSS_MEMORY_ATTACH=y +# CONFIG_USELIB is not set +CONFIG_AUDIT=y +CONFIG_HAVE_ARCH_AUDITSYSCALL=y +CONFIG_AUDITSYSCALL=y +CONFIG_AUDIT_WATCH=y +CONFIG_AUDIT_TREE=y + +# +# IRQ subsystem +# +CONFIG_GENERIC_IRQ_PROBE=y +CONFIG_GENERIC_IRQ_SHOW=y +CONFIG_GENERIC_IRQ_EFFECTIVE_AFF_MASK=y +CONFIG_GENERIC_PENDING_IRQ=y +CONFIG_GENERIC_IRQ_MIGRATION=y +CONFIG_GENERIC_IRQ_CHIP=y +CONFIG_IRQ_DOMAIN=y +CONFIG_IRQ_SIM=y +CONFIG_IRQ_DOMAIN_HIERARCHY=y +CONFIG_GENERIC_MSI_IRQ=y +CONFIG_GENERIC_MSI_IRQ_DOMAIN=y +CONFIG_GENERIC_IRQ_MATRIX_ALLOCATOR=y +CONFIG_GENERIC_IRQ_RESERVATION_MODE=y +CONFIG_IRQ_FORCED_THREADING=y +# CONFIG_FORCE_IRQ_THREADING is not set +CONFIG_SPARSE_IRQ=y +# CONFIG_GENERIC_IRQ_DEBUGFS is not set +CONFIG_CLOCKSOURCE_WATCHDOG=y +CONFIG_ARCH_CLOCKSOURCE_DATA=y +CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y +CONFIG_GENERIC_TIME_VSYSCALL=y +CONFIG_GENERIC_CLOCKEVENTS=y +CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y +CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y +CONFIG_GENERIC_CMOS_UPDATE=y + +# +# Timers subsystem +# +CONFIG_TICK_ONESHOT=y +CONFIG_HZ_PERIODIC=y +# CONFIG_NO_HZ_IDLE is not set +# CONFIG_NO_HZ_FULL is not set +CONFIG_NO_HZ=y +CONFIG_HIGH_RES_TIMERS=y +# CONFIG_PREEMPT_NONE is not set +# CONFIG_PREEMPT_VOLUNTARY is not set +CONFIG_PREEMPT=y +CONFIG_PREEMPT_COUNT=y + +# +# CPU/Task time and stats accounting +# +CONFIG_VIRT_CPU_ACCOUNTING=y +# CONFIG_TICK_CPU_ACCOUNTING is not set +CONFIG_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_SCHED_AVG_IRQ=y +CONFIG_BSD_PROCESS_ACCT=y +# CONFIG_BSD_PROCESS_ACCT_V3 is not set +CONFIG_TASKSTATS=y +CONFIG_TASK_DELAY_ACCT=y +CONFIG_TASK_XACCT=y +CONFIG_TASK_IO_ACCOUNTING=y +CONFIG_CPU_ISOLATION=y + +# +# RCU Subsystem +# +CONFIG_PREEMPT_RCU=y +# CONFIG_RCU_EXPERT is not set +CONFIG_SRCU=y +CONFIG_TREE_SRCU=y +CONFIG_TASKS_RCU=y +CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y +CONFIG_CONTEXT_TRACKING=y +# CONFIG_CONTEXT_TRACKING_FORCE is not set +CONFIG_BUILD_BIN2C=y +CONFIG_IKCONFIG=y +CONFIG_IKCONFIG_PROC=y +CONFIG_LOG_BUF_SHIFT=17 +CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=13 +CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y +CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y +CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y +CONFIG_ARCH_SUPPORTS_INT128=y +CONFIG_CGROUPS=y +CONFIG_PAGE_COUNTER=y +CONFIG_MEMCG=y +CONFIG_MEMCG_SWAP=y +CONFIG_MEMCG_SWAP_ENABLED=y +CONFIG_MEMCG_KMEM=y +CONFIG_BLK_CGROUP=y +# CONFIG_DEBUG_BLK_CGROUP is not set +CONFIG_CGROUP_WRITEBACK=y +CONFIG_CGROUP_SCHED=y +CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y +CONFIG_CGROUP_FREEZER=y +CONFIG_CGROUP_HUGETLB=y +CONFIG_CPUSETS=y +CONFIG_PROC_PID_CPUSET=y +CONFIG_CGROUP_DEVICE=y +CONFIG_CGROUP_PERF=y +CONFIG_CGROUP_BPF=y +# CONFIG_CGROUP_DEBUG is not set +CONFIG_SOCK_CGROUP_DATA=y +CONFIG_NAMESPACES=y +CONFIG_UTS_NS=y +CONFIG_IPC_NS=y +CONFIG_USER_NS=y +CONFIG_PID_NS=y +CONFIG_NET_NS=y +# CONFIG_CHECKPOINT_RESTORE is not set +# CONFIG_SYSFS_DEPRECATED is not set +CONFIG_RELAY=y +CONFIG_BLK_DEV_INITRD=y +CONFIG_INITRAMFS_SOURCE="" +CONFIG_RD_GZIP=y +CONFIG_RD_BZIP2=y +CONFIG_RD_LZMA=y +CONFIG_RD_XZ=y +CONFIG_RD_LZO=y +CONFIG_RD_LZ4=y +CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y +# CONFIG_CC_OPTIMIZE_FOR_SIZE is not set +# CONFIG_LOCAL_INIT is not set +CONFIG_SYSCTL=y +CONFIG_ANON_INODES=y +CONFIG_HAVE_UID16=y +CONFIG_SYSCTL_EXCEPTION_TRACE=y +CONFIG_HAVE_PCSPKR_PLATFORM=y +CONFIG_BPF=y +# CONFIG_EXPERT is not set +CONFIG_UID16=y +CONFIG_MULTIUSER=y +CONFIG_SGETMASK_SYSCALL=y +CONFIG_SYSFS_SYSCALL=y +CONFIG_FHANDLE=y +CONFIG_POSIX_TIMERS=y +CONFIG_PRINTK=y +CONFIG_PRINTK_NMI=y +CONFIG_BUG=y +CONFIG_ELF_CORE=y +CONFIG_PCSPKR_PLATFORM=y +CONFIG_BASE_FULL=y +CONFIG_FUTEX=y +CONFIG_FUTEX_PI=y +CONFIG_EPOLL=y +CONFIG_SIGNALFD=y +CONFIG_TIMERFD=y +CONFIG_EVENTFD=y +CONFIG_SHMEM=y +CONFIG_AIO=y +CONFIG_ADVISE_SYSCALLS=y +CONFIG_MEMBARRIER=y +CONFIG_KALLSYMS=y +CONFIG_KALLSYMS_ALL=y +CONFIG_KALLSYMS_ABSOLUTE_PERCPU=y +CONFIG_KALLSYMS_BASE_RELATIVE=y +CONFIG_BPF_SYSCALL=y +CONFIG_BPF_JIT_ALWAYS_ON=y +CONFIG_USERFAULTFD=y +CONFIG_ARCH_HAS_MEMBARRIER_SYNC_CORE=y +CONFIG_RSEQ=y +# CONFIG_EMBEDDED is not set +CONFIG_HAVE_PERF_EVENTS=y + +# +# Kernel Performance Events And Counters +# +CONFIG_PERF_EVENTS=y +# CONFIG_DEBUG_PERF_USE_VMALLOC is not set +CONFIG_VM_EVENT_COUNTERS=y +CONFIG_SLUB_DEBUG=y +# CONFIG_COMPAT_BRK is not set +# CONFIG_SLAB is not set +CONFIG_SLUB=y +CONFIG_SLAB_MERGE_DEFAULT=y +CONFIG_SLAB_FREELIST_RANDOM=y +CONFIG_SLAB_FREELIST_HARDENED=y +CONFIG_SLAB_HARDENED=y +CONFIG_SLAB_SANITIZE=y +CONFIG_SLAB_SANITIZE_VERIFY=y +CONFIG_SLUB_CPU_PARTIAL=y +CONFIG_SYSTEM_DATA_VERIFICATION=y +CONFIG_PROFILING=y +CONFIG_TRACEPOINTS=y +CONFIG_64BIT=y +CONFIG_X86_64=y +CONFIG_X86=y +CONFIG_INSTRUCTION_DECODER=y +CONFIG_OUTPUT_FORMAT="elf64-x86-64" +CONFIG_ARCH_DEFCONFIG="arch/x86/configs/x86_64_defconfig" +CONFIG_LOCKDEP_SUPPORT=y +CONFIG_STACKTRACE_SUPPORT=y +CONFIG_MMU=y +CONFIG_ARCH_MMAP_RND_BITS_MIN=28 +CONFIG_ARCH_MMAP_RND_BITS_MAX=32 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8 +CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16 +CONFIG_GENERIC_ISA_DMA=y +CONFIG_GENERIC_BUG=y +CONFIG_GENERIC_BUG_RELATIVE_POINTERS=y +CONFIG_GENERIC_HWEIGHT=y +CONFIG_ARCH_MAY_HAVE_PC_FDC=y +CONFIG_RWSEM_XCHGADD_ALGORITHM=y +CONFIG_GENERIC_CALIBRATE_DELAY=y +CONFIG_ARCH_HAS_CPU_RELAX=y +CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y +CONFIG_ARCH_HAS_FILTER_PGPROT=y +CONFIG_HAVE_SETUP_PER_CPU_AREA=y +CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y +CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y +CONFIG_ARCH_HIBERNATION_POSSIBLE=y +CONFIG_ARCH_SUSPEND_POSSIBLE=y +CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y +CONFIG_ARCH_WANT_GENERAL_HUGETLB=y +CONFIG_ZONE_DMA32=y +CONFIG_AUDIT_ARCH=y +CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y +CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y +CONFIG_HAVE_INTEL_TXT=y +CONFIG_X86_64_SMP=y +CONFIG_ARCH_SUPPORTS_UPROBES=y +CONFIG_FIX_EARLYCON_MEM=y +CONFIG_DYNAMIC_PHYSICAL_MASK=y +CONFIG_PGTABLE_LEVELS=4 +CONFIG_CC_HAS_SANE_STACKPROTECTOR=y + +# +# Processor type and features +# +CONFIG_ZONE_DMA=y +CONFIG_SMP=y +CONFIG_X86_FEATURE_NAMES=y +CONFIG_X86_X2APIC=y +CONFIG_X86_MPPARSE=y +# CONFIG_GOLDFISH is not set +CONFIG_RETPOLINE=y +CONFIG_INTEL_RDT=y +# CONFIG_X86_EXTENDED_PLATFORM is not set +CONFIG_X86_INTEL_LPSS=y +CONFIG_X86_AMD_PLATFORM_DEVICE=y +CONFIG_IOSF_MBI=y +# CONFIG_IOSF_MBI_DEBUG is not set +CONFIG_X86_SUPPORTS_MEMORY_FAILURE=y +CONFIG_SCHED_OMIT_FRAME_POINTER=y +CONFIG_HYPERVISOR_GUEST=y +CONFIG_PARAVIRT=y +# CONFIG_PARAVIRT_DEBUG is not set +# CONFIG_PARAVIRT_SPINLOCKS is not set +# CONFIG_XEN is not set +CONFIG_KVM_GUEST=y +# CONFIG_KVM_DEBUG_FS is not set +# CONFIG_PARAVIRT_TIME_ACCOUNTING is not set +CONFIG_PARAVIRT_CLOCK=y +CONFIG_JAILHOUSE_GUEST=y +CONFIG_NO_BOOTMEM=y +# CONFIG_MK8 is not set +# CONFIG_MPSC is not set +# CONFIG_MCORE2 is not set +# CONFIG_MATOM is not set +CONFIG_GENERIC_CPU=y +CONFIG_X86_INTERNODE_CACHE_SHIFT=6 +CONFIG_X86_L1_CACHE_SHIFT=6 +CONFIG_X86_TSC=y +CONFIG_X86_CMPXCHG64=y +CONFIG_X86_CMOV=y +CONFIG_X86_MINIMUM_CPU_FAMILY=64 +CONFIG_X86_DEBUGCTLMSR=y +CONFIG_CPU_SUP_INTEL=y +CONFIG_CPU_SUP_AMD=y +CONFIG_CPU_SUP_CENTAUR=y +CONFIG_HPET_TIMER=y +CONFIG_HPET_EMULATE_RTC=y +CONFIG_DMI=y +CONFIG_GART_IOMMU=y +# CONFIG_CALGARY_IOMMU is not set +CONFIG_MAXSMP=y +CONFIG_NR_CPUS_RANGE_BEGIN=8192 +CONFIG_NR_CPUS_RANGE_END=8192 +CONFIG_NR_CPUS_DEFAULT=8192 +CONFIG_NR_CPUS=8192 +CONFIG_SCHED_SMT=y +CONFIG_SMT_NICE=y +CONFIG_SCHED_MC=y +CONFIG_SCHED_MC_PRIO=y +# CONFIG_RQ_NONE is not set +# CONFIG_RQ_SMT is not set +CONFIG_RQ_MC=y +# CONFIG_RQ_SMP is not set +CONFIG_SHARERQ=2 +CONFIG_X86_LOCAL_APIC=y +CONFIG_X86_IO_APIC=y +CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y +CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set +CONFIG_X86_MCE_INTEL=y +CONFIG_X86_MCE_AMD=y +CONFIG_X86_MCE_THRESHOLD=y +# CONFIG_X86_MCE_INJECT is not set +CONFIG_X86_THERMAL_VECTOR=y + +# +# Performance monitoring +# +CONFIG_PERF_EVENTS_INTEL_UNCORE=y +CONFIG_PERF_EVENTS_INTEL_RAPL=y +CONFIG_PERF_EVENTS_INTEL_CSTATE=y +CONFIG_PERF_EVENTS_AMD_POWER=m +CONFIG_X86_16BIT=y +CONFIG_X86_ESPFIX64=y +CONFIG_X86_VSYSCALL_EMULATION=y +CONFIG_I8K=m +CONFIG_MICROCODE=y +CONFIG_MICROCODE_INTEL=y +CONFIG_MICROCODE_AMD=y +CONFIG_MICROCODE_OLD_INTERFACE=y +CONFIG_X86_MSR=m +CONFIG_X86_CPUID=m +# CONFIG_X86_5LEVEL is not set +CONFIG_X86_DIRECT_GBPAGES=y +CONFIG_ARCH_HAS_MEM_ENCRYPT=y +CONFIG_AMD_MEM_ENCRYPT=y +# CONFIG_AMD_MEM_ENCRYPT_ACTIVE_BY_DEFAULT is not set +CONFIG_ARCH_USE_MEMREMAP_PROT=y +CONFIG_NUMA=y +CONFIG_AMD_NUMA=y +CONFIG_X86_64_ACPI_NUMA=y +CONFIG_NODES_SPAN_OTHER_NODES=y +# CONFIG_NUMA_EMU is not set +CONFIG_NODES_SHIFT=10 +CONFIG_ARCH_SPARSEMEM_ENABLE=y +CONFIG_ARCH_SPARSEMEM_DEFAULT=y +CONFIG_ARCH_SELECT_MEMORY_MODEL=y +CONFIG_ARCH_MEMORY_PROBE=y +CONFIG_ARCH_PROC_KCORE_TEXT=y +CONFIG_ILLEGAL_POINTER_VALUE=0xdead000000000000 +CONFIG_X86_PMEM_LEGACY_DEVICE=y +CONFIG_X86_PMEM_LEGACY=m +CONFIG_X86_CHECK_BIOS_CORRUPTION=y +CONFIG_X86_BOOTPARAM_MEMORY_CORRUPTION_CHECK=y +CONFIG_X86_RESERVE_LOW=64 +CONFIG_MTRR=y +CONFIG_MTRR_SANITIZER=y +CONFIG_MTRR_SANITIZER_ENABLE_DEFAULT=0 +CONFIG_MTRR_SANITIZER_SPARE_REG_NR_DEFAULT=1 +CONFIG_X86_PAT=y +CONFIG_ARCH_USES_PG_UNCACHED=y +CONFIG_ARCH_RANDOM=y +CONFIG_X86_SMAP=y +CONFIG_X86_INTEL_UMIP=y +CONFIG_X86_INTEL_MPX=y +CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS=y +CONFIG_EFI=y +CONFIG_EFI_STUB=y +CONFIG_EFI_MIXED=y +CONFIG_SECCOMP=y +CONFIG_HZ_100=y +# CONFIG_HZ_250_NODEF is not set +# CONFIG_HZ_300_NODEF is not set +# CONFIG_HZ_1000_NODEF is not set +CONFIG_HZ=100 +CONFIG_SCHED_HRTICK=y +CONFIG_KEXEC=y +# CONFIG_CRASH_DUMP is not set +CONFIG_KEXEC_JUMP=y +CONFIG_PHYSICAL_START=0x1000000 +CONFIG_RELOCATABLE=y +CONFIG_RANDOMIZE_BASE=y +CONFIG_X86_NEED_RELOCS=y +CONFIG_PHYSICAL_ALIGN=0x1000000 +CONFIG_DYNAMIC_MEMORY_LAYOUT=y +CONFIG_RANDOMIZE_MEMORY=y +CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0xa +CONFIG_HOTPLUG_CPU=y +CONFIG_BOOTPARAM_HOTPLUG_CPU0=y +# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_COMPAT_VDSO is not set +CONFIG_LEGACY_VSYSCALL_EMULATE=y +# CONFIG_LEGACY_VSYSCALL_NONE is not set +# CONFIG_CMDLINE_BOOL is not set +CONFIG_MODIFY_LDT_SYSCALL=y +CONFIG_HAVE_LIVEPATCH=y +# CONFIG_LIVEPATCH is not set +CONFIG_ARCH_HAS_ADD_PAGES=y +CONFIG_ARCH_ENABLE_MEMORY_HOTPLUG=y +CONFIG_ARCH_ENABLE_MEMORY_HOTREMOVE=y +CONFIG_USE_PERCPU_NUMA_NODE_ID=y +CONFIG_ARCH_ENABLE_SPLIT_PMD_PTLOCK=y +CONFIG_ARCH_ENABLE_HUGEPAGE_MIGRATION=y +CONFIG_ARCH_ENABLE_THP_MIGRATION=y + +# +# Power management and ACPI options +# +CONFIG_ARCH_HIBERNATION_HEADER=y +CONFIG_SUSPEND=y +CONFIG_SUSPEND_FREEZER=y +CONFIG_HIBERNATE_CALLBACKS=y +CONFIG_HIBERNATION=y +CONFIG_PM_STD_PARTITION="" +CONFIG_PM_SLEEP=y +CONFIG_PM_SLEEP_SMP=y +CONFIG_PM_AUTOSLEEP=y +CONFIG_PM_WAKELOCKS=y +CONFIG_PM_WAKELOCKS_LIMIT=100 +CONFIG_PM_WAKELOCKS_GC=y +CONFIG_PM=y +# CONFIG_PM_DEBUG is not set +CONFIG_PM_CLK=y +CONFIG_PM_GENERIC_DOMAINS=y +# CONFIG_WQ_POWER_EFFICIENT_DEFAULT is not set +CONFIG_PM_GENERIC_DOMAINS_SLEEP=y +CONFIG_ARCH_SUPPORTS_ACPI=y +CONFIG_ACPI=y +CONFIG_ACPI_LEGACY_TABLES_LOOKUP=y +CONFIG_ARCH_MIGHT_HAVE_ACPI_PDC=y +CONFIG_ACPI_SYSTEM_POWER_STATES_SUPPORT=y +# CONFIG_ACPI_DEBUGGER is not set +CONFIG_ACPI_SPCR_TABLE=y +CONFIG_ACPI_LPIT=y +CONFIG_ACPI_SLEEP=y +# CONFIG_ACPI_PROCFS_POWER is not set +CONFIG_ACPI_REV_OVERRIDE_POSSIBLE=y +# CONFIG_ACPI_EC_DEBUGFS is not set +CONFIG_ACPI_AC=m +CONFIG_ACPI_BATTERY=m +CONFIG_ACPI_BUTTON=m +CONFIG_ACPI_VIDEO=m +CONFIG_ACPI_FAN=m +CONFIG_ACPI_TAD=m +CONFIG_ACPI_DOCK=y +CONFIG_ACPI_CPU_FREQ_PSS=y +CONFIG_ACPI_PROCESSOR_CSTATE=y +CONFIG_ACPI_PROCESSOR_IDLE=y +CONFIG_ACPI_CPPC_LIB=y +CONFIG_ACPI_PROCESSOR=y +CONFIG_ACPI_IPMI=m +CONFIG_ACPI_HOTPLUG_CPU=y +CONFIG_ACPI_PROCESSOR_AGGREGATOR=m +CONFIG_ACPI_THERMAL=m +CONFIG_ACPI_NUMA=y +CONFIG_ARCH_HAS_ACPI_TABLE_UPGRADE=y +CONFIG_ACPI_TABLE_UPGRADE=y +# CONFIG_ACPI_DEBUG is not set +CONFIG_ACPI_PCI_SLOT=y +CONFIG_ACPI_CONTAINER=y +CONFIG_ACPI_HOTPLUG_MEMORY=y +CONFIG_ACPI_HOTPLUG_IOAPIC=y +CONFIG_ACPI_SBS=m +CONFIG_ACPI_HED=y +# CONFIG_ACPI_CUSTOM_METHOD is not set +CONFIG_ACPI_BGRT=y +CONFIG_ACPI_NFIT=m +CONFIG_HAVE_ACPI_APEI=y +CONFIG_HAVE_ACPI_APEI_NMI=y +CONFIG_ACPI_APEI=y +CONFIG_ACPI_APEI_GHES=y +CONFIG_ACPI_APEI_PCIEAER=y +CONFIG_ACPI_APEI_MEMORY_FAILURE=y +# CONFIG_ACPI_APEI_EINJ is not set +# CONFIG_ACPI_APEI_ERST_DEBUG is not set +CONFIG_DPTF_POWER=m +CONFIG_ACPI_WATCHDOG=y +CONFIG_ACPI_EXTLOG=m +CONFIG_PMIC_OPREGION=y +# CONFIG_XPOWER_PMIC_OPREGION is not set +# CONFIG_BXT_WC_PMIC_OPREGION is not set +CONFIG_CHT_DC_TI_PMIC_OPREGION=y +CONFIG_ACPI_CONFIGFS=m +CONFIG_X86_PM_TIMER=y +CONFIG_SFI=y + +# +# CPU Frequency scaling +# +CONFIG_CPU_FREQ=y +CONFIG_CPU_FREQ_GOV_ATTR_SET=y +CONFIG_CPU_FREQ_GOV_COMMON=y +CONFIG_CPU_FREQ_STAT=y +CONFIG_CPU_FREQ_DEFAULT_GOV_PERFORMANCE=y +# CONFIG_CPU_FREQ_DEFAULT_GOV_POWERSAVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_USERSPACE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_ONDEMAND is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_CONSERVATIVE is not set +# CONFIG_CPU_FREQ_DEFAULT_GOV_SCHEDUTIL is not set +CONFIG_CPU_FREQ_GOV_PERFORMANCE=y +CONFIG_CPU_FREQ_GOV_POWERSAVE=y +CONFIG_CPU_FREQ_GOV_USERSPACE=y +CONFIG_CPU_FREQ_GOV_ONDEMAND=y +CONFIG_CPU_FREQ_GOV_CONSERVATIVE=y +CONFIG_CPU_FREQ_GOV_SCHEDUTIL=y + +# +# CPU frequency scaling drivers +# +CONFIG_X86_INTEL_PSTATE=y +CONFIG_X86_PCC_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ=m +CONFIG_X86_ACPI_CPUFREQ_CPB=y +CONFIG_X86_POWERNOW_K8=m +CONFIG_X86_AMD_FREQ_SENSITIVITY=m +# CONFIG_X86_SPEEDSTEP_CENTRINO is not set +# CONFIG_X86_P4_CLOCKMOD is not set + +# +# shared options +# + +# +# CPU Idle +# +CONFIG_CPU_IDLE=y +CONFIG_CPU_IDLE_GOV_LADDER=y +CONFIG_CPU_IDLE_GOV_MENU=y +CONFIG_INTEL_IDLE=y + +# +# Bus options (PCI etc.) +# +CONFIG_PCI=y +CONFIG_PCI_DIRECT=y +CONFIG_PCI_MMCONFIG=y +CONFIG_PCI_DOMAINS=y +CONFIG_MMCONF_FAM10H=y +CONFIG_PCIEPORTBUS=y +CONFIG_HOTPLUG_PCI_PCIE=y +CONFIG_PCIEAER=y +CONFIG_PCIEAER_INJECT=m +CONFIG_PCIE_ECRC=y +CONFIG_PCIEASPM=y +# CONFIG_PCIEASPM_DEBUG is not set +CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_POWERSAVE is not set +# CONFIG_PCIEASPM_POWER_SUPERSAVE is not set +# CONFIG_PCIEASPM_PERFORMANCE is not set +CONFIG_PCIE_PME=y +CONFIG_PCIE_DPC=y +CONFIG_PCIE_PTM=y +CONFIG_PCI_MSI=y +CONFIG_PCI_MSI_IRQ_DOMAIN=y +CONFIG_PCI_QUIRKS=y +# CONFIG_PCI_DEBUG is not set +CONFIG_PCI_REALLOC_ENABLE_AUTO=y +CONFIG_PCI_STUB=m +CONFIG_PCI_PF_STUB=m +CONFIG_PCI_ATS=y +CONFIG_PCI_LOCKLESS_CONFIG=y +CONFIG_PCI_IOV=y +CONFIG_PCI_PRI=y +CONFIG_PCI_PASID=y +CONFIG_PCI_LABEL=y +CONFIG_PCI_HYPERV=m +CONFIG_HOTPLUG_PCI=y +CONFIG_HOTPLUG_PCI_ACPI=y +CONFIG_HOTPLUG_PCI_ACPI_IBM=m +CONFIG_HOTPLUG_PCI_CPCI=y +CONFIG_HOTPLUG_PCI_CPCI_ZT5550=m +CONFIG_HOTPLUG_PCI_CPCI_GENERIC=m +# CONFIG_HOTPLUG_PCI_SHPC is not set + +# +# PCI controller drivers +# + +# +# Cadence PCIe controllers support +# +CONFIG_VMD=m + +# +# DesignWare PCI Core Support +# +# CONFIG_PCIE_DW_PLAT_HOST is not set +# CONFIG_PCIE_DW_PLAT_EP is not set + +# +# PCI Endpoint +# +CONFIG_PCI_ENDPOINT=y +CONFIG_PCI_ENDPOINT_CONFIGFS=y +# CONFIG_PCI_EPF_TEST is not set + +# +# PCI switch controller drivers +# +CONFIG_PCI_SW_SWITCHTEC=m +CONFIG_ISA_DMA_API=y +CONFIG_AMD_NB=y +CONFIG_PCCARD=m +CONFIG_PCMCIA=m +CONFIG_PCMCIA_LOAD_CIS=y +CONFIG_CARDBUS=y + +# +# PC-card bridges +# +CONFIG_YENTA=m +CONFIG_YENTA_O2=y +CONFIG_YENTA_RICOH=y +CONFIG_YENTA_TI=y +CONFIG_YENTA_ENE_TUNE=y +CONFIG_YENTA_TOSHIBA=y +CONFIG_PD6729=m +CONFIG_I82092=m +CONFIG_PCCARD_NONSTATIC=y +CONFIG_RAPIDIO=y +CONFIG_RAPIDIO_TSI721=y +CONFIG_RAPIDIO_DISC_TIMEOUT=30 +CONFIG_RAPIDIO_ENABLE_RX_TX_PORTS=y +CONFIG_RAPIDIO_DMA_ENGINE=y +# CONFIG_RAPIDIO_DEBUG is not set +CONFIG_RAPIDIO_ENUM_BASIC=m +CONFIG_RAPIDIO_CHMAN=m +CONFIG_RAPIDIO_MPORT_CDEV=m + +# +# RapidIO Switch drivers +# +CONFIG_RAPIDIO_TSI57X=y +CONFIG_RAPIDIO_CPS_XX=y +CONFIG_RAPIDIO_TSI568=y +CONFIG_RAPIDIO_CPS_GEN2=y +CONFIG_RAPIDIO_RXS_GEN3=m +CONFIG_X86_SYSFB=y + +# +# Binary Emulations +# +CONFIG_IA32_EMULATION=y +CONFIG_IA32_AOUT=y +CONFIG_X86_X32=y +CONFIG_COMPAT_32=y +CONFIG_COMPAT=y +CONFIG_COMPAT_FOR_U64_ALIGNMENT=y +CONFIG_SYSVIPC_COMPAT=y +CONFIG_X86_DEV_DMA_OPS=y +CONFIG_HAVE_GENERIC_GUP=y + +# +# Firmware Drivers +# +CONFIG_EDD=m +# CONFIG_EDD_OFF is not set +CONFIG_FIRMWARE_MEMMAP=y +CONFIG_DELL_RBU=m +CONFIG_DCDBAS=m +CONFIG_DMIID=y +CONFIG_DMI_SYSFS=m +CONFIG_DMI_SCAN_MACHINE_NON_EFI_FALLBACK=y +CONFIG_ISCSI_IBFT_FIND=y +CONFIG_ISCSI_IBFT=m +CONFIG_FW_CFG_SYSFS=m +# CONFIG_FW_CFG_SYSFS_CMDLINE is not set +CONFIG_GOOGLE_FIRMWARE=y +CONFIG_GOOGLE_SMI=m +CONFIG_GOOGLE_COREBOOT_TABLE=m +CONFIG_GOOGLE_COREBOOT_TABLE_ACPI=m +CONFIG_GOOGLE_MEMCONSOLE=m +CONFIG_GOOGLE_MEMCONSOLE_X86_LEGACY=m +CONFIG_GOOGLE_FRAMEBUFFER_COREBOOT=m +CONFIG_GOOGLE_MEMCONSOLE_COREBOOT=m +CONFIG_GOOGLE_VPD=m + +# +# EFI (Extensible Firmware Interface) Support +# +CONFIG_EFI_VARS=m +CONFIG_EFI_ESRT=y +CONFIG_EFI_VARS_PSTORE=m +CONFIG_EFI_VARS_PSTORE_DEFAULT_DISABLE=y +CONFIG_EFI_RUNTIME_MAP=y +# CONFIG_EFI_FAKE_MEMMAP is not set +CONFIG_EFI_RUNTIME_WRAPPERS=y +CONFIG_EFI_BOOTLOADER_CONTROL=m +CONFIG_EFI_CAPSULE_LOADER=m +CONFIG_EFI_TEST=m +CONFIG_APPLE_PROPERTIES=y +CONFIG_RESET_ATTACK_MITIGATION=y +CONFIG_UEFI_CPER=y +CONFIG_UEFI_CPER_X86=y +CONFIG_EFI_DEV_PATH_PARSER=y + +# +# Tegra firmware driver +# +CONFIG_HAVE_KVM=y +CONFIG_HAVE_KVM_IRQCHIP=y +CONFIG_HAVE_KVM_IRQFD=y +CONFIG_HAVE_KVM_IRQ_ROUTING=y +CONFIG_HAVE_KVM_EVENTFD=y +CONFIG_KVM_MMIO=y +CONFIG_KVM_ASYNC_PF=y +CONFIG_HAVE_KVM_MSI=y +CONFIG_HAVE_KVM_CPU_RELAX_INTERCEPT=y +CONFIG_KVM_VFIO=y +CONFIG_KVM_GENERIC_DIRTYLOG_READ_PROTECT=y +CONFIG_KVM_COMPAT=y +CONFIG_HAVE_KVM_IRQ_BYPASS=y +CONFIG_VIRTUALIZATION=y +CONFIG_KVM=m +CONFIG_KVM_INTEL=m +CONFIG_KVM_AMD=m +CONFIG_KVM_AMD_SEV=y +# CONFIG_KVM_MMU_AUDIT is not set +CONFIG_VHOST_NET=m +CONFIG_VHOST_SCSI=m +CONFIG_VHOST_VSOCK=m +CONFIG_VHOST=m +# CONFIG_VHOST_CROSS_ENDIAN_LEGACY is not set + +# +# General architecture-dependent options +# +CONFIG_CRASH_CORE=y +CONFIG_KEXEC_CORE=y +CONFIG_HOTPLUG_SMT=y +# CONFIG_OPROFILE is not set +CONFIG_HAVE_OPROFILE=y +CONFIG_OPROFILE_NMI_TIMER=y +CONFIG_KPROBES=y +CONFIG_JUMP_LABEL=y +# CONFIG_STATIC_KEYS_SELFTEST is not set +CONFIG_OPTPROBES=y +CONFIG_KPROBES_ON_FTRACE=y +CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS=y +CONFIG_ARCH_USE_BUILTIN_BSWAP=y +CONFIG_KRETPROBES=y +CONFIG_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_IOREMAP_PROT=y +CONFIG_HAVE_KPROBES=y +CONFIG_HAVE_KRETPROBES=y +CONFIG_HAVE_OPTPROBES=y +CONFIG_HAVE_KPROBES_ON_FTRACE=y +CONFIG_HAVE_FUNCTION_ERROR_INJECTION=y +CONFIG_HAVE_NMI=y +CONFIG_HAVE_ARCH_TRACEHOOK=y +CONFIG_HAVE_DMA_CONTIGUOUS=y +CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_FORTIFY_SOURCE=y +CONFIG_ARCH_HAS_SET_MEMORY=y +CONFIG_HAVE_ARCH_THREAD_STRUCT_WHITELIST=y +CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y +CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y +CONFIG_HAVE_RSEQ=y +CONFIG_HAVE_CLK=y +CONFIG_HAVE_HW_BREAKPOINT=y +CONFIG_HAVE_MIXED_BREAKPOINTS_REGS=y +CONFIG_HAVE_USER_RETURN_NOTIFIER=y +CONFIG_HAVE_PERF_EVENTS_NMI=y +CONFIG_HAVE_HARDLOCKUP_DETECTOR_PERF=y +CONFIG_HAVE_PERF_REGS=y +CONFIG_HAVE_PERF_USER_STACK_DUMP=y +CONFIG_HAVE_ARCH_JUMP_LABEL=y +CONFIG_HAVE_RCU_TABLE_FREE=y +CONFIG_HAVE_RCU_TABLE_INVALIDATE=y +CONFIG_ARCH_HAVE_NMI_SAFE_CMPXCHG=y +CONFIG_HAVE_ALIGNED_STRUCT_PAGE=y +CONFIG_HAVE_CMPXCHG_LOCAL=y +CONFIG_HAVE_CMPXCHG_DOUBLE=y +CONFIG_ARCH_WANT_COMPAT_IPC_PARSE_VERSION=y +CONFIG_ARCH_WANT_OLD_COMPAT_IPC=y +CONFIG_HAVE_ARCH_SECCOMP_FILTER=y +CONFIG_SECCOMP_FILTER=y +CONFIG_HAVE_STACKPROTECTOR=y +CONFIG_CC_HAS_STACKPROTECTOR_NONE=y +CONFIG_STACKPROTECTOR=y +CONFIG_STACKPROTECTOR_STRONG=y +CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES=y +CONFIG_HAVE_CONTEXT_TRACKING=y +CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y +CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y +CONFIG_HAVE_ARCH_HUGE_VMAP=y +CONFIG_HAVE_ARCH_SOFT_DIRTY=y +CONFIG_HAVE_MOD_ARCH_SPECIFIC=y +CONFIG_MODULES_USE_ELF_RELA=y +CONFIG_HAVE_IRQ_EXIT_ON_IRQ_STACK=y +CONFIG_ARCH_HAS_ELF_RANDOMIZE=y +CONFIG_HAVE_ARCH_MMAP_RND_BITS=y +CONFIG_HAVE_EXIT_THREAD=y +CONFIG_ARCH_MMAP_RND_BITS=32 +CONFIG_HAVE_ARCH_MMAP_RND_COMPAT_BITS=y +CONFIG_ARCH_MMAP_RND_COMPAT_BITS=16 +CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES=y +CONFIG_HAVE_COPY_THREAD_TLS=y +CONFIG_HAVE_STACK_VALIDATION=y +CONFIG_HAVE_RELIABLE_STACKTRACE=y +CONFIG_ISA_BUS_API=y +CONFIG_OLD_SIGSUSPEND3=y +CONFIG_COMPAT_OLD_SIGACTION=y +CONFIG_COMPAT_32BIT_TIME=y +CONFIG_HAVE_ARCH_VMAP_STACK=y +CONFIG_VMAP_STACK=y +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y +CONFIG_STRICT_MODULE_RWX=y +CONFIG_ARCH_HAS_REFCOUNT=y +CONFIG_REFCOUNT_FULL=y +CONFIG_HAVE_ARCH_PREL32_RELOCATIONS=y + +# +# GCOV-based kernel profiling +# +# CONFIG_GCOV_KERNEL is not set +CONFIG_ARCH_HAS_GCOV_PROFILE_ALL=y +CONFIG_PLUGIN_HOSTCC="g++" +CONFIG_HAVE_GCC_PLUGINS=y +# CONFIG_GCC_PLUGINS is not set +CONFIG_RT_MUTEXES=y +CONFIG_BASE_SMALL=0 +CONFIG_MODULES=y +CONFIG_MODULE_FORCE_LOAD=y +CONFIG_MODULE_UNLOAD=y +CONFIG_MODULE_FORCE_UNLOAD=y +CONFIG_MODVERSIONS=y +CONFIG_MODULE_SRCVERSION_ALL=y +CONFIG_MODULE_SIG=y +# CONFIG_MODULE_SIG_FORCE is not set +CONFIG_MODULE_SIG_ALL=y +# CONFIG_MODULE_SIG_SHA1 is not set +# CONFIG_MODULE_SIG_SHA224 is not set +# CONFIG_MODULE_SIG_SHA256 is not set +# CONFIG_MODULE_SIG_SHA384 is not set +CONFIG_MODULE_SIG_SHA512=y +CONFIG_MODULE_SIG_HASH="sha512" +CONFIG_MODULE_COMPRESS=y +CONFIG_MODULE_COMPRESS_GZIP=y +# CONFIG_MODULE_COMPRESS_XZ is not set +# CONFIG_TRIM_UNUSED_KSYMS is not set +CONFIG_MODULES_TREE_LOOKUP=y +CONFIG_BLOCK=y +CONFIG_BLK_SCSI_REQUEST=y +CONFIG_BLK_DEV_BSG=y +CONFIG_BLK_DEV_BSGLIB=y +CONFIG_BLK_DEV_INTEGRITY=y +CONFIG_BLK_DEV_ZONED=y +CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set +CONFIG_BLK_CMDLINE_PARSER=y +CONFIG_BLK_WBT=y +# CONFIG_BLK_CGROUP_IOLATENCY is not set +CONFIG_BLK_WBT_SQ=y +CONFIG_BLK_WBT_MQ=y +CONFIG_BLK_DEBUG_FS=y +CONFIG_BLK_DEBUG_FS_ZONED=y +# CONFIG_BLK_SED_OPAL is not set + +# +# Partition Types +# +CONFIG_PARTITION_ADVANCED=y +CONFIG_ACORN_PARTITION=y +CONFIG_ACORN_PARTITION_CUMANA=y +CONFIG_ACORN_PARTITION_EESOX=y +CONFIG_ACORN_PARTITION_ICS=y +CONFIG_ACORN_PARTITION_ADFS=y +CONFIG_ACORN_PARTITION_POWERTEC=y +CONFIG_ACORN_PARTITION_RISCIX=y +CONFIG_AIX_PARTITION=y +CONFIG_OSF_PARTITION=y +CONFIG_AMIGA_PARTITION=y +CONFIG_ATARI_PARTITION=y +CONFIG_MAC_PARTITION=y +CONFIG_MSDOS_PARTITION=y +CONFIG_BSD_DISKLABEL=y +CONFIG_MINIX_SUBPARTITION=y +CONFIG_SOLARIS_X86_PARTITION=y +CONFIG_UNIXWARE_DISKLABEL=y +CONFIG_LDM_PARTITION=y +CONFIG_LDM_DEBUG=y +CONFIG_SGI_PARTITION=y +CONFIG_ULTRIX_PARTITION=y +CONFIG_SUN_PARTITION=y +CONFIG_KARMA_PARTITION=y +CONFIG_EFI_PARTITION=y +CONFIG_SYSV68_PARTITION=y +CONFIG_CMDLINE_PARTITION=y +CONFIG_BLOCK_COMPAT=y +CONFIG_BLK_MQ_PCI=y +CONFIG_BLK_MQ_VIRTIO=y +CONFIG_BLK_MQ_RDMA=y + +# +# IO Schedulers +# +CONFIG_IOSCHED_NOOP=y +CONFIG_IOSCHED_DEADLINE=y +CONFIG_IOSCHED_CFQ=y +CONFIG_CFQ_GROUP_IOSCHED=y +CONFIG_IOSCHED_BFQ_SQ=y +CONFIG_BFQ_SQ_GROUP_IOSCHED=y +# CONFIG_DEFAULT_DEADLINE is not set +# CONFIG_DEFAULT_CFQ is not set +CONFIG_DEFAULT_BFQ_SQ=y +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="bfq-sq" +CONFIG_MQ_IOSCHED_BFQ=y +CONFIG_MQ_BFQ_GROUP_IOSCHED=y +CONFIG_MQ_IOSCHED_DEADLINE=y +# CONFIG_MQ_IOSCHED_KYBER is not set +CONFIG_IOSCHED_BFQ=y +CONFIG_BFQ_GROUP_IOSCHED=y +CONFIG_PREEMPT_NOTIFIERS=y +CONFIG_PADATA=y +CONFIG_ASN1=y +CONFIG_UNINLINE_SPIN_UNLOCK=y +CONFIG_ARCH_SUPPORTS_ATOMIC_RMW=y +CONFIG_MUTEX_SPIN_ON_OWNER=y +CONFIG_RWSEM_SPIN_ON_OWNER=y +CONFIG_LOCK_SPIN_ON_OWNER=y +CONFIG_ARCH_USE_QUEUED_SPINLOCKS=y +CONFIG_QUEUED_SPINLOCKS=y +CONFIG_ARCH_USE_QUEUED_RWLOCKS=y +CONFIG_QUEUED_RWLOCKS=y +CONFIG_ARCH_HAS_SYNC_CORE_BEFORE_USERMODE=y +CONFIG_ARCH_HAS_SYSCALL_WRAPPER=y +CONFIG_FREEZER=y + +# +# Executable file formats +# +CONFIG_BINFMT_ELF=y +CONFIG_COMPAT_BINFMT_ELF=y +CONFIG_ELFCORE=y +CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS=y +CONFIG_BINFMT_SCRIPT=y +CONFIG_BINFMT_MISC=y +CONFIG_COREDUMP=y + +# +# Memory Management options +# +CONFIG_SELECT_MEMORY_MODEL=y +CONFIG_SPARSEMEM_MANUAL=y +CONFIG_SPARSEMEM=y +CONFIG_NEED_MULTIPLE_NODES=y +CONFIG_HAVE_MEMORY_PRESENT=y +CONFIG_SPARSEMEM_EXTREME=y +CONFIG_SPARSEMEM_VMEMMAP_ENABLE=y +CONFIG_SPARSEMEM_VMEMMAP=y +CONFIG_HAVE_MEMBLOCK=y +CONFIG_HAVE_MEMBLOCK_NODE_MAP=y +CONFIG_ARCH_DISCARD_MEMBLOCK=y +CONFIG_MEMORY_ISOLATION=y +CONFIG_HAVE_BOOTMEM_INFO_NODE=y +CONFIG_MEMORY_HOTPLUG=y +CONFIG_MEMORY_HOTPLUG_SPARSE=y +# CONFIG_MEMORY_HOTPLUG_DEFAULT_ONLINE is not set +CONFIG_MEMORY_HOTREMOVE=y +CONFIG_SPLIT_PTLOCK_CPUS=4 +CONFIG_MEMORY_BALLOON=y +CONFIG_BALLOON_COMPACTION=y +CONFIG_COMPACTION=y +CONFIG_MIGRATION=y +CONFIG_PHYS_ADDR_T_64BIT=y +CONFIG_BOUNCE=y +CONFIG_VIRT_TO_BUS=y +CONFIG_MMU_NOTIFIER=y +CONFIG_KSM=y +CONFIG_UKSM=y +# CONFIG_KSM_LEGACY is not set +CONFIG_DEFAULT_MMAP_MIN_ADDR=65536 +CONFIG_ARCH_SUPPORTS_MEMORY_FAILURE=y +CONFIG_MEMORY_FAILURE=y +# CONFIG_HWPOISON_INJECT is not set +CONFIG_TRANSPARENT_HUGEPAGE=y +CONFIG_TRANSPARENT_HUGEPAGE_ALWAYS=y +# CONFIG_TRANSPARENT_HUGEPAGE_MADVISE is not set +CONFIG_ARCH_WANTS_THP_SWAP=y +CONFIG_THP_SWAP=y +CONFIG_TRANSPARENT_HUGE_PAGECACHE=y +CONFIG_CLEANCACHE=y +CONFIG_FRONTSWAP=y +CONFIG_CMA=y +# CONFIG_CMA_DEBUG is not set +# CONFIG_CMA_DEBUGFS is not set +CONFIG_CMA_AREAS=7 +# CONFIG_ZSWAP is not set +CONFIG_ZPOOL=m +CONFIG_ZBUD=m +CONFIG_Z3FOLD=m +CONFIG_ZSMALLOC=y +# CONFIG_PGTABLE_MAPPING is not set +# CONFIG_ZSMALLOC_STAT is not set +CONFIG_GENERIC_EARLY_IOREMAP=y +# CONFIG_DEFERRED_STRUCT_PAGE_INIT is not set +# CONFIG_IDLE_PAGE_TRACKING is not set +CONFIG_ARCH_HAS_ZONE_DEVICE=y +# CONFIG_ZONE_DEVICE is not set +CONFIG_FRAME_VECTOR=y +CONFIG_ARCH_USES_HIGH_VMA_FLAGS=y +CONFIG_ARCH_HAS_PKEYS=y +# CONFIG_PERCPU_STATS is not set +# CONFIG_GUP_BENCHMARK is not set +CONFIG_ARCH_HAS_PTE_SPECIAL=y +CONFIG_NET=y +CONFIG_COMPAT_NETLINK_MESSAGES=y +CONFIG_NET_INGRESS=y +CONFIG_NET_EGRESS=y + +# +# Networking options +# +CONFIG_PACKET=m +CONFIG_PACKET_DIAG=m +CONFIG_UNIX=m +CONFIG_UNIX_DIAG=m +CONFIG_TLS=m +# CONFIG_TLS_DEVICE is not set +CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y +CONFIG_XFRM_ALGO=m +CONFIG_XFRM_USER=m +# CONFIG_XFRM_INTERFACE is not set +CONFIG_XFRM_SUB_POLICY=y +CONFIG_XFRM_MIGRATE=y +CONFIG_XFRM_STATISTICS=y +CONFIG_XFRM_IPCOMP=m +CONFIG_NET_KEY=m +CONFIG_NET_KEY_MIGRATE=y +CONFIG_SMC=m +CONFIG_SMC_DIAG=m +CONFIG_XDP_SOCKETS=y +CONFIG_INET=y +CONFIG_IP_MULTICAST=y +CONFIG_IP_ADVANCED_ROUTER=y +CONFIG_IP_FIB_TRIE_STATS=y +CONFIG_IP_MULTIPLE_TABLES=y +CONFIG_IP_ROUTE_MULTIPATH=y +CONFIG_IP_ROUTE_VERBOSE=y +CONFIG_IP_ROUTE_CLASSID=y +CONFIG_IP_PNP=y +CONFIG_IP_PNP_DHCP=y +CONFIG_IP_PNP_BOOTP=y +CONFIG_IP_PNP_RARP=y +CONFIG_NET_IPIP=m +CONFIG_NET_IPGRE_DEMUX=m +CONFIG_NET_IP_TUNNEL=m +CONFIG_NET_IPGRE=m +CONFIG_NET_IPGRE_BROADCAST=y +CONFIG_IP_MROUTE_COMMON=y +CONFIG_IP_MROUTE=y +CONFIG_IP_MROUTE_MULTIPLE_TABLES=y +CONFIG_IP_PIMSM_V1=y +CONFIG_IP_PIMSM_V2=y +CONFIG_SYN_COOKIES=y +CONFIG_NET_IPVTI=m +CONFIG_NET_UDP_TUNNEL=m +CONFIG_NET_FOU=m +CONFIG_NET_FOU_IP_TUNNELS=y +CONFIG_INET_AH=m +CONFIG_INET_ESP=m +CONFIG_INET_ESP_OFFLOAD=m +CONFIG_INET_IPCOMP=m +CONFIG_INET_XFRM_TUNNEL=m +CONFIG_INET_TUNNEL=m +CONFIG_INET_XFRM_MODE_TRANSPORT=m +CONFIG_INET_XFRM_MODE_TUNNEL=m +CONFIG_INET_XFRM_MODE_BEET=m +CONFIG_INET_DIAG=m +CONFIG_INET_TCP_DIAG=m +CONFIG_INET_UDP_DIAG=m +CONFIG_INET_RAW_DIAG=m +CONFIG_INET_DIAG_DESTROY=y +CONFIG_TCP_CONG_ADVANCED=y +CONFIG_TCP_CONG_BIC=m +CONFIG_TCP_CONG_CUBIC=m +CONFIG_TCP_CONG_WESTWOOD=m +CONFIG_TCP_CONG_HTCP=m +CONFIG_TCP_CONG_HSTCP=m +CONFIG_TCP_CONG_HYBLA=m +CONFIG_TCP_CONG_VEGAS=m +CONFIG_TCP_CONG_NV=m +CONFIG_TCP_CONG_SCALABLE=m +CONFIG_TCP_CONG_LP=m +CONFIG_TCP_CONG_VENO=m +CONFIG_TCP_CONG_YEAH=m +CONFIG_TCP_CONG_ILLINOIS=m +CONFIG_TCP_CONG_DCTCP=m +# CONFIG_TCP_CONG_CDG is not set +CONFIG_TCP_CONG_BBR=m +CONFIG_DEFAULT_RENO=y +CONFIG_DEFAULT_TCP_CONG="reno" +CONFIG_TCP_MD5SIG=y +CONFIG_IPV6=m +CONFIG_IPV6_ROUTER_PREF=y +CONFIG_IPV6_ROUTE_INFO=y +CONFIG_IPV6_OPTIMISTIC_DAD=y +CONFIG_INET6_AH=m +CONFIG_INET6_ESP=m +CONFIG_INET6_ESP_OFFLOAD=m +CONFIG_INET6_IPCOMP=m +CONFIG_IPV6_MIP6=m +CONFIG_IPV6_ILA=m +CONFIG_INET6_XFRM_TUNNEL=m +CONFIG_INET6_TUNNEL=m +CONFIG_INET6_XFRM_MODE_TRANSPORT=m +CONFIG_INET6_XFRM_MODE_TUNNEL=m +CONFIG_INET6_XFRM_MODE_BEET=m +CONFIG_INET6_XFRM_MODE_ROUTEOPTIMIZATION=m +CONFIG_IPV6_VTI=m +CONFIG_IPV6_SIT=m +CONFIG_IPV6_SIT_6RD=y +CONFIG_IPV6_NDISC_NODETYPE=y +CONFIG_IPV6_TUNNEL=m +CONFIG_IPV6_GRE=m +CONFIG_IPV6_FOU=m +CONFIG_IPV6_FOU_TUNNEL=m +CONFIG_IPV6_MULTIPLE_TABLES=y +CONFIG_IPV6_SUBTREES=y +CONFIG_IPV6_MROUTE=y +CONFIG_IPV6_MROUTE_MULTIPLE_TABLES=y +CONFIG_IPV6_PIMSM_V2=y +CONFIG_IPV6_SEG6_LWTUNNEL=y +CONFIG_IPV6_SEG6_HMAC=y +# CONFIG_NETLABEL is not set +CONFIG_NETWORK_SECMARK=y +CONFIG_NET_PTP_CLASSIFY=y +CONFIG_NETWORK_PHY_TIMESTAMPING=y +CONFIG_NETFILTER=y +CONFIG_NETFILTER_ADVANCED=y +CONFIG_BRIDGE_NETFILTER=m + +# +# Core Netfilter Configuration +# +CONFIG_NETFILTER_INGRESS=y +CONFIG_NETFILTER_NETLINK=m +CONFIG_NETFILTER_FAMILY_BRIDGE=y +CONFIG_NETFILTER_FAMILY_ARP=y +CONFIG_NETFILTER_NETLINK_ACCT=m +CONFIG_NETFILTER_NETLINK_QUEUE=m +CONFIG_NETFILTER_NETLINK_LOG=m +CONFIG_NETFILTER_NETLINK_OSF=m +CONFIG_NF_CONNTRACK=m +CONFIG_NF_LOG_COMMON=m +CONFIG_NF_LOG_NETDEV=m +CONFIG_NETFILTER_CONNCOUNT=m +CONFIG_NF_CONNTRACK_MARK=y +CONFIG_NF_CONNTRACK_SECMARK=y +CONFIG_NF_CONNTRACK_ZONES=y +CONFIG_NF_CONNTRACK_PROCFS=y +CONFIG_NF_CONNTRACK_EVENTS=y +CONFIG_NF_CONNTRACK_TIMEOUT=y +CONFIG_NF_CONNTRACK_TIMESTAMP=y +CONFIG_NF_CONNTRACK_LABELS=y +CONFIG_NF_CT_PROTO_DCCP=y +CONFIG_NF_CT_PROTO_GRE=m +CONFIG_NF_CT_PROTO_SCTP=y +CONFIG_NF_CT_PROTO_UDPLITE=y +CONFIG_NF_CONNTRACK_AMANDA=m +CONFIG_NF_CONNTRACK_FTP=m +CONFIG_NF_CONNTRACK_H323=m +CONFIG_NF_CONNTRACK_IRC=m +CONFIG_NF_CONNTRACK_BROADCAST=m +CONFIG_NF_CONNTRACK_NETBIOS_NS=m +CONFIG_NF_CONNTRACK_SNMP=m +CONFIG_NF_CONNTRACK_PPTP=m +CONFIG_NF_CONNTRACK_SANE=m +CONFIG_NF_CONNTRACK_SIP=m +CONFIG_NF_CONNTRACK_TFTP=m +CONFIG_NF_CT_NETLINK=m +CONFIG_NF_CT_NETLINK_TIMEOUT=m +CONFIG_NF_CT_NETLINK_HELPER=m +CONFIG_NETFILTER_NETLINK_GLUE_CT=y +CONFIG_NF_NAT=m +CONFIG_NF_NAT_NEEDED=y +CONFIG_NF_NAT_PROTO_DCCP=y +CONFIG_NF_NAT_PROTO_UDPLITE=y +CONFIG_NF_NAT_PROTO_SCTP=y +CONFIG_NF_NAT_AMANDA=m +CONFIG_NF_NAT_FTP=m +CONFIG_NF_NAT_IRC=m +CONFIG_NF_NAT_SIP=m +CONFIG_NF_NAT_TFTP=m +CONFIG_NF_NAT_REDIRECT=y +CONFIG_NETFILTER_SYNPROXY=m +CONFIG_NF_TABLES=m +CONFIG_NF_TABLES_SET=m +CONFIG_NF_TABLES_INET=y +CONFIG_NF_TABLES_NETDEV=y +CONFIG_NFT_NUMGEN=m +CONFIG_NFT_CT=m +CONFIG_NFT_FLOW_OFFLOAD=m +CONFIG_NFT_COUNTER=m +CONFIG_NFT_CONNLIMIT=m +CONFIG_NFT_LOG=m +CONFIG_NFT_LIMIT=m +CONFIG_NFT_MASQ=m +CONFIG_NFT_REDIR=m +CONFIG_NFT_NAT=m +# CONFIG_NFT_TUNNEL is not set +CONFIG_NFT_OBJREF=m +CONFIG_NFT_QUEUE=m +CONFIG_NFT_QUOTA=m +CONFIG_NFT_REJECT=m +CONFIG_NFT_REJECT_INET=m +CONFIG_NFT_COMPAT=m +CONFIG_NFT_HASH=m +CONFIG_NFT_FIB=m +CONFIG_NFT_FIB_INET=m +CONFIG_NFT_SOCKET=m +# CONFIG_NFT_OSF is not set +# CONFIG_NFT_TPROXY is not set +CONFIG_NF_DUP_NETDEV=m +CONFIG_NFT_DUP_NETDEV=m +CONFIG_NFT_FWD_NETDEV=m +CONFIG_NFT_FIB_NETDEV=m +CONFIG_NF_FLOW_TABLE_INET=m +CONFIG_NF_FLOW_TABLE=m +CONFIG_NETFILTER_XTABLES=m + +# +# Xtables combined modules +# +CONFIG_NETFILTER_XT_MARK=m +CONFIG_NETFILTER_XT_CONNMARK=m +CONFIG_NETFILTER_XT_SET=m + +# +# Xtables targets +# +CONFIG_NETFILTER_XT_TARGET_AUDIT=m +CONFIG_NETFILTER_XT_TARGET_CHECKSUM=m +CONFIG_NETFILTER_XT_TARGET_CLASSIFY=m +CONFIG_NETFILTER_XT_TARGET_CONNMARK=m +CONFIG_NETFILTER_XT_TARGET_CONNSECMARK=m +CONFIG_NETFILTER_XT_TARGET_CT=m +CONFIG_NETFILTER_XT_TARGET_DSCP=m +CONFIG_NETFILTER_XT_TARGET_HL=m +CONFIG_NETFILTER_XT_TARGET_HMARK=m +CONFIG_NETFILTER_XT_TARGET_IDLETIMER=m +CONFIG_NETFILTER_XT_TARGET_LED=m +CONFIG_NETFILTER_XT_TARGET_LOG=m +CONFIG_NETFILTER_XT_TARGET_MARK=m +CONFIG_NETFILTER_XT_NAT=m +CONFIG_NETFILTER_XT_TARGET_NETMAP=m +CONFIG_NETFILTER_XT_TARGET_NFLOG=m +CONFIG_NETFILTER_XT_TARGET_NFQUEUE=m +CONFIG_NETFILTER_XT_TARGET_NOTRACK=m +CONFIG_NETFILTER_XT_TARGET_RATEEST=m +CONFIG_NETFILTER_XT_TARGET_REDIRECT=m +CONFIG_NETFILTER_XT_TARGET_TEE=m +CONFIG_NETFILTER_XT_TARGET_TPROXY=m +CONFIG_NETFILTER_XT_TARGET_TRACE=m +CONFIG_NETFILTER_XT_TARGET_SECMARK=m +CONFIG_NETFILTER_XT_TARGET_TCPMSS=m +CONFIG_NETFILTER_XT_TARGET_TCPOPTSTRIP=m + +# +# Xtables matches +# +CONFIG_NETFILTER_XT_MATCH_ADDRTYPE=m +CONFIG_NETFILTER_XT_MATCH_BPF=m +CONFIG_NETFILTER_XT_MATCH_CGROUP=m +CONFIG_NETFILTER_XT_MATCH_CLUSTER=m +CONFIG_NETFILTER_XT_MATCH_COMMENT=m +CONFIG_NETFILTER_XT_MATCH_CONNBYTES=m +CONFIG_NETFILTER_XT_MATCH_CONNLABEL=m +CONFIG_NETFILTER_XT_MATCH_CONNLIMIT=m +CONFIG_NETFILTER_XT_MATCH_CONNMARK=m +CONFIG_NETFILTER_XT_MATCH_CONNTRACK=m +CONFIG_NETFILTER_XT_MATCH_CPU=m +CONFIG_NETFILTER_XT_MATCH_DCCP=m +CONFIG_NETFILTER_XT_MATCH_DEVGROUP=m +CONFIG_NETFILTER_XT_MATCH_DSCP=m +CONFIG_NETFILTER_XT_MATCH_ECN=m +CONFIG_NETFILTER_XT_MATCH_ESP=m +CONFIG_NETFILTER_XT_MATCH_HASHLIMIT=m +CONFIG_NETFILTER_XT_MATCH_HELPER=m +CONFIG_NETFILTER_XT_MATCH_HL=m +CONFIG_NETFILTER_XT_MATCH_IPCOMP=m +CONFIG_NETFILTER_XT_MATCH_IPRANGE=m +CONFIG_NETFILTER_XT_MATCH_IPVS=m +CONFIG_NETFILTER_XT_MATCH_L2TP=m +CONFIG_NETFILTER_XT_MATCH_LENGTH=m +CONFIG_NETFILTER_XT_MATCH_LIMIT=m +CONFIG_NETFILTER_XT_MATCH_MAC=m +CONFIG_NETFILTER_XT_MATCH_MARK=m +CONFIG_NETFILTER_XT_MATCH_MULTIPORT=m +CONFIG_NETFILTER_XT_MATCH_NFACCT=m +CONFIG_NETFILTER_XT_MATCH_OSF=m +CONFIG_NETFILTER_XT_MATCH_OWNER=m +CONFIG_NETFILTER_XT_MATCH_POLICY=m +CONFIG_NETFILTER_XT_MATCH_PHYSDEV=m +CONFIG_NETFILTER_XT_MATCH_PKTTYPE=m +CONFIG_NETFILTER_XT_MATCH_QUOTA=m +CONFIG_NETFILTER_XT_MATCH_RATEEST=m +CONFIG_NETFILTER_XT_MATCH_REALM=m +CONFIG_NETFILTER_XT_MATCH_RECENT=m +CONFIG_NETFILTER_XT_MATCH_SCTP=m +CONFIG_NETFILTER_XT_MATCH_SOCKET=m +CONFIG_NETFILTER_XT_MATCH_STATE=m +CONFIG_NETFILTER_XT_MATCH_STATISTIC=m +CONFIG_NETFILTER_XT_MATCH_STRING=m +CONFIG_NETFILTER_XT_MATCH_TCPMSS=m +CONFIG_NETFILTER_XT_MATCH_TIME=m +CONFIG_NETFILTER_XT_MATCH_U32=m +CONFIG_IP_SET=m +CONFIG_IP_SET_MAX=256 +CONFIG_IP_SET_BITMAP_IP=m +CONFIG_IP_SET_BITMAP_IPMAC=m +CONFIG_IP_SET_BITMAP_PORT=m +CONFIG_IP_SET_HASH_IP=m +CONFIG_IP_SET_HASH_IPMARK=m +CONFIG_IP_SET_HASH_IPPORT=m +CONFIG_IP_SET_HASH_IPPORTIP=m +CONFIG_IP_SET_HASH_IPPORTNET=m +CONFIG_IP_SET_HASH_IPMAC=m +CONFIG_IP_SET_HASH_MAC=m +CONFIG_IP_SET_HASH_NETPORTNET=m +CONFIG_IP_SET_HASH_NET=m +CONFIG_IP_SET_HASH_NETNET=m +CONFIG_IP_SET_HASH_NETPORT=m +CONFIG_IP_SET_HASH_NETIFACE=m +CONFIG_IP_SET_LIST_SET=m +CONFIG_IP_VS=m +CONFIG_IP_VS_IPV6=y +# CONFIG_IP_VS_DEBUG is not set +CONFIG_IP_VS_TAB_BITS=12 + +# +# IPVS transport protocol load balancing support +# +CONFIG_IP_VS_PROTO_TCP=y +CONFIG_IP_VS_PROTO_UDP=y +CONFIG_IP_VS_PROTO_AH_ESP=y +CONFIG_IP_VS_PROTO_ESP=y +CONFIG_IP_VS_PROTO_AH=y +CONFIG_IP_VS_PROTO_SCTP=y + +# +# IPVS scheduler +# +CONFIG_IP_VS_RR=m +CONFIG_IP_VS_WRR=m +CONFIG_IP_VS_LC=m +CONFIG_IP_VS_WLC=m +CONFIG_IP_VS_FO=m +CONFIG_IP_VS_OVF=m +CONFIG_IP_VS_LBLC=m +CONFIG_IP_VS_LBLCR=m +CONFIG_IP_VS_DH=m +CONFIG_IP_VS_SH=m +CONFIG_IP_VS_MH=m +CONFIG_IP_VS_SED=m +CONFIG_IP_VS_NQ=m + +# +# IPVS SH scheduler +# +CONFIG_IP_VS_SH_TAB_BITS=8 + +# +# IPVS MH scheduler +# +CONFIG_IP_VS_MH_TAB_INDEX=12 + +# +# IPVS application helper +# +CONFIG_IP_VS_FTP=m +CONFIG_IP_VS_NFCT=y +CONFIG_IP_VS_PE_SIP=m + +# +# IP: Netfilter Configuration +# +CONFIG_NF_DEFRAG_IPV4=m +CONFIG_NF_SOCKET_IPV4=m +CONFIG_NF_TPROXY_IPV4=m +CONFIG_NF_TABLES_IPV4=y +CONFIG_NFT_CHAIN_ROUTE_IPV4=m +CONFIG_NFT_REJECT_IPV4=m +CONFIG_NFT_DUP_IPV4=m +CONFIG_NFT_FIB_IPV4=m +CONFIG_NF_TABLES_ARP=y +CONFIG_NF_FLOW_TABLE_IPV4=m +CONFIG_NF_DUP_IPV4=m +CONFIG_NF_LOG_ARP=m +CONFIG_NF_LOG_IPV4=m +CONFIG_NF_REJECT_IPV4=m +CONFIG_NF_NAT_IPV4=m +CONFIG_NF_NAT_MASQUERADE_IPV4=y +CONFIG_NFT_CHAIN_NAT_IPV4=m +CONFIG_NFT_MASQ_IPV4=m +CONFIG_NFT_REDIR_IPV4=m +CONFIG_NF_NAT_SNMP_BASIC=m +CONFIG_NF_NAT_PROTO_GRE=m +CONFIG_NF_NAT_PPTP=m +CONFIG_NF_NAT_H323=m +CONFIG_IP_NF_IPTABLES=m +CONFIG_IP_NF_MATCH_AH=m +CONFIG_IP_NF_MATCH_ECN=m +CONFIG_IP_NF_MATCH_RPFILTER=m +CONFIG_IP_NF_MATCH_TTL=m +CONFIG_IP_NF_FILTER=m +CONFIG_IP_NF_TARGET_REJECT=m +CONFIG_IP_NF_TARGET_SYNPROXY=m +CONFIG_IP_NF_NAT=m +CONFIG_IP_NF_TARGET_MASQUERADE=m +CONFIG_IP_NF_TARGET_NETMAP=m +CONFIG_IP_NF_TARGET_REDIRECT=m +CONFIG_IP_NF_MANGLE=m +CONFIG_IP_NF_TARGET_CLUSTERIP=m +CONFIG_IP_NF_TARGET_ECN=m +CONFIG_IP_NF_TARGET_TTL=m +CONFIG_IP_NF_RAW=m +# CONFIG_IP_NF_SECURITY is not set +CONFIG_IP_NF_ARPTABLES=m +CONFIG_IP_NF_ARPFILTER=m +CONFIG_IP_NF_ARP_MANGLE=m + +# +# IPv6: Netfilter Configuration +# +CONFIG_NF_SOCKET_IPV6=m +CONFIG_NF_TPROXY_IPV6=m +CONFIG_NF_TABLES_IPV6=y +CONFIG_NFT_CHAIN_ROUTE_IPV6=m +CONFIG_NFT_CHAIN_NAT_IPV6=m +CONFIG_NFT_MASQ_IPV6=m +CONFIG_NFT_REDIR_IPV6=m +CONFIG_NFT_REJECT_IPV6=m +CONFIG_NFT_DUP_IPV6=m +CONFIG_NFT_FIB_IPV6=m +CONFIG_NF_FLOW_TABLE_IPV6=m +CONFIG_NF_DUP_IPV6=m +CONFIG_NF_REJECT_IPV6=m +CONFIG_NF_LOG_IPV6=m +CONFIG_NF_NAT_IPV6=m +CONFIG_NF_NAT_MASQUERADE_IPV6=y +CONFIG_IP6_NF_IPTABLES=m +CONFIG_IP6_NF_MATCH_AH=m +CONFIG_IP6_NF_MATCH_EUI64=m +CONFIG_IP6_NF_MATCH_FRAG=m +CONFIG_IP6_NF_MATCH_OPTS=m +CONFIG_IP6_NF_MATCH_HL=m +CONFIG_IP6_NF_MATCH_IPV6HEADER=m +CONFIG_IP6_NF_MATCH_MH=m +CONFIG_IP6_NF_MATCH_RPFILTER=m +CONFIG_IP6_NF_MATCH_RT=m +CONFIG_IP6_NF_MATCH_SRH=m +CONFIG_IP6_NF_TARGET_HL=m +CONFIG_IP6_NF_FILTER=m +CONFIG_IP6_NF_TARGET_REJECT=m +CONFIG_IP6_NF_TARGET_SYNPROXY=m +CONFIG_IP6_NF_MANGLE=m +CONFIG_IP6_NF_RAW=m +# CONFIG_IP6_NF_SECURITY is not set +CONFIG_IP6_NF_NAT=m +CONFIG_IP6_NF_TARGET_MASQUERADE=m +CONFIG_IP6_NF_TARGET_NPT=m +CONFIG_NF_DEFRAG_IPV6=m + +# +# DECnet: Netfilter Configuration +# +CONFIG_DECNET_NF_GRABULATOR=m +CONFIG_NF_TABLES_BRIDGE=y +CONFIG_NFT_BRIDGE_REJECT=m +CONFIG_NF_LOG_BRIDGE=m +CONFIG_BRIDGE_NF_EBTABLES=m +CONFIG_BRIDGE_EBT_BROUTE=m +CONFIG_BRIDGE_EBT_T_FILTER=m +CONFIG_BRIDGE_EBT_T_NAT=m +CONFIG_BRIDGE_EBT_802_3=m +CONFIG_BRIDGE_EBT_AMONG=m +CONFIG_BRIDGE_EBT_ARP=m +CONFIG_BRIDGE_EBT_IP=m +CONFIG_BRIDGE_EBT_IP6=m +CONFIG_BRIDGE_EBT_LIMIT=m +CONFIG_BRIDGE_EBT_MARK=m +CONFIG_BRIDGE_EBT_PKTTYPE=m +CONFIG_BRIDGE_EBT_STP=m +CONFIG_BRIDGE_EBT_VLAN=m +CONFIG_BRIDGE_EBT_ARPREPLY=m +CONFIG_BRIDGE_EBT_DNAT=m +CONFIG_BRIDGE_EBT_MARK_T=m +CONFIG_BRIDGE_EBT_REDIRECT=m +CONFIG_BRIDGE_EBT_SNAT=m +CONFIG_BRIDGE_EBT_LOG=m +CONFIG_BRIDGE_EBT_NFLOG=m +CONFIG_BPFILTER=y +CONFIG_BPFILTER_UMH=m +CONFIG_IP_DCCP=m +CONFIG_INET_DCCP_DIAG=m + +# +# DCCP CCIDs Configuration +# +# CONFIG_IP_DCCP_CCID2_DEBUG is not set +CONFIG_IP_DCCP_CCID3=y +# CONFIG_IP_DCCP_CCID3_DEBUG is not set +CONFIG_IP_DCCP_TFRC_LIB=y + +# +# DCCP Kernel Hacking +# +# CONFIG_IP_DCCP_DEBUG is not set +CONFIG_IP_SCTP=m +# CONFIG_SCTP_DBG_OBJCNT is not set +CONFIG_SCTP_DEFAULT_COOKIE_HMAC_MD5=y +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_SHA1 is not set +# CONFIG_SCTP_DEFAULT_COOKIE_HMAC_NONE is not set +CONFIG_SCTP_COOKIE_HMAC_MD5=y +CONFIG_SCTP_COOKIE_HMAC_SHA1=y +CONFIG_INET_SCTP_DIAG=m +CONFIG_RDS=m +CONFIG_RDS_RDMA=m +CONFIG_RDS_TCP=m +# CONFIG_RDS_DEBUG is not set +CONFIG_TIPC=m +CONFIG_TIPC_MEDIA_IB=y +CONFIG_TIPC_MEDIA_UDP=y +CONFIG_TIPC_DIAG=m +CONFIG_ATM=m +CONFIG_ATM_CLIP=m +# CONFIG_ATM_CLIP_NO_ICMP is not set +CONFIG_ATM_LANE=m +CONFIG_ATM_MPOA=m +CONFIG_ATM_BR2684=m +# CONFIG_ATM_BR2684_IPFILTER is not set +CONFIG_L2TP=m +# CONFIG_L2TP_DEBUGFS is not set +CONFIG_L2TP_V3=y +CONFIG_L2TP_IP=m +CONFIG_L2TP_ETH=m +CONFIG_STP=m +CONFIG_GARP=m +CONFIG_MRP=m +CONFIG_BRIDGE=m +CONFIG_BRIDGE_IGMP_SNOOPING=y +CONFIG_BRIDGE_VLAN_FILTERING=y +CONFIG_HAVE_NET_DSA=y +CONFIG_NET_DSA=m +CONFIG_NET_DSA_LEGACY=y +CONFIG_NET_DSA_TAG_BRCM=y +CONFIG_NET_DSA_TAG_BRCM_PREPEND=y +CONFIG_NET_DSA_TAG_DSA=y +CONFIG_NET_DSA_TAG_EDSA=y +CONFIG_NET_DSA_TAG_KSZ=y +CONFIG_NET_DSA_TAG_LAN9303=y +CONFIG_NET_DSA_TAG_MTK=y +CONFIG_NET_DSA_TAG_TRAILER=y +CONFIG_NET_DSA_TAG_QCA=y +CONFIG_VLAN_8021Q=m +CONFIG_VLAN_8021Q_GVRP=y +CONFIG_VLAN_8021Q_MVRP=y +CONFIG_DECNET=m +CONFIG_DECNET_ROUTER=y +CONFIG_LLC=m +CONFIG_LLC2=m +CONFIG_ATALK=m +CONFIG_DEV_APPLETALK=m +CONFIG_IPDDP=m +CONFIG_IPDDP_ENCAP=y +CONFIG_X25=m +CONFIG_LAPB=m +CONFIG_PHONET=m +CONFIG_6LOWPAN=m +# CONFIG_6LOWPAN_DEBUGFS is not set +CONFIG_6LOWPAN_NHC=m +CONFIG_6LOWPAN_NHC_DEST=m +CONFIG_6LOWPAN_NHC_FRAGMENT=m +CONFIG_6LOWPAN_NHC_HOP=m +CONFIG_6LOWPAN_NHC_IPV6=m +CONFIG_6LOWPAN_NHC_MOBILITY=m +CONFIG_6LOWPAN_NHC_ROUTING=m +CONFIG_6LOWPAN_NHC_UDP=m +CONFIG_6LOWPAN_GHC_EXT_HDR_HOP=m +CONFIG_6LOWPAN_GHC_UDP=m +CONFIG_6LOWPAN_GHC_ICMPV6=m +CONFIG_6LOWPAN_GHC_EXT_HDR_DEST=m +CONFIG_6LOWPAN_GHC_EXT_HDR_FRAG=m +CONFIG_6LOWPAN_GHC_EXT_HDR_ROUTE=m +CONFIG_IEEE802154=m +CONFIG_IEEE802154_NL802154_EXPERIMENTAL=y +CONFIG_IEEE802154_SOCKET=m +CONFIG_IEEE802154_6LOWPAN=m +CONFIG_MAC802154=m +CONFIG_NET_SCHED=y + +# +# Queueing/Scheduling +# +CONFIG_NET_SCH_CBQ=m +CONFIG_NET_SCH_HTB=m +CONFIG_NET_SCH_HFSC=m +CONFIG_NET_SCH_ATM=m +CONFIG_NET_SCH_PRIO=m +CONFIG_NET_SCH_MULTIQ=m +CONFIG_NET_SCH_RED=m +CONFIG_NET_SCH_SFB=m +CONFIG_NET_SCH_SFQ=m +CONFIG_NET_SCH_TEQL=m +CONFIG_NET_SCH_TBF=m +CONFIG_NET_SCH_CBS=m +# CONFIG_NET_SCH_ETF is not set +CONFIG_NET_SCH_GRED=m +CONFIG_NET_SCH_DSMARK=m +CONFIG_NET_SCH_NETEM=m +CONFIG_NET_SCH_DRR=m +CONFIG_NET_SCH_MQPRIO=m +# CONFIG_NET_SCH_SKBPRIO is not set +CONFIG_NET_SCH_CHOKE=m +CONFIG_NET_SCH_QFQ=m +CONFIG_NET_SCH_CODEL=m +CONFIG_NET_SCH_FQ_CODEL=m +# CONFIG_NET_SCH_CAKE is not set +CONFIG_NET_SCH_FQ=m +CONFIG_NET_SCH_HHF=m +CONFIG_NET_SCH_PIE=m +CONFIG_NET_SCH_INGRESS=m +CONFIG_NET_SCH_PLUG=m +# CONFIG_NET_SCH_DEFAULT is not set + +# +# Classification +# +CONFIG_NET_CLS=y +CONFIG_NET_CLS_BASIC=m +CONFIG_NET_CLS_TCINDEX=m +CONFIG_NET_CLS_ROUTE4=m +CONFIG_NET_CLS_FW=m +CONFIG_NET_CLS_U32=m +CONFIG_CLS_U32_PERF=y +CONFIG_CLS_U32_MARK=y +CONFIG_NET_CLS_RSVP=m +CONFIG_NET_CLS_RSVP6=m +CONFIG_NET_CLS_FLOW=m +CONFIG_NET_CLS_CGROUP=m +CONFIG_NET_CLS_BPF=m +CONFIG_NET_CLS_FLOWER=m +CONFIG_NET_CLS_MATCHALL=m +CONFIG_NET_EMATCH=y +CONFIG_NET_EMATCH_STACK=32 +CONFIG_NET_EMATCH_CMP=m +CONFIG_NET_EMATCH_NBYTE=m +CONFIG_NET_EMATCH_U32=m +CONFIG_NET_EMATCH_META=m +CONFIG_NET_EMATCH_TEXT=m +CONFIG_NET_EMATCH_CANID=m +CONFIG_NET_EMATCH_IPSET=m +CONFIG_NET_EMATCH_IPT=m +CONFIG_NET_CLS_ACT=y +CONFIG_NET_ACT_POLICE=m +CONFIG_NET_ACT_GACT=m +CONFIG_GACT_PROB=y +CONFIG_NET_ACT_MIRRED=m +CONFIG_NET_ACT_SAMPLE=m +CONFIG_NET_ACT_IPT=m +CONFIG_NET_ACT_NAT=m +CONFIG_NET_ACT_PEDIT=m +# CONFIG_NET_ACT_SIMP is not set +CONFIG_NET_ACT_SKBEDIT=m +CONFIG_NET_ACT_CSUM=m +CONFIG_NET_ACT_VLAN=m +CONFIG_NET_ACT_BPF=m +CONFIG_NET_ACT_CONNMARK=m +CONFIG_NET_ACT_SKBMOD=m +CONFIG_NET_ACT_IFE=m +CONFIG_NET_ACT_TUNNEL_KEY=m +CONFIG_NET_IFE_SKBMARK=m +CONFIG_NET_IFE_SKBPRIO=m +CONFIG_NET_IFE_SKBTCINDEX=m +CONFIG_NET_CLS_IND=y +CONFIG_NET_SCH_FIFO=y +CONFIG_DCB=y +CONFIG_DNS_RESOLVER=y +CONFIG_BATMAN_ADV=m +# CONFIG_BATMAN_ADV_BATMAN_V is not set +CONFIG_BATMAN_ADV_BLA=y +CONFIG_BATMAN_ADV_DAT=y +CONFIG_BATMAN_ADV_NC=y +CONFIG_BATMAN_ADV_MCAST=y +CONFIG_BATMAN_ADV_DEBUGFS=y +# CONFIG_BATMAN_ADV_DEBUG is not set +CONFIG_OPENVSWITCH=m +CONFIG_OPENVSWITCH_GRE=m +CONFIG_OPENVSWITCH_VXLAN=m +CONFIG_OPENVSWITCH_GENEVE=m +CONFIG_VSOCKETS=m +CONFIG_VSOCKETS_DIAG=m +CONFIG_VMWARE_VMCI_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS=m +CONFIG_VIRTIO_VSOCKETS_COMMON=m +CONFIG_HYPERV_VSOCKETS=m +CONFIG_NETLINK_DIAG=m +CONFIG_MPLS=y +CONFIG_NET_MPLS_GSO=m +CONFIG_MPLS_ROUTING=m +CONFIG_MPLS_IPTUNNEL=m +CONFIG_NET_NSH=m +CONFIG_HSR=m +CONFIG_NET_SWITCHDEV=y +CONFIG_NET_L3_MASTER_DEV=y +# CONFIG_NET_NCSI is not set +CONFIG_RPS=y +CONFIG_RFS_ACCEL=y +CONFIG_XPS=y +CONFIG_CGROUP_NET_PRIO=y +CONFIG_CGROUP_NET_CLASSID=y +CONFIG_NET_RX_BUSY_POLL=y +CONFIG_BQL=y +CONFIG_BPF_JIT=y +CONFIG_BPF_STREAM_PARSER=y +CONFIG_NET_FLOW_LIMIT=y + +# +# Network testing +# +# CONFIG_NET_PKTGEN is not set +# CONFIG_NET_DROP_MONITOR is not set +CONFIG_HAMRADIO=y + +# +# Packet Radio protocols +# +CONFIG_AX25=m +CONFIG_AX25_DAMA_SLAVE=y +CONFIG_NETROM=m +CONFIG_ROSE=m + +# +# AX.25 network device drivers +# +CONFIG_MKISS=m +CONFIG_6PACK=m +CONFIG_BPQETHER=m +CONFIG_BAYCOM_SER_FDX=m +CONFIG_BAYCOM_SER_HDX=m +CONFIG_BAYCOM_PAR=m +CONFIG_YAM=m +CONFIG_CAN=m +CONFIG_CAN_RAW=m +CONFIG_CAN_BCM=m +CONFIG_CAN_GW=m + +# +# CAN Device Drivers +# +CONFIG_CAN_VCAN=m +CONFIG_CAN_VXCAN=m +CONFIG_CAN_SLCAN=m +CONFIG_CAN_DEV=m +CONFIG_CAN_CALC_BITTIMING=y +CONFIG_CAN_JANZ_ICAN3=m +CONFIG_CAN_C_CAN=m +CONFIG_CAN_C_CAN_PLATFORM=m +CONFIG_CAN_C_CAN_PCI=m +CONFIG_CAN_CC770=m +CONFIG_CAN_CC770_ISA=m +CONFIG_CAN_CC770_PLATFORM=m +CONFIG_CAN_IFI_CANFD=m +CONFIG_CAN_M_CAN=m +CONFIG_CAN_PEAK_PCIEFD=m +CONFIG_CAN_SJA1000=m +CONFIG_CAN_SJA1000_ISA=m +CONFIG_CAN_SJA1000_PLATFORM=m +CONFIG_CAN_EMS_PCMCIA=m +CONFIG_CAN_EMS_PCI=m +CONFIG_CAN_PEAK_PCMCIA=m +CONFIG_CAN_PEAK_PCI=m +CONFIG_CAN_PEAK_PCIEC=y +CONFIG_CAN_KVASER_PCI=m +CONFIG_CAN_PLX_PCI=m +CONFIG_CAN_SOFTING=m +CONFIG_CAN_SOFTING_CS=m + +# +# CAN SPI interfaces +# +CONFIG_CAN_HI311X=m +CONFIG_CAN_MCP251X=m + +# +# CAN USB interfaces +# +CONFIG_CAN_8DEV_USB=m +CONFIG_CAN_EMS_USB=m +CONFIG_CAN_ESD_USB2=m +CONFIG_CAN_GS_USB=m +CONFIG_CAN_KVASER_USB=m +CONFIG_CAN_MCBA_USB=m +CONFIG_CAN_PEAK_USB=m +# CONFIG_CAN_UCAN is not set +# CONFIG_CAN_DEBUG_DEVICES is not set +CONFIG_BT=m +CONFIG_BT_BREDR=y +CONFIG_BT_RFCOMM=m +CONFIG_BT_RFCOMM_TTY=y +CONFIG_BT_BNEP=m +CONFIG_BT_BNEP_MC_FILTER=y +CONFIG_BT_BNEP_PROTO_FILTER=y +CONFIG_BT_CMTP=m +CONFIG_BT_HIDP=m +CONFIG_BT_HS=y +CONFIG_BT_LE=y +CONFIG_BT_6LOWPAN=m +CONFIG_BT_LEDS=y +# CONFIG_BT_SELFTEST is not set +CONFIG_BT_DEBUGFS=y + +# +# Bluetooth device drivers +# +CONFIG_BT_INTEL=m +CONFIG_BT_BCM=m +CONFIG_BT_RTL=m +CONFIG_BT_QCA=m +CONFIG_BT_HCIBTUSB=m +# CONFIG_BT_HCIBTUSB_AUTOSUSPEND is not set +CONFIG_BT_HCIBTUSB_BCM=y +CONFIG_BT_HCIBTUSB_RTL=y +CONFIG_BT_HCIBTSDIO=m +CONFIG_BT_HCIUART=m +CONFIG_BT_HCIUART_SERDEV=y +CONFIG_BT_HCIUART_H4=y +CONFIG_BT_HCIUART_NOKIA=m +CONFIG_BT_HCIUART_BCSP=y +CONFIG_BT_HCIUART_ATH3K=y +CONFIG_BT_HCIUART_LL=y +CONFIG_BT_HCIUART_3WIRE=y +CONFIG_BT_HCIUART_INTEL=y +# CONFIG_BT_HCIUART_RTL is not set +CONFIG_BT_HCIUART_QCA=y +CONFIG_BT_HCIUART_AG6XX=y +CONFIG_BT_HCIUART_MRVL=y +CONFIG_BT_HCIBCM203X=m +CONFIG_BT_HCIBPA10X=m +CONFIG_BT_HCIBFUSB=m +CONFIG_BT_HCIDTL1=m +CONFIG_BT_HCIBT3C=m +CONFIG_BT_HCIBLUECARD=m +CONFIG_BT_HCIVHCI=m +CONFIG_BT_MRVL=m +CONFIG_BT_MRVL_SDIO=m +CONFIG_BT_ATH3K=m +CONFIG_BT_WILINK=m +# CONFIG_BT_MTKUART is not set +CONFIG_BT_HCIRSI=m +CONFIG_AF_RXRPC=m +CONFIG_AF_RXRPC_IPV6=y +# CONFIG_AF_RXRPC_INJECT_LOSS is not set +# CONFIG_AF_RXRPC_DEBUG is not set +# CONFIG_RXKAD is not set +CONFIG_AF_KCM=m +CONFIG_STREAM_PARSER=y +CONFIG_FIB_RULES=y +CONFIG_WIRELESS=y +CONFIG_WIRELESS_EXT=y +CONFIG_WEXT_CORE=y +CONFIG_WEXT_PROC=y +CONFIG_WEXT_SPY=y +CONFIG_WEXT_PRIV=y +CONFIG_CFG80211=m +CONFIG_NL80211_TESTMODE=y +# CONFIG_CFG80211_DEVELOPER_WARNINGS is not set +CONFIG_CFG80211_REQUIRE_SIGNED_REGDB=y +CONFIG_CFG80211_USE_KERNEL_REGDB_KEYS=y +CONFIG_CFG80211_DEFAULT_PS=y +# CONFIG_CFG80211_DEBUGFS is not set +CONFIG_CFG80211_CRDA_SUPPORT=y +CONFIG_CFG80211_WEXT=y +CONFIG_CFG80211_WEXT_EXPORT=y +CONFIG_LIB80211=m +CONFIG_LIB80211_CRYPT_WEP=m +CONFIG_LIB80211_CRYPT_CCMP=m +CONFIG_LIB80211_CRYPT_TKIP=m +# CONFIG_LIB80211_DEBUG is not set +CONFIG_MAC80211=m +CONFIG_MAC80211_HAS_RC=y +CONFIG_MAC80211_RC_MINSTREL=y +CONFIG_MAC80211_RC_MINSTREL_HT=y +CONFIG_MAC80211_RC_DEFAULT_MINSTREL=y +CONFIG_MAC80211_RC_DEFAULT="minstrel_ht" +CONFIG_MAC80211_MESH=y +CONFIG_MAC80211_LEDS=y +# CONFIG_MAC80211_DEBUGFS is not set +# CONFIG_MAC80211_MESSAGE_TRACING is not set +# CONFIG_MAC80211_DEBUG_MENU is not set +CONFIG_MAC80211_STA_HASH_MAX_SIZE=0 +CONFIG_WIMAX=m +CONFIG_WIMAX_DEBUG_LEVEL=8 +CONFIG_RFKILL=m +CONFIG_RFKILL_LEDS=y +CONFIG_RFKILL_INPUT=y +CONFIG_RFKILL_GPIO=m +CONFIG_NET_9P=m +CONFIG_NET_9P_VIRTIO=m +CONFIG_NET_9P_RDMA=m +# CONFIG_NET_9P_DEBUG is not set +CONFIG_CAIF=m +# CONFIG_CAIF_DEBUG is not set +CONFIG_CAIF_NETDEV=m +CONFIG_CAIF_USB=m +CONFIG_CEPH_LIB=m +# CONFIG_CEPH_LIB_PRETTYDEBUG is not set +CONFIG_CEPH_LIB_USE_DNS_RESOLVER=y +CONFIG_NFC=m +CONFIG_NFC_DIGITAL=m +CONFIG_NFC_NCI=m +CONFIG_NFC_NCI_SPI=m +CONFIG_NFC_NCI_UART=m +CONFIG_NFC_HCI=m +CONFIG_NFC_SHDLC=y + +# +# Near Field Communication (NFC) devices +# +CONFIG_NFC_TRF7970A=m +CONFIG_NFC_MEI_PHY=m +CONFIG_NFC_SIM=m +CONFIG_NFC_PORT100=m +CONFIG_NFC_FDP=m +CONFIG_NFC_FDP_I2C=m +CONFIG_NFC_PN544=m +CONFIG_NFC_PN544_I2C=m +CONFIG_NFC_PN544_MEI=m +CONFIG_NFC_PN533=m +CONFIG_NFC_PN533_USB=m +CONFIG_NFC_PN533_I2C=m +CONFIG_NFC_MICROREAD=m +CONFIG_NFC_MICROREAD_I2C=m +CONFIG_NFC_MICROREAD_MEI=m +CONFIG_NFC_MRVL=m +CONFIG_NFC_MRVL_USB=m +CONFIG_NFC_MRVL_UART=m +CONFIG_NFC_MRVL_I2C=m +CONFIG_NFC_MRVL_SPI=m +CONFIG_NFC_ST21NFCA=m +CONFIG_NFC_ST21NFCA_I2C=m +CONFIG_NFC_ST_NCI=m +CONFIG_NFC_ST_NCI_I2C=m +CONFIG_NFC_ST_NCI_SPI=m +CONFIG_NFC_NXP_NCI=m +CONFIG_NFC_NXP_NCI_I2C=m +CONFIG_NFC_S3FWRN5=m +CONFIG_NFC_S3FWRN5_I2C=m +CONFIG_NFC_ST95HF=m +CONFIG_PSAMPLE=m +CONFIG_NET_IFE=m +CONFIG_LWTUNNEL=y +CONFIG_LWTUNNEL_BPF=y +CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y +CONFIG_NET_DEVLINK=m +CONFIG_MAY_USE_DEVLINK=m +CONFIG_FAILOVER=m +CONFIG_HAVE_EBPF_JIT=y + +# +# Device Drivers +# + +# +# Generic Driver Options +# +# CONFIG_UEVENT_HELPER is not set +CONFIG_DEVTMPFS=y +CONFIG_DEVTMPFS_MOUNT=y +CONFIG_STANDALONE=y +CONFIG_PREVENT_FIRMWARE_BUILD=y + +# +# Firmware loader +# +CONFIG_FW_LOADER=y +CONFIG_EXTRA_FIRMWARE="" +CONFIG_FW_LOADER_USER_HELPER=y +# CONFIG_FW_LOADER_USER_HELPER_FALLBACK is not set +CONFIG_WANT_DEV_COREDUMP=y +CONFIG_ALLOW_DEV_COREDUMP=y +CONFIG_DEV_COREDUMP=y +# CONFIG_DEBUG_DRIVER is not set +# CONFIG_DEBUG_DEVRES is not set +# CONFIG_DEBUG_TEST_DRIVER_REMOVE is not set +CONFIG_TEST_ASYNC_DRIVER_PROBE=m +CONFIG_GENERIC_CPU_AUTOPROBE=y +CONFIG_GENERIC_CPU_VULNERABILITIES=y +CONFIG_REGMAP=y +CONFIG_REGMAP_I2C=m +CONFIG_REGMAP_SPI=y +CONFIG_REGMAP_SPMI=m +CONFIG_REGMAP_W1=m +CONFIG_REGMAP_MMIO=y +CONFIG_REGMAP_IRQ=y +CONFIG_REGMAP_SOUNDWIRE=m +CONFIG_DMA_SHARED_BUFFER=y +# CONFIG_DMA_FENCE_TRACE is not set +# CONFIG_DMA_CMA is not set + +# +# Bus devices +# +CONFIG_CONNECTOR=m +# CONFIG_GNSS is not set +CONFIG_MTD=m +CONFIG_MTD_TESTS=m +CONFIG_MTD_REDBOOT_PARTS=m +CONFIG_MTD_REDBOOT_DIRECTORY_BLOCK=-1 +CONFIG_MTD_REDBOOT_PARTS_UNALLOCATED=y +CONFIG_MTD_REDBOOT_PARTS_READONLY=y +CONFIG_MTD_CMDLINE_PARTS=m +CONFIG_MTD_AR7_PARTS=m + +# +# Partition parsers +# + +# +# User Modules And Translation Layers +# +CONFIG_MTD_BLKDEVS=m +CONFIG_MTD_BLOCK=m +CONFIG_MTD_BLOCK_RO=m +CONFIG_FTL=m +CONFIG_NFTL=m +CONFIG_NFTL_RW=y +CONFIG_INFTL=m +CONFIG_RFD_FTL=m +CONFIG_SSFDC=m +CONFIG_SM_FTL=m +CONFIG_MTD_OOPS=m +CONFIG_MTD_SWAP=m +# CONFIG_MTD_PARTITIONED_MASTER is not set + +# +# RAM/ROM/Flash chip drivers +# +CONFIG_MTD_CFI=m +CONFIG_MTD_JEDECPROBE=m +CONFIG_MTD_GEN_PROBE=m +# CONFIG_MTD_CFI_ADV_OPTIONS is not set +CONFIG_MTD_MAP_BANK_WIDTH_1=y +CONFIG_MTD_MAP_BANK_WIDTH_2=y +CONFIG_MTD_MAP_BANK_WIDTH_4=y +CONFIG_MTD_CFI_I1=y +CONFIG_MTD_CFI_I2=y +CONFIG_MTD_CFI_INTELEXT=m +CONFIG_MTD_CFI_AMDSTD=m +CONFIG_MTD_CFI_STAA=m +CONFIG_MTD_CFI_UTIL=m +CONFIG_MTD_RAM=m +CONFIG_MTD_ROM=m +CONFIG_MTD_ABSENT=m + +# +# Mapping drivers for chip access +# +CONFIG_MTD_COMPLEX_MAPPINGS=y +CONFIG_MTD_PHYSMAP=m +# CONFIG_MTD_PHYSMAP_COMPAT is not set +CONFIG_MTD_SBC_GXX=m +CONFIG_MTD_AMD76XROM=m +CONFIG_MTD_ICHXROM=m +CONFIG_MTD_ESB2ROM=m +CONFIG_MTD_CK804XROM=m +CONFIG_MTD_SCB2_FLASH=m +CONFIG_MTD_NETtel=m +CONFIG_MTD_L440GX=m +CONFIG_MTD_PCI=m +CONFIG_MTD_PCMCIA=m +# CONFIG_MTD_PCMCIA_ANONYMOUS is not set +CONFIG_MTD_GPIO_ADDR=m +CONFIG_MTD_INTEL_VR_NOR=m +CONFIG_MTD_PLATRAM=m +CONFIG_MTD_LATCH_ADDR=m + +# +# Self-contained MTD device drivers +# +CONFIG_MTD_PMC551=m +CONFIG_MTD_PMC551_BUGFIX=y +# CONFIG_MTD_PMC551_DEBUG is not set +CONFIG_MTD_DATAFLASH=m +CONFIG_MTD_DATAFLASH_WRITE_VERIFY=y +CONFIG_MTD_DATAFLASH_OTP=y +CONFIG_MTD_M25P80=m +CONFIG_MTD_MCHP23K256=m +CONFIG_MTD_SST25L=m +CONFIG_MTD_SLRAM=m +CONFIG_MTD_PHRAM=m +CONFIG_MTD_MTDRAM=m +CONFIG_MTDRAM_TOTAL_SIZE=4096 +CONFIG_MTDRAM_ERASE_SIZE=128 +CONFIG_MTD_BLOCK2MTD=m + +# +# Disk-On-Chip Device Drivers +# +CONFIG_MTD_DOCG3=m +CONFIG_BCH_CONST_M=14 +CONFIG_BCH_CONST_T=4 +CONFIG_MTD_ONENAND=m +CONFIG_MTD_ONENAND_VERIFY_WRITE=y +CONFIG_MTD_ONENAND_GENERIC=m +CONFIG_MTD_ONENAND_OTP=y +CONFIG_MTD_ONENAND_2X_PROGRAM=y +CONFIG_MTD_NAND_ECC=m +CONFIG_MTD_NAND_ECC_SMC=y +CONFIG_MTD_NAND=m +CONFIG_MTD_NAND_BCH=m +CONFIG_MTD_NAND_ECC_BCH=y +CONFIG_MTD_SM_COMMON=m +CONFIG_MTD_NAND_DENALI=m +CONFIG_MTD_NAND_DENALI_PCI=m +CONFIG_MTD_NAND_GPIO=m +CONFIG_MTD_NAND_RICOH=m +CONFIG_MTD_NAND_DISKONCHIP=m +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADVANCED=y +CONFIG_MTD_NAND_DISKONCHIP_PROBE_ADDRESS=0x0 +CONFIG_MTD_NAND_DISKONCHIP_PROBE_HIGH=y +CONFIG_MTD_NAND_DISKONCHIP_BBTWRITE=y +CONFIG_MTD_NAND_DOCG4=m +CONFIG_MTD_NAND_CAFE=m +CONFIG_MTD_NAND_NANDSIM=m +CONFIG_MTD_NAND_PLATFORM=m +# CONFIG_MTD_SPI_NAND is not set + +# +# LPDDR & LPDDR2 PCM memory drivers +# +CONFIG_MTD_LPDDR=m +CONFIG_MTD_QINFO_PROBE=m +CONFIG_MTD_SPI_NOR=m +CONFIG_MTD_MT81xx_NOR=m +CONFIG_MTD_SPI_NOR_USE_4K_SECTORS=y +CONFIG_SPI_INTEL_SPI=m +CONFIG_SPI_INTEL_SPI_PCI=m +CONFIG_SPI_INTEL_SPI_PLATFORM=m +CONFIG_MTD_UBI=m +CONFIG_MTD_UBI_WL_THRESHOLD=4096 +CONFIG_MTD_UBI_BEB_LIMIT=20 +CONFIG_MTD_UBI_FASTMAP=y +# CONFIG_MTD_UBI_GLUEBI is not set +CONFIG_MTD_UBI_BLOCK=y +# CONFIG_OF is not set +CONFIG_ARCH_MIGHT_HAVE_PC_PARPORT=y +CONFIG_PARPORT=m +CONFIG_PARPORT_PC=m +CONFIG_PARPORT_SERIAL=m +CONFIG_PARPORT_PC_FIFO=y +CONFIG_PARPORT_PC_SUPERIO=y +CONFIG_PARPORT_PC_PCMCIA=m +CONFIG_PARPORT_AX88796=m +CONFIG_PARPORT_1284=y +CONFIG_PARPORT_NOT_PC=y +CONFIG_PNP=y +# CONFIG_PNP_DEBUG_MESSAGES is not set + +# +# Protocols +# +CONFIG_PNPACPI=y +CONFIG_BLK_DEV=y +# CONFIG_BLK_DEV_NULL_BLK is not set +CONFIG_BLK_DEV_FD=m +CONFIG_CDROM=m +CONFIG_PARIDE=m + +# +# Parallel IDE high-level drivers +# +CONFIG_PARIDE_PD=m +CONFIG_PARIDE_PCD=m +CONFIG_PARIDE_PF=m +CONFIG_PARIDE_PT=m +CONFIG_PARIDE_PG=m + +# +# Parallel IDE protocol modules +# +CONFIG_PARIDE_ATEN=m +CONFIG_PARIDE_BPCK=m +CONFIG_PARIDE_COMM=m +CONFIG_PARIDE_DSTR=m +CONFIG_PARIDE_FIT2=m +CONFIG_PARIDE_FIT3=m +CONFIG_PARIDE_EPAT=m +CONFIG_PARIDE_EPATC8=y +CONFIG_PARIDE_EPIA=m +CONFIG_PARIDE_FRIQ=m +CONFIG_PARIDE_FRPW=m +CONFIG_PARIDE_KBIC=m +CONFIG_PARIDE_KTTI=m +CONFIG_PARIDE_ON20=m +CONFIG_PARIDE_ON26=m +CONFIG_BLK_DEV_PCIESSD_MTIP32XX=m +CONFIG_ZRAM=m +CONFIG_ZRAM_WRITEBACK=y +# CONFIG_ZRAM_MEMORY_TRACKING is not set +CONFIG_BLK_DEV_DAC960=m +CONFIG_BLK_DEV_UMEM=m +CONFIG_BLK_DEV_LOOP=y +CONFIG_BLK_DEV_LOOP_MIN_COUNT=8 +CONFIG_BLK_DEV_CRYPTOLOOP=m +CONFIG_BLK_DEV_DRBD=m +# CONFIG_DRBD_FAULT_INJECTION is not set +CONFIG_BLK_DEV_NBD=m +CONFIG_BLK_DEV_SKD=m +CONFIG_BLK_DEV_SX8=m +CONFIG_BLK_DEV_RAM=m +CONFIG_BLK_DEV_RAM_COUNT=16 +CONFIG_BLK_DEV_RAM_SIZE=16384 +CONFIG_CDROM_PKTCDVD=m +CONFIG_CDROM_PKTCDVD_BUFFERS=8 +CONFIG_CDROM_PKTCDVD_WCACHE=y +CONFIG_ATA_OVER_ETH=m +CONFIG_VIRTIO_BLK=m +# CONFIG_VIRTIO_BLK_SCSI is not set +CONFIG_BLK_DEV_RBD=m +CONFIG_BLK_DEV_RSXX=m + +# +# NVME Support +# +CONFIG_NVME_CORE=y +CONFIG_BLK_DEV_NVME=y +# CONFIG_NVME_MULTIPATH is not set +CONFIG_NVME_FABRICS=m +CONFIG_NVME_RDMA=m +CONFIG_NVME_FC=m +CONFIG_NVME_TARGET=m +CONFIG_NVME_TARGET_LOOP=m +CONFIG_NVME_TARGET_RDMA=m +CONFIG_NVME_TARGET_FC=m +CONFIG_NVME_TARGET_FCLOOP=m + +# +# Misc devices +# +CONFIG_SENSORS_LIS3LV02D=m +CONFIG_AD525X_DPOT=m +CONFIG_AD525X_DPOT_I2C=m +CONFIG_AD525X_DPOT_SPI=m +# CONFIG_DUMMY_IRQ is not set +CONFIG_IBM_ASM=m +CONFIG_PHANTOM=m +CONFIG_SGI_IOC4=m +CONFIG_TIFM_CORE=m +CONFIG_TIFM_7XX1=m +CONFIG_ICS932S401=m +CONFIG_ENCLOSURE_SERVICES=m +CONFIG_HP_ILO=m +CONFIG_APDS9802ALS=m +CONFIG_ISL29003=m +CONFIG_ISL29020=m +CONFIG_SENSORS_TSL2550=m +CONFIG_SENSORS_BH1770=m +CONFIG_SENSORS_APDS990X=m +CONFIG_HMC6352=m +CONFIG_DS1682=m +CONFIG_VMWARE_BALLOON=m +CONFIG_USB_SWITCH_FSA9480=m +CONFIG_LATTICE_ECP3_CONFIG=m +CONFIG_SRAM=y +CONFIG_PCI_ENDPOINT_TEST=m +CONFIG_MISC_RTSX=m +CONFIG_C2PORT=m +CONFIG_C2PORT_DURAMAR_2150=m + +# +# EEPROM support +# +CONFIG_EEPROM_AT24=m +CONFIG_EEPROM_AT25=m +CONFIG_EEPROM_LEGACY=m +CONFIG_EEPROM_MAX6875=m +CONFIG_EEPROM_93CX6=m +CONFIG_EEPROM_93XX46=m +CONFIG_EEPROM_IDT_89HPESX=m +CONFIG_CB710_CORE=m +# CONFIG_CB710_DEBUG is not set +CONFIG_CB710_DEBUG_ASSUMPTIONS=y + +# +# Texas Instruments shared transport line discipline +# +CONFIG_TI_ST=m +CONFIG_SENSORS_LIS3_I2C=m + +# +# Altera FPGA firmware download module (requires I2C) +# +CONFIG_ALTERA_STAPL=m +CONFIG_INTEL_MEI=y +CONFIG_INTEL_MEI_ME=y +CONFIG_INTEL_MEI_TXE=m +CONFIG_VMWARE_VMCI=m + +# +# Intel MIC & related support +# + +# +# Intel MIC Bus Driver +# +CONFIG_INTEL_MIC_BUS=m + +# +# SCIF Bus Driver +# +CONFIG_SCIF_BUS=m + +# +# VOP Bus Driver +# +CONFIG_VOP_BUS=m + +# +# Intel MIC Host Driver +# +CONFIG_INTEL_MIC_HOST=m + +# +# Intel MIC Card Driver +# +CONFIG_INTEL_MIC_CARD=m + +# +# SCIF Driver +# +CONFIG_SCIF=m + +# +# Intel MIC Coprocessor State Management (COSM) Drivers +# +CONFIG_MIC_COSM=m + +# +# VOP Driver +# +CONFIG_VOP=m +CONFIG_VHOST_RING=m +CONFIG_GENWQE=m +CONFIG_GENWQE_PLATFORM_ERROR_RECOVERY=0 +CONFIG_ECHO=m +CONFIG_MISC_RTSX_PCI=m +CONFIG_MISC_RTSX_USB=m +CONFIG_HAVE_IDE=y +# CONFIG_IDE is not set + +# +# SCSI device support +# +CONFIG_SCSI_MOD=m +CONFIG_RAID_ATTRS=m +CONFIG_SCSI=m +CONFIG_SCSI_DMA=y +CONFIG_SCSI_NETLINK=y +# CONFIG_SCSI_MQ_DEFAULT is not set +CONFIG_SCSI_PROC_FS=y + +# +# SCSI support type (disk, tape, CD-ROM) +# +CONFIG_BLK_DEV_SD=m +CONFIG_CHR_DEV_ST=m +CONFIG_CHR_DEV_OSST=m +CONFIG_BLK_DEV_SR=m +CONFIG_BLK_DEV_SR_VENDOR=y +CONFIG_CHR_DEV_SG=m +CONFIG_CHR_DEV_SCH=m +CONFIG_SCSI_ENCLOSURE=m +CONFIG_SCSI_CONSTANTS=y +CONFIG_SCSI_LOGGING=y +CONFIG_SCSI_SCAN_ASYNC=y + +# +# SCSI Transports +# +CONFIG_SCSI_SPI_ATTRS=m +CONFIG_SCSI_FC_ATTRS=m +CONFIG_SCSI_ISCSI_ATTRS=m +CONFIG_SCSI_SAS_ATTRS=m +CONFIG_SCSI_SAS_LIBSAS=m +CONFIG_SCSI_SAS_ATA=y +CONFIG_SCSI_SAS_HOST_SMP=y +CONFIG_SCSI_SRP_ATTRS=m +CONFIG_SCSI_LOWLEVEL=y +CONFIG_ISCSI_TCP=m +CONFIG_ISCSI_BOOT_SYSFS=m +CONFIG_SCSI_CXGB3_ISCSI=m +CONFIG_SCSI_CXGB4_ISCSI=m +CONFIG_SCSI_BNX2_ISCSI=m +CONFIG_SCSI_BNX2X_FCOE=m +CONFIG_BE2ISCSI=m +CONFIG_BLK_DEV_3W_XXXX_RAID=m +CONFIG_SCSI_HPSA=m +CONFIG_SCSI_3W_9XXX=m +CONFIG_SCSI_3W_SAS=m +CONFIG_SCSI_ACARD=m +CONFIG_SCSI_AACRAID=m +CONFIG_SCSI_AIC7XXX=m +CONFIG_AIC7XXX_CMDS_PER_DEVICE=4 +CONFIG_AIC7XXX_RESET_DELAY_MS=15000 +# CONFIG_AIC7XXX_DEBUG_ENABLE is not set +CONFIG_AIC7XXX_DEBUG_MASK=0 +# CONFIG_AIC7XXX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC79XX=m +CONFIG_AIC79XX_CMDS_PER_DEVICE=4 +CONFIG_AIC79XX_RESET_DELAY_MS=15000 +# CONFIG_AIC79XX_DEBUG_ENABLE is not set +CONFIG_AIC79XX_DEBUG_MASK=0 +# CONFIG_AIC79XX_REG_PRETTY_PRINT is not set +CONFIG_SCSI_AIC94XX=m +# CONFIG_AIC94XX_DEBUG is not set +CONFIG_SCSI_MVSAS=m +# CONFIG_SCSI_MVSAS_DEBUG is not set +CONFIG_SCSI_MVSAS_TASKLET=y +CONFIG_SCSI_MVUMI=m +CONFIG_SCSI_DPT_I2O=m +CONFIG_SCSI_ADVANSYS=m +CONFIG_SCSI_ARCMSR=m +CONFIG_SCSI_ESAS2R=m +CONFIG_MEGARAID_NEWGEN=y +CONFIG_MEGARAID_MM=m +CONFIG_MEGARAID_MAILBOX=m +CONFIG_MEGARAID_LEGACY=m +CONFIG_MEGARAID_SAS=m +CONFIG_SCSI_MPT3SAS=m +CONFIG_SCSI_MPT2SAS_MAX_SGE=128 +CONFIG_SCSI_MPT3SAS_MAX_SGE=128 +CONFIG_SCSI_MPT2SAS=m +CONFIG_SCSI_SMARTPQI=m +CONFIG_SCSI_UFSHCD=m +CONFIG_SCSI_UFSHCD_PCI=m +CONFIG_SCSI_UFS_DWC_TC_PCI=m +CONFIG_SCSI_UFSHCD_PLATFORM=m +CONFIG_SCSI_UFS_DWC_TC_PLATFORM=m +CONFIG_SCSI_HPTIOP=m +CONFIG_SCSI_BUSLOGIC=m +CONFIG_SCSI_FLASHPOINT=y +CONFIG_VMWARE_PVSCSI=m +CONFIG_HYPERV_STORAGE=m +CONFIG_LIBFC=m +CONFIG_LIBFCOE=m +CONFIG_FCOE=m +CONFIG_FCOE_FNIC=m +CONFIG_SCSI_SNIC=m +# CONFIG_SCSI_SNIC_DEBUG_FS is not set +CONFIG_SCSI_DMX3191D=m +CONFIG_SCSI_GDTH=m +CONFIG_SCSI_ISCI=m +CONFIG_SCSI_IPS=m +CONFIG_SCSI_INITIO=m +CONFIG_SCSI_INIA100=m +CONFIG_SCSI_PPA=m +CONFIG_SCSI_IMM=m +# CONFIG_SCSI_IZIP_EPP16 is not set +# CONFIG_SCSI_IZIP_SLOW_CTR is not set +CONFIG_SCSI_STEX=m +CONFIG_SCSI_SYM53C8XX_2=m +CONFIG_SCSI_SYM53C8XX_DMA_ADDRESSING_MODE=1 +CONFIG_SCSI_SYM53C8XX_DEFAULT_TAGS=16 +CONFIG_SCSI_SYM53C8XX_MAX_TAGS=64 +CONFIG_SCSI_SYM53C8XX_MMIO=y +CONFIG_SCSI_IPR=m +# CONFIG_SCSI_IPR_TRACE is not set +# CONFIG_SCSI_IPR_DUMP is not set +CONFIG_SCSI_QLOGIC_1280=m +CONFIG_SCSI_QLA_FC=m +CONFIG_TCM_QLA2XXX=m +# CONFIG_TCM_QLA2XXX_DEBUG is not set +CONFIG_SCSI_QLA_ISCSI=m +CONFIG_QEDI=m +CONFIG_QEDF=m +CONFIG_SCSI_LPFC=m +# CONFIG_SCSI_LPFC_DEBUG_FS is not set +CONFIG_SCSI_DC395x=m +CONFIG_SCSI_AM53C974=m +CONFIG_SCSI_WD719X=m +# CONFIG_SCSI_DEBUG is not set +CONFIG_SCSI_PMCRAID=m +CONFIG_SCSI_PM8001=m +CONFIG_SCSI_BFA_FC=m +CONFIG_SCSI_VIRTIO=m +CONFIG_SCSI_CHELSIO_FCOE=m +CONFIG_SCSI_LOWLEVEL_PCMCIA=y +CONFIG_PCMCIA_AHA152X=m +CONFIG_PCMCIA_QLOGIC=m +CONFIG_PCMCIA_SYM53C500=m +CONFIG_SCSI_DH=y +CONFIG_SCSI_DH_RDAC=m +CONFIG_SCSI_DH_HP_SW=m +CONFIG_SCSI_DH_EMC=m +CONFIG_SCSI_DH_ALUA=m +CONFIG_SCSI_OSD_INITIATOR=m +CONFIG_SCSI_OSD_ULD=m +CONFIG_SCSI_OSD_DPRINT_SENSE=1 +# CONFIG_SCSI_OSD_DEBUG is not set +CONFIG_ATA=m +CONFIG_ATA_VERBOSE_ERROR=y +CONFIG_ATA_ACPI=y +CONFIG_SATA_ZPODD=y +CONFIG_SATA_PMP=y + +# +# Controllers with non-SFF native interface +# +CONFIG_SATA_AHCI=m +CONFIG_SATA_MOBILE_LPM_POLICY=0 +CONFIG_SATA_AHCI_PLATFORM=m +CONFIG_SATA_INIC162X=m +CONFIG_SATA_ACARD_AHCI=m +CONFIG_SATA_SIL24=m +CONFIG_ATA_SFF=y + +# +# SFF controllers with custom DMA interface +# +CONFIG_PDC_ADMA=m +CONFIG_SATA_QSTOR=m +CONFIG_SATA_SX4=m +CONFIG_ATA_BMDMA=y + +# +# SATA SFF controllers with BMDMA +# +CONFIG_ATA_PIIX=m +CONFIG_SATA_DWC=m +# CONFIG_SATA_DWC_OLD_DMA is not set +# CONFIG_SATA_DWC_DEBUG is not set +CONFIG_SATA_MV=m +CONFIG_SATA_NV=m +CONFIG_SATA_PROMISE=m +CONFIG_SATA_SIL=m +CONFIG_SATA_SIS=m +CONFIG_SATA_SVW=m +CONFIG_SATA_ULI=m +CONFIG_SATA_VIA=m +CONFIG_SATA_VITESSE=m + +# +# PATA SFF controllers with BMDMA +# +CONFIG_PATA_ALI=m +CONFIG_PATA_AMD=m +CONFIG_PATA_ARTOP=m +CONFIG_PATA_ATIIXP=m +CONFIG_PATA_ATP867X=m +CONFIG_PATA_CMD64X=m +CONFIG_PATA_CYPRESS=m +CONFIG_PATA_EFAR=m +CONFIG_PATA_HPT366=m +CONFIG_PATA_HPT37X=m +CONFIG_PATA_HPT3X2N=m +CONFIG_PATA_HPT3X3=m +CONFIG_PATA_HPT3X3_DMA=y +CONFIG_PATA_IT8213=m +CONFIG_PATA_IT821X=m +CONFIG_PATA_JMICRON=m +CONFIG_PATA_MARVELL=m +CONFIG_PATA_NETCELL=m +CONFIG_PATA_NINJA32=m +CONFIG_PATA_NS87415=m +CONFIG_PATA_OLDPIIX=m +CONFIG_PATA_OPTIDMA=m +CONFIG_PATA_PDC2027X=m +CONFIG_PATA_PDC_OLD=m +CONFIG_PATA_RADISYS=m +CONFIG_PATA_RDC=m +CONFIG_PATA_SCH=m +CONFIG_PATA_SERVERWORKS=m +CONFIG_PATA_SIL680=m +CONFIG_PATA_SIS=m +CONFIG_PATA_TOSHIBA=m +CONFIG_PATA_TRIFLEX=m +CONFIG_PATA_VIA=m +CONFIG_PATA_WINBOND=m + +# +# PIO-only SFF controllers +# +CONFIG_PATA_CMD640_PCI=m +CONFIG_PATA_MPIIX=m +CONFIG_PATA_NS87410=m +CONFIG_PATA_OPTI=m +CONFIG_PATA_PCMCIA=m +CONFIG_PATA_RZ1000=m + +# +# Generic fallback / legacy drivers +# +CONFIG_PATA_ACPI=m +CONFIG_ATA_GENERIC=m +CONFIG_PATA_LEGACY=m +CONFIG_MD=y +CONFIG_BLK_DEV_MD=m +CONFIG_MD_LINEAR=m +CONFIG_MD_RAID0=m +CONFIG_MD_RAID1=m +CONFIG_MD_RAID10=m +CONFIG_MD_RAID456=m +CONFIG_MD_MULTIPATH=m +CONFIG_MD_FAULTY=m +CONFIG_MD_CLUSTER=m +CONFIG_BCACHE=m +# CONFIG_BCACHE_DEBUG is not set +# CONFIG_BCACHE_CLOSURES_DEBUG is not set +CONFIG_BLK_DEV_DM_BUILTIN=y +CONFIG_BLK_DEV_DM=m +# CONFIG_DM_MQ_DEFAULT is not set +# CONFIG_DM_DEBUG is not set +CONFIG_DM_BUFIO=m +# CONFIG_DM_DEBUG_BLOCK_MANAGER_LOCKING is not set +CONFIG_DM_BIO_PRISON=m +CONFIG_DM_PERSISTENT_DATA=m +CONFIG_DM_UNSTRIPED=m +CONFIG_DM_CRYPT=m +CONFIG_DM_SNAPSHOT=m +CONFIG_DM_THIN_PROVISIONING=m +CONFIG_DM_CACHE=m +CONFIG_DM_CACHE_SMQ=m +CONFIG_DM_WRITECACHE=m +CONFIG_DM_ERA=m +CONFIG_DM_MIRROR=m +CONFIG_DM_LOG_USERSPACE=m +CONFIG_DM_RAID=m +CONFIG_DM_ZERO=m +CONFIG_DM_MULTIPATH=m +CONFIG_DM_MULTIPATH_QL=m +CONFIG_DM_MULTIPATH_ST=m +CONFIG_DM_DELAY=m +CONFIG_DM_UEVENT=y +CONFIG_DM_FLAKEY=m +CONFIG_DM_VERITY=m +# CONFIG_DM_VERITY_FEC is not set +CONFIG_DM_SWITCH=m +CONFIG_DM_LOG_WRITES=m +CONFIG_DM_INTEGRITY=m +CONFIG_DM_ZONED=m +CONFIG_TARGET_CORE=m +CONFIG_TCM_IBLOCK=m +CONFIG_TCM_FILEIO=m +CONFIG_TCM_PSCSI=m +CONFIG_TCM_USER2=m +CONFIG_LOOPBACK_TARGET=m +CONFIG_TCM_FC=m +CONFIG_ISCSI_TARGET=m +CONFIG_ISCSI_TARGET_CXGB4=m +CONFIG_SBP_TARGET=m +CONFIG_FUSION=y +CONFIG_FUSION_SPI=m +CONFIG_FUSION_FC=m +CONFIG_FUSION_SAS=m +CONFIG_FUSION_MAX_SGE=128 +CONFIG_FUSION_CTL=m +CONFIG_FUSION_LAN=m +CONFIG_FUSION_LOGGING=y + +# +# IEEE 1394 (FireWire) support +# +CONFIG_FIREWIRE=m +CONFIG_FIREWIRE_OHCI=m +CONFIG_FIREWIRE_SBP2=m +CONFIG_FIREWIRE_NET=m +CONFIG_FIREWIRE_NOSY=m +CONFIG_MACINTOSH_DRIVERS=y +CONFIG_MAC_EMUMOUSEBTN=m +CONFIG_NETDEVICES=y +CONFIG_MII=m +CONFIG_NET_CORE=y +CONFIG_BONDING=m +CONFIG_DUMMY=m +CONFIG_EQUALIZER=m +CONFIG_NET_FC=y +CONFIG_IFB=m +CONFIG_NET_TEAM=m +CONFIG_NET_TEAM_MODE_BROADCAST=m +CONFIG_NET_TEAM_MODE_ROUNDROBIN=m +CONFIG_NET_TEAM_MODE_RANDOM=m +CONFIG_NET_TEAM_MODE_ACTIVEBACKUP=m +CONFIG_NET_TEAM_MODE_LOADBALANCE=m +CONFIG_MACVLAN=m +CONFIG_MACVTAP=m +CONFIG_IPVLAN=m +CONFIG_IPVTAP=m +CONFIG_VXLAN=m +CONFIG_GENEVE=m +CONFIG_GTP=m +CONFIG_MACSEC=m +CONFIG_NETCONSOLE=m +CONFIG_NETCONSOLE_DYNAMIC=y +CONFIG_NETPOLL=y +CONFIG_NET_POLL_CONTROLLER=y +# CONFIG_NTB_NETDEV is not set +CONFIG_RIONET=m +CONFIG_RIONET_TX_SIZE=128 +CONFIG_RIONET_RX_SIZE=128 +CONFIG_TUN=m +CONFIG_TAP=m +# CONFIG_TUN_VNET_CROSS_LE is not set +CONFIG_VETH=m +CONFIG_VIRTIO_NET=m +CONFIG_NLMON=m +CONFIG_NET_VRF=m +CONFIG_VSOCKMON=m +CONFIG_SUNGEM_PHY=m +CONFIG_ARCNET=m +CONFIG_ARCNET_1201=m +CONFIG_ARCNET_1051=m +CONFIG_ARCNET_RAW=m +CONFIG_ARCNET_CAP=m +CONFIG_ARCNET_COM90xx=m +CONFIG_ARCNET_COM90xxIO=m +CONFIG_ARCNET_RIM_I=m +CONFIG_ARCNET_COM20020=m +CONFIG_ARCNET_COM20020_PCI=m +CONFIG_ARCNET_COM20020_CS=m +CONFIG_ATM_DRIVERS=y +# CONFIG_ATM_DUMMY is not set +CONFIG_ATM_TCP=m +CONFIG_ATM_LANAI=m +CONFIG_ATM_ENI=m +# CONFIG_ATM_ENI_DEBUG is not set +# CONFIG_ATM_ENI_TUNE_BURST is not set +CONFIG_ATM_FIRESTREAM=m +CONFIG_ATM_ZATM=m +# CONFIG_ATM_ZATM_DEBUG is not set +CONFIG_ATM_NICSTAR=m +CONFIG_ATM_NICSTAR_USE_SUNI=y +CONFIG_ATM_NICSTAR_USE_IDT77105=y +CONFIG_ATM_IDT77252=m +# CONFIG_ATM_IDT77252_DEBUG is not set +# CONFIG_ATM_IDT77252_RCV_ALL is not set +CONFIG_ATM_IDT77252_USE_SUNI=y +CONFIG_ATM_AMBASSADOR=m +# CONFIG_ATM_AMBASSADOR_DEBUG is not set +CONFIG_ATM_HORIZON=m +# CONFIG_ATM_HORIZON_DEBUG is not set +CONFIG_ATM_IA=m +# CONFIG_ATM_IA_DEBUG is not set +CONFIG_ATM_FORE200E=m +CONFIG_ATM_FORE200E_USE_TASKLET=y +CONFIG_ATM_FORE200E_TX_RETRY=16 +CONFIG_ATM_FORE200E_DEBUG=0 +CONFIG_ATM_HE=m +CONFIG_ATM_HE_USE_SUNI=y +CONFIG_ATM_SOLOS=m + +# +# CAIF transport drivers +# +CONFIG_CAIF_TTY=m +CONFIG_CAIF_SPI_SLAVE=m +CONFIG_CAIF_SPI_SYNC=y +CONFIG_CAIF_HSI=m +CONFIG_CAIF_VIRTIO=m + +# +# Distributed Switch Architecture drivers +# +CONFIG_B53=m +CONFIG_B53_SPI_DRIVER=m +CONFIG_B53_MDIO_DRIVER=m +CONFIG_B53_MMAP_DRIVER=m +CONFIG_B53_SRAB_DRIVER=m +# CONFIG_NET_DSA_BCM_SF2 is not set +CONFIG_NET_DSA_LOOP=m +CONFIG_NET_DSA_MT7530=m +CONFIG_NET_DSA_MV88E6060=m +CONFIG_MICROCHIP_KSZ=m +CONFIG_MICROCHIP_KSZ_SPI_DRIVER=m +CONFIG_NET_DSA_MV88E6XXX=m +CONFIG_NET_DSA_MV88E6XXX_GLOBAL2=y +CONFIG_NET_DSA_MV88E6XXX_PTP=y +CONFIG_NET_DSA_QCA8K=m +# CONFIG_NET_DSA_REALTEK_SMI is not set +CONFIG_NET_DSA_SMSC_LAN9303=m +CONFIG_NET_DSA_SMSC_LAN9303_I2C=m +CONFIG_NET_DSA_SMSC_LAN9303_MDIO=m +CONFIG_ETHERNET=y +CONFIG_MDIO=m +CONFIG_NET_VENDOR_3COM=y +CONFIG_PCMCIA_3C574=m +CONFIG_PCMCIA_3C589=m +CONFIG_VORTEX=m +CONFIG_TYPHOON=m +CONFIG_NET_VENDOR_ADAPTEC=y +CONFIG_ADAPTEC_STARFIRE=m +CONFIG_NET_VENDOR_AGERE=y +CONFIG_ET131X=m +CONFIG_NET_VENDOR_ALACRITECH=y +CONFIG_SLICOSS=m +CONFIG_NET_VENDOR_ALTEON=y +CONFIG_ACENIC=m +# CONFIG_ACENIC_OMIT_TIGON_I is not set +CONFIG_ALTERA_TSE=m +CONFIG_NET_VENDOR_AMAZON=y +CONFIG_ENA_ETHERNET=m +CONFIG_NET_VENDOR_AMD=y +CONFIG_AMD8111_ETH=m +CONFIG_PCNET32=m +CONFIG_PCMCIA_NMCLAN=m +CONFIG_AMD_XGBE=m +CONFIG_AMD_XGBE_DCB=y +CONFIG_AMD_XGBE_HAVE_ECC=y +CONFIG_NET_VENDOR_AQUANTIA=y +CONFIG_AQTION=m +CONFIG_NET_VENDOR_ARC=y +CONFIG_NET_VENDOR_ATHEROS=y +CONFIG_ATL2=m +CONFIG_ATL1=m +CONFIG_ATL1E=m +CONFIG_ATL1C=m +CONFIG_ALX=m +CONFIG_NET_VENDOR_AURORA=y +CONFIG_AURORA_NB8800=m +CONFIG_NET_VENDOR_BROADCOM=y +CONFIG_B44=m +CONFIG_B44_PCI_AUTOSELECT=y +CONFIG_B44_PCICORE_AUTOSELECT=y +CONFIG_B44_PCI=y +# CONFIG_BCMGENET is not set +CONFIG_BNX2=m +CONFIG_CNIC=m +CONFIG_TIGON3=m +CONFIG_TIGON3_HWMON=y +CONFIG_BNX2X=m +CONFIG_BNX2X_SRIOV=y +# CONFIG_SYSTEMPORT is not set +CONFIG_BNXT=m +CONFIG_BNXT_SRIOV=y +CONFIG_BNXT_FLOWER_OFFLOAD=y +CONFIG_BNXT_DCB=y +CONFIG_BNXT_HWMON=y +CONFIG_NET_VENDOR_BROCADE=y +CONFIG_BNA=m +CONFIG_NET_VENDOR_CADENCE=y +CONFIG_MACB=m +CONFIG_MACB_USE_HWSTAMP=y +CONFIG_MACB_PCI=m +CONFIG_NET_VENDOR_CAVIUM=y +CONFIG_THUNDER_NIC_PF=m +CONFIG_THUNDER_NIC_VF=m +CONFIG_THUNDER_NIC_BGX=m +CONFIG_THUNDER_NIC_RGX=m +CONFIG_CAVIUM_PTP=m +CONFIG_LIQUIDIO=m +CONFIG_LIQUIDIO_VF=m +CONFIG_NET_VENDOR_CHELSIO=y +CONFIG_CHELSIO_T1=m +CONFIG_CHELSIO_T1_1G=y +CONFIG_CHELSIO_T3=m +CONFIG_CHELSIO_T4=m +CONFIG_CHELSIO_T4_DCB=y +# CONFIG_CHELSIO_T4_FCOE is not set +CONFIG_CHELSIO_T4VF=m +CONFIG_CHELSIO_LIB=m +CONFIG_NET_VENDOR_CISCO=y +CONFIG_ENIC=m +# CONFIG_NET_VENDOR_CORTINA is not set +CONFIG_CX_ECAT=m +CONFIG_DNET=m +CONFIG_NET_VENDOR_DEC=y +CONFIG_NET_TULIP=y +CONFIG_DE2104X=m +CONFIG_DE2104X_DSL=0 +CONFIG_TULIP=m +CONFIG_TULIP_MWI=y +CONFIG_TULIP_MMIO=y +CONFIG_TULIP_NAPI=y +CONFIG_TULIP_NAPI_HW_MITIGATION=y +CONFIG_DE4X5=m +CONFIG_WINBOND_840=m +CONFIG_DM9102=m +CONFIG_ULI526X=m +CONFIG_PCMCIA_XIRCOM=m +CONFIG_NET_VENDOR_DLINK=y +CONFIG_DL2K=m +CONFIG_SUNDANCE=m +# CONFIG_SUNDANCE_MMIO is not set +CONFIG_NET_VENDOR_EMULEX=y +CONFIG_BE2NET=m +CONFIG_BE2NET_HWMON=y +CONFIG_BE2NET_BE2=y +CONFIG_BE2NET_BE3=y +CONFIG_BE2NET_LANCER=y +CONFIG_BE2NET_SKYHAWK=y +CONFIG_NET_VENDOR_EZCHIP=y +CONFIG_NET_VENDOR_FUJITSU=y +CONFIG_PCMCIA_FMVJ18X=m +CONFIG_NET_VENDOR_HP=y +CONFIG_HP100=m +CONFIG_NET_VENDOR_HUAWEI=y +CONFIG_HINIC=m +CONFIG_NET_VENDOR_I825XX=y +CONFIG_NET_VENDOR_INTEL=y +CONFIG_E100=m +CONFIG_E1000=m +CONFIG_E1000E=m +CONFIG_E1000E_HWTS=y +CONFIG_IGB=m +CONFIG_IGB_HWMON=y +CONFIG_IGB_DCA=y +CONFIG_IGBVF=m +CONFIG_IXGB=m +CONFIG_IXGBE=m +CONFIG_IXGBE_HWMON=y +CONFIG_IXGBE_DCA=y +CONFIG_IXGBE_DCB=y +CONFIG_IXGBEVF=m +CONFIG_I40E=m +CONFIG_I40E_DCB=y +CONFIG_I40EVF=m +CONFIG_ICE=m +CONFIG_FM10K=m +CONFIG_JME=m +CONFIG_NET_VENDOR_MARVELL=y +CONFIG_MVMDIO=m +CONFIG_SKGE=m +# CONFIG_SKGE_DEBUG is not set +CONFIG_SKGE_GENESIS=y +CONFIG_SKY2=m +# CONFIG_SKY2_DEBUG is not set +CONFIG_NET_VENDOR_MELLANOX=y +CONFIG_MLX4_EN=m +CONFIG_MLX4_EN_DCB=y +CONFIG_MLX4_CORE=m +CONFIG_MLX4_DEBUG=y +CONFIG_MLX4_CORE_GEN2=y +CONFIG_MLX5_CORE=m +CONFIG_MLX5_ACCEL=y +CONFIG_MLX5_FPGA=y +# CONFIG_MLX5_CORE_EN is not set +CONFIG_MLXSW_CORE=m +CONFIG_MLXSW_CORE_HWMON=y +CONFIG_MLXSW_CORE_THERMAL=y +CONFIG_MLXSW_PCI=m +CONFIG_MLXSW_I2C=m +CONFIG_MLXSW_SWITCHIB=m +CONFIG_MLXSW_SWITCHX2=m +CONFIG_MLXSW_SPECTRUM=m +CONFIG_MLXSW_SPECTRUM_DCB=y +CONFIG_MLXSW_MINIMAL=m +CONFIG_MLXFW=m +CONFIG_NET_VENDOR_MICREL=y +CONFIG_KS8842=m +CONFIG_KS8851=m +CONFIG_KS8851_MLL=m +CONFIG_KSZ884X_PCI=m +CONFIG_NET_VENDOR_MICROCHIP=y +CONFIG_ENC28J60=m +# CONFIG_ENC28J60_WRITEVERIFY is not set +CONFIG_ENCX24J600=m +CONFIG_LAN743X=m +CONFIG_NET_VENDOR_MICROSEMI=y +CONFIG_MSCC_OCELOT_SWITCH=m +CONFIG_MSCC_OCELOT_SWITCH_OCELOT=m +CONFIG_NET_VENDOR_MYRI=y +CONFIG_MYRI10GE=m +CONFIG_MYRI10GE_DCA=y +CONFIG_FEALNX=m +CONFIG_NET_VENDOR_NATSEMI=y +CONFIG_NATSEMI=m +CONFIG_NS83820=m +CONFIG_NET_VENDOR_NETERION=y +CONFIG_S2IO=m +CONFIG_VXGE=m +# CONFIG_VXGE_DEBUG_TRACE_ALL is not set +CONFIG_NET_VENDOR_NETRONOME=y +CONFIG_NFP=m +# CONFIG_NFP_APP_FLOWER is not set +CONFIG_NFP_APP_ABM_NIC=y +# CONFIG_NFP_DEBUG is not set +CONFIG_NET_VENDOR_NI=y +CONFIG_NET_VENDOR_8390=y +CONFIG_PCMCIA_AXNET=m +CONFIG_NE2K_PCI=m +CONFIG_PCMCIA_PCNET=m +CONFIG_NET_VENDOR_NVIDIA=y +CONFIG_FORCEDETH=m +CONFIG_NET_VENDOR_OKI=y +CONFIG_ETHOC=m +CONFIG_NET_VENDOR_PACKET_ENGINES=y +CONFIG_HAMACHI=m +CONFIG_YELLOWFIN=m +CONFIG_NET_VENDOR_QLOGIC=y +CONFIG_QLA3XXX=m +CONFIG_QLCNIC=m +CONFIG_QLCNIC_SRIOV=y +CONFIG_QLCNIC_DCB=y +CONFIG_QLCNIC_HWMON=y +CONFIG_QLGE=m +CONFIG_NETXEN_NIC=m +CONFIG_QED=m +CONFIG_QED_LL2=y +CONFIG_QED_SRIOV=y +CONFIG_QEDE=m +CONFIG_QED_RDMA=y +CONFIG_QED_ISCSI=y +CONFIG_QED_FCOE=y +CONFIG_QED_OOO=y +CONFIG_NET_VENDOR_QUALCOMM=y +CONFIG_QCOM_EMAC=m +CONFIG_RMNET=m +CONFIG_NET_VENDOR_RDC=y +CONFIG_R6040=m +CONFIG_NET_VENDOR_REALTEK=y +CONFIG_ATP=m +CONFIG_8139CP=m +CONFIG_8139TOO=m +# CONFIG_8139TOO_PIO is not set +CONFIG_8139TOO_TUNE_TWISTER=y +CONFIG_8139TOO_8129=y +# CONFIG_8139_OLD_RX_RESET is not set +CONFIG_R8169=m +CONFIG_NET_VENDOR_RENESAS=y +CONFIG_NET_VENDOR_ROCKER=y +CONFIG_ROCKER=m +CONFIG_NET_VENDOR_SAMSUNG=y +CONFIG_SXGBE_ETH=m +CONFIG_NET_VENDOR_SEEQ=y +CONFIG_NET_VENDOR_SOLARFLARE=y +CONFIG_SFC=m +CONFIG_SFC_MTD=y +CONFIG_SFC_MCDI_MON=y +CONFIG_SFC_SRIOV=y +CONFIG_SFC_MCDI_LOGGING=y +CONFIG_SFC_FALCON=m +CONFIG_SFC_FALCON_MTD=y +CONFIG_NET_VENDOR_SILAN=y +CONFIG_SC92031=m +CONFIG_NET_VENDOR_SIS=y +CONFIG_SIS900=m +CONFIG_SIS190=m +CONFIG_NET_VENDOR_SMSC=y +CONFIG_PCMCIA_SMC91C92=m +CONFIG_EPIC100=m +CONFIG_SMSC911X=m +CONFIG_SMSC9420=m +# CONFIG_NET_VENDOR_SOCIONEXT is not set +CONFIG_NET_VENDOR_STMICRO=y +CONFIG_STMMAC_ETH=m +CONFIG_STMMAC_PLATFORM=m +CONFIG_DWMAC_GENERIC=m +CONFIG_STMMAC_PCI=m +CONFIG_NET_VENDOR_SUN=y +CONFIG_HAPPYMEAL=m +CONFIG_SUNGEM=m +CONFIG_CASSINI=m +CONFIG_NIU=m +CONFIG_NET_VENDOR_SYNOPSYS=y +CONFIG_DWC_XLGMAC=m +CONFIG_DWC_XLGMAC_PCI=m +CONFIG_NET_VENDOR_TEHUTI=y +CONFIG_TEHUTI=m +CONFIG_NET_VENDOR_TI=y +CONFIG_TI_CPSW_ALE=m +CONFIG_TLAN=m +CONFIG_NET_VENDOR_VIA=y +CONFIG_VIA_RHINE=m +CONFIG_VIA_RHINE_MMIO=y +CONFIG_VIA_VELOCITY=m +CONFIG_NET_VENDOR_WIZNET=y +CONFIG_WIZNET_W5100=m +CONFIG_WIZNET_W5300=m +# CONFIG_WIZNET_BUS_DIRECT is not set +# CONFIG_WIZNET_BUS_INDIRECT is not set +CONFIG_WIZNET_BUS_ANY=y +CONFIG_WIZNET_W5100_SPI=m +CONFIG_NET_VENDOR_XIRCOM=y +CONFIG_PCMCIA_XIRC2PS=m +CONFIG_FDDI=m +CONFIG_DEFXX=m +# CONFIG_DEFXX_MMIO is not set +CONFIG_SKFP=m +CONFIG_HIPPI=y +CONFIG_ROADRUNNER=m +CONFIG_ROADRUNNER_LARGE_RINGS=y +CONFIG_NET_SB1000=m +CONFIG_MDIO_DEVICE=m +CONFIG_MDIO_BUS=m +# CONFIG_MDIO_BCM_UNIMAC is not set +CONFIG_MDIO_BITBANG=m +CONFIG_MDIO_CAVIUM=m +CONFIG_MDIO_GPIO=m +CONFIG_MDIO_I2C=m +CONFIG_MDIO_MSCC_MIIM=m +CONFIG_MDIO_THUNDER=m +CONFIG_PHYLINK=m +CONFIG_PHYLIB=m +CONFIG_SWPHY=y +CONFIG_LED_TRIGGER_PHY=y + +# +# MII PHY device drivers +# +CONFIG_SFP=m +CONFIG_AMD_PHY=m +CONFIG_AQUANTIA_PHY=m +CONFIG_ASIX_PHY=m +CONFIG_AT803X_PHY=m +CONFIG_BCM7XXX_PHY=m +CONFIG_BCM87XX_PHY=m +CONFIG_BCM_NET_PHYLIB=m +CONFIG_BROADCOM_PHY=m +CONFIG_CICADA_PHY=m +CONFIG_CORTINA_PHY=m +CONFIG_DAVICOM_PHY=m +CONFIG_DP83822_PHY=m +CONFIG_DP83TC811_PHY=m +CONFIG_DP83848_PHY=m +CONFIG_DP83867_PHY=m +CONFIG_FIXED_PHY=m +CONFIG_ICPLUS_PHY=m +CONFIG_INTEL_XWAY_PHY=m +CONFIG_LSI_ET1011C_PHY=m +CONFIG_LXT_PHY=m +CONFIG_MARVELL_PHY=m +CONFIG_MARVELL_10G_PHY=m +CONFIG_MICREL_PHY=m +CONFIG_MICROCHIP_PHY=m +CONFIG_MICROCHIP_T1_PHY=m +CONFIG_MICROSEMI_PHY=m +CONFIG_NATIONAL_PHY=m +CONFIG_QSEMI_PHY=m +CONFIG_REALTEK_PHY=m +CONFIG_RENESAS_PHY=m +CONFIG_ROCKCHIP_PHY=m +CONFIG_SMSC_PHY=m +CONFIG_STE10XP=m +CONFIG_TERANETICS_PHY=m +CONFIG_VITESSE_PHY=m +CONFIG_XILINX_GMII2RGMII=m +CONFIG_MICREL_KS8995MA=m +CONFIG_PLIP=m +CONFIG_PPP=m +CONFIG_PPP_BSDCOMP=m +CONFIG_PPP_DEFLATE=m +CONFIG_PPP_FILTER=y +CONFIG_PPP_MPPE=m +CONFIG_PPP_MULTILINK=y +CONFIG_PPPOATM=m +CONFIG_PPPOE=m +CONFIG_PPTP=m +CONFIG_PPPOL2TP=m +CONFIG_PPP_ASYNC=m +CONFIG_PPP_SYNC_TTY=m +CONFIG_SLIP=m +CONFIG_SLHC=m +CONFIG_SLIP_COMPRESSED=y +CONFIG_SLIP_SMART=y +CONFIG_SLIP_MODE_SLIP6=y + +# +# Host-side USB support is needed for USB Network Adapter support +# +CONFIG_USB_NET_DRIVERS=m +CONFIG_USB_CATC=m +CONFIG_USB_KAWETH=m +CONFIG_USB_PEGASUS=m +CONFIG_USB_RTL8150=m +CONFIG_USB_RTL8152=m +CONFIG_USB_LAN78XX=m +CONFIG_USB_USBNET=m +CONFIG_USB_NET_AX8817X=m +CONFIG_USB_NET_AX88179_178A=m +CONFIG_USB_NET_CDCETHER=m +CONFIG_USB_NET_CDC_EEM=m +CONFIG_USB_NET_CDC_NCM=m +CONFIG_USB_NET_HUAWEI_CDC_NCM=m +CONFIG_USB_NET_CDC_MBIM=m +CONFIG_USB_NET_DM9601=m +CONFIG_USB_NET_SR9700=m +CONFIG_USB_NET_SR9800=m +CONFIG_USB_NET_SMSC75XX=m +CONFIG_USB_NET_SMSC95XX=m +CONFIG_USB_NET_GL620A=m +CONFIG_USB_NET_NET1080=m +CONFIG_USB_NET_PLUSB=m +CONFIG_USB_NET_MCS7830=m +CONFIG_USB_NET_RNDIS_HOST=m +CONFIG_USB_NET_CDC_SUBSET_ENABLE=m +CONFIG_USB_NET_CDC_SUBSET=m +CONFIG_USB_ALI_M5632=y +CONFIG_USB_AN2720=y +CONFIG_USB_BELKIN=y +CONFIG_USB_ARMLINUX=y +CONFIG_USB_EPSON2888=y +CONFIG_USB_KC2190=y +CONFIG_USB_NET_ZAURUS=m +CONFIG_USB_NET_CX82310_ETH=m +CONFIG_USB_NET_KALMIA=m +CONFIG_USB_NET_QMI_WWAN=m +CONFIG_USB_HSO=m +CONFIG_USB_NET_INT51X1=m +CONFIG_USB_CDC_PHONET=m +CONFIG_USB_IPHETH=m +CONFIG_USB_SIERRA_NET=m +CONFIG_USB_VL600=m +CONFIG_USB_NET_CH9200=m +CONFIG_WLAN=y +CONFIG_WLAN_VENDOR_ADMTEK=y +CONFIG_ADM8211=m +CONFIG_ATH_COMMON=m +CONFIG_WLAN_VENDOR_ATH=y +# CONFIG_ATH_DEBUG is not set +CONFIG_ATH5K=m +# CONFIG_ATH5K_DEBUG is not set +# CONFIG_ATH5K_TRACER is not set +CONFIG_ATH5K_PCI=y +CONFIG_ATH9K_HW=m +CONFIG_ATH9K_COMMON=m +CONFIG_ATH9K_BTCOEX_SUPPORT=y +CONFIG_ATH9K=m +CONFIG_ATH9K_PCI=y +CONFIG_ATH9K_AHB=y +# CONFIG_ATH9K_DEBUGFS is not set +CONFIG_ATH9K_DYNACK=y +CONFIG_ATH9K_WOW=y +CONFIG_ATH9K_RFKILL=y +CONFIG_ATH9K_CHANNEL_CONTEXT=y +CONFIG_ATH9K_PCOEM=y +CONFIG_ATH9K_HTC=m +# CONFIG_ATH9K_HTC_DEBUGFS is not set +CONFIG_ATH9K_HWRNG=y +CONFIG_CARL9170=m +CONFIG_CARL9170_LEDS=y +CONFIG_CARL9170_WPC=y +# CONFIG_CARL9170_HWRNG is not set +CONFIG_ATH6KL=m +CONFIG_ATH6KL_SDIO=m +CONFIG_ATH6KL_USB=m +# CONFIG_ATH6KL_DEBUG is not set +# CONFIG_ATH6KL_TRACING is not set +CONFIG_AR5523=m +CONFIG_WIL6210=m +CONFIG_WIL6210_ISR_COR=y +CONFIG_WIL6210_TRACING=y +CONFIG_WIL6210_DEBUGFS=y +CONFIG_ATH10K=m +CONFIG_ATH10K_CE=y +CONFIG_ATH10K_PCI=m +CONFIG_ATH10K_SDIO=m +CONFIG_ATH10K_USB=m +# CONFIG_ATH10K_DEBUG is not set +# CONFIG_ATH10K_DEBUGFS is not set +# CONFIG_ATH10K_TRACING is not set +CONFIG_WCN36XX=m +# CONFIG_WCN36XX_DEBUGFS is not set +CONFIG_WLAN_VENDOR_ATMEL=y +CONFIG_ATMEL=m +CONFIG_PCI_ATMEL=m +CONFIG_PCMCIA_ATMEL=m +CONFIG_AT76C50X_USB=m +CONFIG_WLAN_VENDOR_BROADCOM=y +CONFIG_B43=m +CONFIG_B43_BCMA=y +CONFIG_B43_SSB=y +CONFIG_B43_BUSES_BCMA_AND_SSB=y +# CONFIG_B43_BUSES_BCMA is not set +# CONFIG_B43_BUSES_SSB is not set +CONFIG_B43_PCI_AUTOSELECT=y +CONFIG_B43_PCICORE_AUTOSELECT=y +CONFIG_B43_SDIO=y +CONFIG_B43_BCMA_PIO=y +CONFIG_B43_PIO=y +CONFIG_B43_PHY_G=y +CONFIG_B43_PHY_N=y +CONFIG_B43_PHY_LP=y +CONFIG_B43_PHY_HT=y +CONFIG_B43_LEDS=y +CONFIG_B43_HWRNG=y +# CONFIG_B43_DEBUG is not set +CONFIG_B43LEGACY=m +CONFIG_B43LEGACY_PCI_AUTOSELECT=y +CONFIG_B43LEGACY_PCICORE_AUTOSELECT=y +CONFIG_B43LEGACY_LEDS=y +CONFIG_B43LEGACY_HWRNG=y +# CONFIG_B43LEGACY_DEBUG is not set +CONFIG_B43LEGACY_DMA=y +CONFIG_B43LEGACY_PIO=y +CONFIG_B43LEGACY_DMA_AND_PIO_MODE=y +# CONFIG_B43LEGACY_DMA_MODE is not set +# CONFIG_B43LEGACY_PIO_MODE is not set +CONFIG_BRCMUTIL=m +CONFIG_BRCMSMAC=m +CONFIG_BRCMFMAC=m +CONFIG_BRCMFMAC_PROTO_BCDC=y +CONFIG_BRCMFMAC_PROTO_MSGBUF=y +CONFIG_BRCMFMAC_SDIO=y +CONFIG_BRCMFMAC_USB=y +CONFIG_BRCMFMAC_PCIE=y +CONFIG_BRCM_TRACING=y +# CONFIG_BRCMDBG is not set +CONFIG_WLAN_VENDOR_CISCO=y +CONFIG_AIRO=m +CONFIG_AIRO_CS=m +CONFIG_WLAN_VENDOR_INTEL=y +CONFIG_IPW2100=m +CONFIG_IPW2100_MONITOR=y +# CONFIG_IPW2100_DEBUG is not set +CONFIG_IPW2200=m +CONFIG_IPW2200_MONITOR=y +CONFIG_IPW2200_RADIOTAP=y +CONFIG_IPW2200_PROMISCUOUS=y +CONFIG_IPW2200_QOS=y +# CONFIG_IPW2200_DEBUG is not set +CONFIG_LIBIPW=m +# CONFIG_LIBIPW_DEBUG is not set +CONFIG_IWLEGACY=m +CONFIG_IWL4965=m +CONFIG_IWL3945=m + +# +# iwl3945 / iwl4965 Debugging Options +# +# CONFIG_IWLEGACY_DEBUG is not set +CONFIG_IWLWIFI=m +CONFIG_IWLWIFI_LEDS=y +CONFIG_IWLDVM=m +CONFIG_IWLMVM=m +CONFIG_IWLWIFI_OPMODE_MODULAR=y +CONFIG_IWLWIFI_BCAST_FILTERING=y + +# +# Debugging Options +# +# CONFIG_IWLWIFI_DEBUG is not set +CONFIG_IWLWIFI_DEVICE_TRACING=y +CONFIG_WLAN_VENDOR_INTERSIL=y +CONFIG_HOSTAP=m +CONFIG_HOSTAP_FIRMWARE=y +CONFIG_HOSTAP_FIRMWARE_NVRAM=y +CONFIG_HOSTAP_PLX=m +CONFIG_HOSTAP_PCI=m +CONFIG_HOSTAP_CS=m +CONFIG_HERMES=m +CONFIG_HERMES_PRISM=y +CONFIG_HERMES_CACHE_FW_ON_INIT=y +CONFIG_PLX_HERMES=m +CONFIG_TMD_HERMES=m +CONFIG_NORTEL_HERMES=m +CONFIG_PCI_HERMES=m +CONFIG_PCMCIA_HERMES=m +CONFIG_PCMCIA_SPECTRUM=m +CONFIG_ORINOCO_USB=m +CONFIG_P54_COMMON=m +CONFIG_P54_USB=m +CONFIG_P54_PCI=m +CONFIG_P54_SPI=m +CONFIG_P54_SPI_DEFAULT_EEPROM=y +CONFIG_P54_LEDS=y +CONFIG_PRISM54=m +CONFIG_WLAN_VENDOR_MARVELL=y +CONFIG_LIBERTAS=m +CONFIG_LIBERTAS_USB=m +CONFIG_LIBERTAS_CS=m +CONFIG_LIBERTAS_SDIO=m +CONFIG_LIBERTAS_SPI=m +# CONFIG_LIBERTAS_DEBUG is not set +CONFIG_LIBERTAS_MESH=y +CONFIG_LIBERTAS_THINFIRM=m +# CONFIG_LIBERTAS_THINFIRM_DEBUG is not set +CONFIG_LIBERTAS_THINFIRM_USB=m +CONFIG_MWIFIEX=m +CONFIG_MWIFIEX_SDIO=m +CONFIG_MWIFIEX_PCIE=m +CONFIG_MWIFIEX_USB=m +CONFIG_MWL8K=m +CONFIG_WLAN_VENDOR_MEDIATEK=y +CONFIG_MT7601U=m +CONFIG_MT76_CORE=m +CONFIG_MT76_LEDS=y +CONFIG_MT76x2_COMMON=m +# CONFIG_MT76x0U is not set +CONFIG_MT76x2E=m +# CONFIG_MT76x2U is not set +CONFIG_WLAN_VENDOR_RALINK=y +CONFIG_RT2X00=m +CONFIG_RT2400PCI=m +CONFIG_RT2500PCI=m +CONFIG_RT61PCI=m +CONFIG_RT2800PCI=m +CONFIG_RT2800PCI_RT33XX=y +CONFIG_RT2800PCI_RT35XX=y +CONFIG_RT2800PCI_RT53XX=y +CONFIG_RT2800PCI_RT3290=y +CONFIG_RT2500USB=m +CONFIG_RT73USB=m +CONFIG_RT2800USB=m +CONFIG_RT2800USB_RT33XX=y +CONFIG_RT2800USB_RT35XX=y +CONFIG_RT2800USB_RT3573=y +CONFIG_RT2800USB_RT53XX=y +CONFIG_RT2800USB_RT55XX=y +CONFIG_RT2800USB_UNKNOWN=y +CONFIG_RT2800_LIB=m +CONFIG_RT2800_LIB_MMIO=m +CONFIG_RT2X00_LIB_MMIO=m +CONFIG_RT2X00_LIB_PCI=m +CONFIG_RT2X00_LIB_USB=m +CONFIG_RT2X00_LIB=m +CONFIG_RT2X00_LIB_FIRMWARE=y +CONFIG_RT2X00_LIB_CRYPTO=y +CONFIG_RT2X00_LIB_LEDS=y +# CONFIG_RT2X00_DEBUG is not set +CONFIG_WLAN_VENDOR_REALTEK=y +CONFIG_RTL8180=m +CONFIG_RTL8187=m +CONFIG_RTL8187_LEDS=y +CONFIG_RTL_CARDS=m +CONFIG_RTL8192CE=m +CONFIG_RTL8192SE=m +CONFIG_RTL8192DE=m +CONFIG_RTL8723AE=m +CONFIG_RTL8723BE=m +CONFIG_RTL8188EE=m +CONFIG_RTL8192EE=m +CONFIG_RTL8821AE=m +CONFIG_RTL8192CU=m +CONFIG_RTLWIFI=m +CONFIG_RTLWIFI_PCI=m +CONFIG_RTLWIFI_USB=m +# CONFIG_RTLWIFI_DEBUG is not set +CONFIG_RTL8192C_COMMON=m +CONFIG_RTL8723_COMMON=m +CONFIG_RTLBTCOEXIST=m +CONFIG_RTL8XXXU=m +CONFIG_RTL8XXXU_UNTESTED=y +CONFIG_WLAN_VENDOR_RSI=y +CONFIG_RSI_91X=m +# CONFIG_RSI_DEBUGFS is not set +CONFIG_RSI_SDIO=m +CONFIG_RSI_USB=m +CONFIG_RSI_COEX=y +CONFIG_WLAN_VENDOR_ST=y +CONFIG_CW1200=m +CONFIG_CW1200_WLAN_SDIO=m +CONFIG_CW1200_WLAN_SPI=m +CONFIG_WLAN_VENDOR_TI=y +CONFIG_WL1251=m +CONFIG_WL1251_SPI=m +CONFIG_WL1251_SDIO=m +CONFIG_WL12XX=m +CONFIG_WL18XX=m +CONFIG_WLCORE=m +CONFIG_WLCORE_SDIO=m +CONFIG_WILINK_PLATFORM_DATA=y +CONFIG_WLAN_VENDOR_ZYDAS=y +CONFIG_USB_ZD1201=m +CONFIG_ZD1211RW=m +# CONFIG_ZD1211RW_DEBUG is not set +CONFIG_WLAN_VENDOR_QUANTENNA=y +CONFIG_QTNFMAC=m +CONFIG_QTNFMAC_PEARL_PCIE=m +CONFIG_PCMCIA_RAYCS=m +CONFIG_PCMCIA_WL3501=m +# CONFIG_MAC80211_HWSIM is not set +CONFIG_USB_NET_RNDIS_WLAN=m + +# +# WiMAX Wireless Broadband devices +# +CONFIG_WIMAX_I2400M=m +CONFIG_WIMAX_I2400M_USB=m +CONFIG_WIMAX_I2400M_DEBUG_LEVEL=8 +CONFIG_WAN=y +CONFIG_LANMEDIA=m +CONFIG_HDLC=m +CONFIG_HDLC_RAW=m +CONFIG_HDLC_RAW_ETH=m +CONFIG_HDLC_CISCO=m +CONFIG_HDLC_FR=m +CONFIG_HDLC_PPP=m +CONFIG_HDLC_X25=m +CONFIG_PCI200SYN=m +CONFIG_WANXL=m +CONFIG_PC300TOO=m +CONFIG_FARSYNC=m +CONFIG_DSCC4=m +CONFIG_DSCC4_PCISYNC=y +CONFIG_DSCC4_PCI_RST=y +CONFIG_DLCI=m +CONFIG_DLCI_MAX=8 +CONFIG_LAPBETHER=m +CONFIG_X25_ASY=m +CONFIG_SBNI=m +CONFIG_SBNI_MULTILINE=y +CONFIG_IEEE802154_DRIVERS=m +CONFIG_IEEE802154_FAKELB=m +CONFIG_IEEE802154_AT86RF230=m +# CONFIG_IEEE802154_AT86RF230_DEBUGFS is not set +CONFIG_IEEE802154_MRF24J40=m +CONFIG_IEEE802154_CC2520=m +CONFIG_IEEE802154_ATUSB=m +CONFIG_IEEE802154_ADF7242=m +CONFIG_IEEE802154_CA8210=m +# CONFIG_IEEE802154_CA8210_DEBUGFS is not set +CONFIG_IEEE802154_MCR20A=m +# CONFIG_IEEE802154_HWSIM is not set +CONFIG_VMXNET3=m +CONFIG_FUJITSU_ES=m +CONFIG_THUNDERBOLT_NET=m +CONFIG_HYPERV_NET=m +CONFIG_NETDEVSIM=m +CONFIG_NET_FAILOVER=m +CONFIG_ISDN=y +CONFIG_ISDN_I4L=m +CONFIG_ISDN_PPP=y +CONFIG_ISDN_PPP_VJ=y +CONFIG_ISDN_MPP=y +CONFIG_IPPP_FILTER=y +CONFIG_ISDN_PPP_BSDCOMP=m +CONFIG_ISDN_AUDIO=y +CONFIG_ISDN_TTY_FAX=y +CONFIG_ISDN_X25=y + +# +# ISDN feature submodules +# +CONFIG_ISDN_DIVERSION=m + +# +# ISDN4Linux hardware drivers +# + +# +# Passive cards +# +CONFIG_ISDN_DRV_HISAX=m + +# +# D-channel protocol features +# +CONFIG_HISAX_EURO=y +CONFIG_DE_AOC=y +# CONFIG_HISAX_NO_SENDCOMPLETE is not set +# CONFIG_HISAX_NO_LLC is not set +# CONFIG_HISAX_NO_KEYPAD is not set +CONFIG_HISAX_1TR6=y +CONFIG_HISAX_NI1=y +CONFIG_HISAX_MAX_CARDS=8 + +# +# HiSax supported cards +# +CONFIG_HISAX_16_3=y +CONFIG_HISAX_TELESPCI=y +CONFIG_HISAX_S0BOX=y +CONFIG_HISAX_FRITZPCI=y +CONFIG_HISAX_AVM_A1_PCMCIA=y +CONFIG_HISAX_ELSA=y +CONFIG_HISAX_DIEHLDIVA=y +CONFIG_HISAX_SEDLBAUER=y +CONFIG_HISAX_NETJET=y +CONFIG_HISAX_NETJET_U=y +CONFIG_HISAX_NICCY=y +CONFIG_HISAX_BKM_A4T=y +CONFIG_HISAX_SCT_QUADRO=y +CONFIG_HISAX_GAZEL=y +CONFIG_HISAX_HFC_PCI=y +CONFIG_HISAX_W6692=y +CONFIG_HISAX_HFC_SX=y +CONFIG_HISAX_ENTERNOW_PCI=y +# CONFIG_HISAX_DEBUG is not set + +# +# HiSax PCMCIA card service modules +# +CONFIG_HISAX_SEDLBAUER_CS=m +CONFIG_HISAX_ELSA_CS=m +CONFIG_HISAX_AVM_A1_CS=m +CONFIG_HISAX_TELES_CS=m + +# +# HiSax sub driver modules +# +CONFIG_HISAX_ST5481=m +CONFIG_HISAX_HFCUSB=m +CONFIG_HISAX_HFC4S8S=m +CONFIG_HISAX_FRITZ_PCIPNP=m +CONFIG_ISDN_CAPI=m +CONFIG_CAPI_TRACE=y +CONFIG_ISDN_CAPI_CAPI20=m +CONFIG_ISDN_CAPI_MIDDLEWARE=y +CONFIG_ISDN_CAPI_CAPIDRV=m +# CONFIG_ISDN_CAPI_CAPIDRV_VERBOSE is not set + +# +# CAPI hardware drivers +# +CONFIG_CAPI_AVM=y +CONFIG_ISDN_DRV_AVMB1_B1PCI=m +CONFIG_ISDN_DRV_AVMB1_B1PCIV4=y +CONFIG_ISDN_DRV_AVMB1_B1PCMCIA=m +CONFIG_ISDN_DRV_AVMB1_AVM_CS=m +CONFIG_ISDN_DRV_AVMB1_T1PCI=m +CONFIG_ISDN_DRV_AVMB1_C4=m +CONFIG_CAPI_EICON=y +CONFIG_ISDN_DIVAS=m +CONFIG_ISDN_DIVAS_BRIPCI=y +CONFIG_ISDN_DIVAS_PRIPCI=y +CONFIG_ISDN_DIVAS_DIVACAPI=m +CONFIG_ISDN_DIVAS_USERIDI=m +CONFIG_ISDN_DIVAS_MAINT=m +CONFIG_ISDN_DRV_GIGASET=m +CONFIG_GIGASET_CAPI=y +CONFIG_GIGASET_BASE=m +CONFIG_GIGASET_M105=m +CONFIG_GIGASET_M101=m +# CONFIG_GIGASET_DEBUG is not set +CONFIG_HYSDN=m +CONFIG_HYSDN_CAPI=y +CONFIG_MISDN=m +CONFIG_MISDN_DSP=m +CONFIG_MISDN_L1OIP=m + +# +# mISDN hardware drivers +# +CONFIG_MISDN_HFCPCI=m +CONFIG_MISDN_HFCMULTI=m +CONFIG_MISDN_HFCUSB=m +CONFIG_MISDN_AVMFRITZ=m +CONFIG_MISDN_SPEEDFAX=m +CONFIG_MISDN_INFINEON=m +CONFIG_MISDN_W6692=m +CONFIG_MISDN_NETJET=m +CONFIG_MISDN_IPAC=m +CONFIG_MISDN_ISAR=m +CONFIG_ISDN_HDLC=m +CONFIG_NVM=y +CONFIG_NVM_PBLK=m +# CONFIG_NVM_PBLK_DEBUG is not set + +# +# Input device support +# +CONFIG_INPUT=y +CONFIG_INPUT_LEDS=y +CONFIG_INPUT_FF_MEMLESS=m +CONFIG_INPUT_POLLDEV=m +CONFIG_INPUT_SPARSEKMAP=m +CONFIG_INPUT_MATRIXKMAP=m + +# +# Userland interfaces +# +CONFIG_INPUT_MOUSEDEV=y +CONFIG_INPUT_MOUSEDEV_PSAUX=y +CONFIG_INPUT_MOUSEDEV_SCREEN_X=1024 +CONFIG_INPUT_MOUSEDEV_SCREEN_Y=768 +CONFIG_INPUT_JOYDEV=m +CONFIG_INPUT_EVDEV=m +# CONFIG_INPUT_EVBUG is not set + +# +# Input Device Drivers +# +CONFIG_INPUT_KEYBOARD=y +CONFIG_KEYBOARD_ADC=m +CONFIG_KEYBOARD_ADP5588=m +CONFIG_KEYBOARD_ADP5589=m +CONFIG_KEYBOARD_ATKBD=y +CONFIG_KEYBOARD_QT1070=m +CONFIG_KEYBOARD_QT2160=m +CONFIG_KEYBOARD_DLINK_DIR685=m +CONFIG_KEYBOARD_LKKBD=m +CONFIG_KEYBOARD_GPIO=m +CONFIG_KEYBOARD_GPIO_POLLED=m +CONFIG_KEYBOARD_TCA6416=m +CONFIG_KEYBOARD_TCA8418=m +CONFIG_KEYBOARD_MATRIX=m +CONFIG_KEYBOARD_LM8323=m +CONFIG_KEYBOARD_LM8333=m +CONFIG_KEYBOARD_MAX7359=m +CONFIG_KEYBOARD_MCS=m +CONFIG_KEYBOARD_MPR121=m +CONFIG_KEYBOARD_NEWTON=m +CONFIG_KEYBOARD_OPENCORES=m +CONFIG_KEYBOARD_SAMSUNG=m +CONFIG_KEYBOARD_STOWAWAY=m +CONFIG_KEYBOARD_SUNKBD=m +CONFIG_KEYBOARD_TM2_TOUCHKEY=m +CONFIG_KEYBOARD_XTKBD=m +CONFIG_KEYBOARD_CROS_EC=m +CONFIG_KEYBOARD_MTK_PMIC=m +CONFIG_INPUT_MOUSE=y +CONFIG_MOUSE_PS2=m +CONFIG_MOUSE_PS2_ALPS=y +CONFIG_MOUSE_PS2_BYD=y +CONFIG_MOUSE_PS2_LOGIPS2PP=y +CONFIG_MOUSE_PS2_SYNAPTICS=y +CONFIG_MOUSE_PS2_SYNAPTICS_SMBUS=y +CONFIG_MOUSE_PS2_CYPRESS=y +CONFIG_MOUSE_PS2_LIFEBOOK=y +CONFIG_MOUSE_PS2_TRACKPOINT=y +CONFIG_MOUSE_PS2_ELANTECH=y +CONFIG_MOUSE_PS2_ELANTECH_SMBUS=y +CONFIG_MOUSE_PS2_SENTELIC=y +CONFIG_MOUSE_PS2_TOUCHKIT=y +CONFIG_MOUSE_PS2_FOCALTECH=y +# CONFIG_MOUSE_PS2_VMMOUSE is not set +CONFIG_MOUSE_PS2_SMBUS=y +CONFIG_MOUSE_SERIAL=m +CONFIG_MOUSE_APPLETOUCH=m +CONFIG_MOUSE_BCM5974=m +CONFIG_MOUSE_CYAPA=m +CONFIG_MOUSE_ELAN_I2C=m +CONFIG_MOUSE_ELAN_I2C_I2C=y +CONFIG_MOUSE_ELAN_I2C_SMBUS=y +CONFIG_MOUSE_VSXXXAA=m +CONFIG_MOUSE_GPIO=m +CONFIG_MOUSE_SYNAPTICS_I2C=m +CONFIG_MOUSE_SYNAPTICS_USB=m +CONFIG_INPUT_JOYSTICK=y +CONFIG_JOYSTICK_ANALOG=m +CONFIG_JOYSTICK_A3D=m +CONFIG_JOYSTICK_ADI=m +CONFIG_JOYSTICK_COBRA=m +CONFIG_JOYSTICK_GF2K=m +CONFIG_JOYSTICK_GRIP=m +CONFIG_JOYSTICK_GRIP_MP=m +CONFIG_JOYSTICK_GUILLEMOT=m +CONFIG_JOYSTICK_INTERACT=m +CONFIG_JOYSTICK_SIDEWINDER=m +CONFIG_JOYSTICK_TMDC=m +CONFIG_JOYSTICK_IFORCE=m +CONFIG_JOYSTICK_IFORCE_USB=y +CONFIG_JOYSTICK_IFORCE_232=y +CONFIG_JOYSTICK_WARRIOR=m +CONFIG_JOYSTICK_MAGELLAN=m +CONFIG_JOYSTICK_SPACEORB=m +CONFIG_JOYSTICK_SPACEBALL=m +CONFIG_JOYSTICK_STINGER=m +CONFIG_JOYSTICK_TWIDJOY=m +CONFIG_JOYSTICK_ZHENHUA=m +CONFIG_JOYSTICK_DB9=m +CONFIG_JOYSTICK_GAMECON=m +CONFIG_JOYSTICK_TURBOGRAFX=m +CONFIG_JOYSTICK_AS5011=m +# CONFIG_JOYSTICK_JOYDUMP is not set +CONFIG_JOYSTICK_XPAD=m +CONFIG_JOYSTICK_XPAD_FF=y +CONFIG_JOYSTICK_XPAD_LEDS=y +CONFIG_JOYSTICK_WALKERA0701=m +CONFIG_JOYSTICK_PSXPAD_SPI=m +CONFIG_JOYSTICK_PSXPAD_SPI_FF=y +CONFIG_JOYSTICK_PXRC=m +CONFIG_INPUT_TABLET=y +CONFIG_TABLET_USB_ACECAD=m +CONFIG_TABLET_USB_AIPTEK=m +CONFIG_TABLET_USB_GTCO=m +CONFIG_TABLET_USB_HANWANG=m +CONFIG_TABLET_USB_KBTAB=m +CONFIG_TABLET_USB_PEGASUS=m +CONFIG_TABLET_SERIAL_WACOM4=m +CONFIG_INPUT_TOUCHSCREEN=y +CONFIG_TOUCHSCREEN_PROPERTIES=y +CONFIG_TOUCHSCREEN_ADS7846=m +CONFIG_TOUCHSCREEN_AD7877=m +CONFIG_TOUCHSCREEN_AD7879=m +CONFIG_TOUCHSCREEN_AD7879_I2C=m +CONFIG_TOUCHSCREEN_AD7879_SPI=m +# CONFIG_TOUCHSCREEN_ADC is not set +CONFIG_TOUCHSCREEN_ATMEL_MXT=m +# CONFIG_TOUCHSCREEN_ATMEL_MXT_T37 is not set +CONFIG_TOUCHSCREEN_AUO_PIXCIR=m +CONFIG_TOUCHSCREEN_BU21013=m +# CONFIG_TOUCHSCREEN_BU21029 is not set +CONFIG_TOUCHSCREEN_CHIPONE_ICN8505=m +CONFIG_TOUCHSCREEN_CY8CTMG110=m +CONFIG_TOUCHSCREEN_CYTTSP_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP_SPI=m +CONFIG_TOUCHSCREEN_CYTTSP4_CORE=m +CONFIG_TOUCHSCREEN_CYTTSP4_I2C=m +CONFIG_TOUCHSCREEN_CYTTSP4_SPI=m +CONFIG_TOUCHSCREEN_DA9052=m +CONFIG_TOUCHSCREEN_DYNAPRO=m +CONFIG_TOUCHSCREEN_HAMPSHIRE=m +CONFIG_TOUCHSCREEN_EETI=m +CONFIG_TOUCHSCREEN_EGALAX_SERIAL=m +CONFIG_TOUCHSCREEN_EXC3000=m +CONFIG_TOUCHSCREEN_FUJITSU=m +CONFIG_TOUCHSCREEN_GOODIX=m +CONFIG_TOUCHSCREEN_HIDEEP=m +CONFIG_TOUCHSCREEN_ILI210X=m +CONFIG_TOUCHSCREEN_S6SY761=m +CONFIG_TOUCHSCREEN_GUNZE=m +CONFIG_TOUCHSCREEN_EKTF2127=m +CONFIG_TOUCHSCREEN_ELAN=m +CONFIG_TOUCHSCREEN_ELO=m +CONFIG_TOUCHSCREEN_WACOM_W8001=m +CONFIG_TOUCHSCREEN_WACOM_I2C=m +CONFIG_TOUCHSCREEN_MAX11801=m +CONFIG_TOUCHSCREEN_MCS5000=m +CONFIG_TOUCHSCREEN_MMS114=m +CONFIG_TOUCHSCREEN_MELFAS_MIP4=m +CONFIG_TOUCHSCREEN_MTOUCH=m +CONFIG_TOUCHSCREEN_INEXIO=m +CONFIG_TOUCHSCREEN_MK712=m +CONFIG_TOUCHSCREEN_PENMOUNT=m +CONFIG_TOUCHSCREEN_EDT_FT5X06=m +CONFIG_TOUCHSCREEN_TOUCHRIGHT=m +CONFIG_TOUCHSCREEN_TOUCHWIN=m +CONFIG_TOUCHSCREEN_TI_AM335X_TSC=m +CONFIG_TOUCHSCREEN_UCB1400=m +CONFIG_TOUCHSCREEN_PIXCIR=m +CONFIG_TOUCHSCREEN_WDT87XX_I2C=m +CONFIG_TOUCHSCREEN_WM831X=m +CONFIG_TOUCHSCREEN_WM97XX=m +CONFIG_TOUCHSCREEN_WM9705=y +CONFIG_TOUCHSCREEN_WM9712=y +CONFIG_TOUCHSCREEN_WM9713=y +CONFIG_TOUCHSCREEN_USB_COMPOSITE=m +CONFIG_TOUCHSCREEN_MC13783=m +CONFIG_TOUCHSCREEN_USB_EGALAX=y +CONFIG_TOUCHSCREEN_USB_PANJIT=y +CONFIG_TOUCHSCREEN_USB_3M=y +CONFIG_TOUCHSCREEN_USB_ITM=y +CONFIG_TOUCHSCREEN_USB_ETURBO=y +CONFIG_TOUCHSCREEN_USB_GUNZE=y +CONFIG_TOUCHSCREEN_USB_DMC_TSC10=y +CONFIG_TOUCHSCREEN_USB_IRTOUCH=y +CONFIG_TOUCHSCREEN_USB_IDEALTEK=y +CONFIG_TOUCHSCREEN_USB_GENERAL_TOUCH=y +CONFIG_TOUCHSCREEN_USB_GOTOP=y +CONFIG_TOUCHSCREEN_USB_JASTEC=y +CONFIG_TOUCHSCREEN_USB_ELO=y +CONFIG_TOUCHSCREEN_USB_E2I=y +CONFIG_TOUCHSCREEN_USB_ZYTRONIC=y +CONFIG_TOUCHSCREEN_USB_ETT_TC45USB=y +CONFIG_TOUCHSCREEN_USB_NEXIO=y +CONFIG_TOUCHSCREEN_USB_EASYTOUCH=y +CONFIG_TOUCHSCREEN_TOUCHIT213=m +CONFIG_TOUCHSCREEN_TSC_SERIO=m +CONFIG_TOUCHSCREEN_TSC200X_CORE=m +CONFIG_TOUCHSCREEN_TSC2004=m +CONFIG_TOUCHSCREEN_TSC2005=m +CONFIG_TOUCHSCREEN_TSC2007=m +CONFIG_TOUCHSCREEN_TSC2007_IIO=y +CONFIG_TOUCHSCREEN_PCAP=m +CONFIG_TOUCHSCREEN_RM_TS=m +CONFIG_TOUCHSCREEN_SILEAD=m +CONFIG_TOUCHSCREEN_SIS_I2C=m +CONFIG_TOUCHSCREEN_ST1232=m +CONFIG_TOUCHSCREEN_STMFTS=m +CONFIG_TOUCHSCREEN_SUR40=m +CONFIG_TOUCHSCREEN_SURFACE3_SPI=m +CONFIG_TOUCHSCREEN_SX8654=m +CONFIG_TOUCHSCREEN_TPS6507X=m +CONFIG_TOUCHSCREEN_ZET6223=m +CONFIG_TOUCHSCREEN_ZFORCE=m +CONFIG_TOUCHSCREEN_ROHM_BU21023=m +CONFIG_INPUT_MISC=y +CONFIG_INPUT_88PM80X_ONKEY=m +CONFIG_INPUT_AD714X=m +CONFIG_INPUT_AD714X_I2C=m +CONFIG_INPUT_AD714X_SPI=m +CONFIG_INPUT_ARIZONA_HAPTICS=m +CONFIG_INPUT_BMA150=m +CONFIG_INPUT_E3X0_BUTTON=m +CONFIG_INPUT_PCSPKR=m +CONFIG_INPUT_MAX77693_HAPTIC=m +CONFIG_INPUT_MC13783_PWRBUTTON=m +CONFIG_INPUT_MMA8450=m +CONFIG_INPUT_APANEL=m +CONFIG_INPUT_GP2A=m +CONFIG_INPUT_GPIO_BEEPER=m +CONFIG_INPUT_GPIO_DECODER=m +CONFIG_INPUT_ATLAS_BTNS=m +CONFIG_INPUT_ATI_REMOTE2=m +CONFIG_INPUT_KEYSPAN_REMOTE=m +CONFIG_INPUT_KXTJ9=m +CONFIG_INPUT_KXTJ9_POLLED_MODE=y +CONFIG_INPUT_POWERMATE=m +CONFIG_INPUT_YEALINK=m +CONFIG_INPUT_CM109=m +CONFIG_INPUT_REGULATOR_HAPTIC=m +CONFIG_INPUT_RETU_PWRBUTTON=m +CONFIG_INPUT_AXP20X_PEK=m +CONFIG_INPUT_UINPUT=m +CONFIG_INPUT_PCF50633_PMU=m +CONFIG_INPUT_PCF8574=m +CONFIG_INPUT_PWM_BEEPER=m +CONFIG_INPUT_PWM_VIBRA=m +CONFIG_INPUT_GPIO_ROTARY_ENCODER=m +CONFIG_INPUT_DA9052_ONKEY=m +CONFIG_INPUT_DA9063_ONKEY=m +CONFIG_INPUT_WM831X_ON=m +CONFIG_INPUT_PCAP=m +CONFIG_INPUT_ADXL34X=m +CONFIG_INPUT_ADXL34X_I2C=m +CONFIG_INPUT_ADXL34X_SPI=m +CONFIG_INPUT_IMS_PCU=m +CONFIG_INPUT_CMA3000=m +CONFIG_INPUT_CMA3000_I2C=m +CONFIG_INPUT_IDEAPAD_SLIDEBAR=m +CONFIG_INPUT_SOC_BUTTON_ARRAY=m +CONFIG_INPUT_DRV260X_HAPTICS=m +CONFIG_INPUT_DRV2665_HAPTICS=m +CONFIG_INPUT_DRV2667_HAPTICS=m +CONFIG_INPUT_RAVE_SP_PWRBUTTON=m +CONFIG_RMI4_CORE=m +CONFIG_RMI4_I2C=m +CONFIG_RMI4_SPI=m +CONFIG_RMI4_SMB=m +CONFIG_RMI4_F03=y +CONFIG_RMI4_F03_SERIO=m +CONFIG_RMI4_2D_SENSOR=y +CONFIG_RMI4_F11=y +CONFIG_RMI4_F12=y +CONFIG_RMI4_F30=y +CONFIG_RMI4_F34=y +CONFIG_RMI4_F54=y +CONFIG_RMI4_F55=y + +# +# Hardware I/O ports +# +CONFIG_SERIO=y +CONFIG_ARCH_MIGHT_HAVE_PC_SERIO=y +CONFIG_SERIO_I8042=y +CONFIG_SERIO_SERPORT=m +CONFIG_SERIO_CT82C710=m +CONFIG_SERIO_PARKBD=m +CONFIG_SERIO_PCIPS2=m +CONFIG_SERIO_LIBPS2=y +CONFIG_SERIO_RAW=m +CONFIG_SERIO_ALTERA_PS2=m +CONFIG_SERIO_PS2MULT=m +CONFIG_SERIO_ARC_PS2=m +CONFIG_HYPERV_KEYBOARD=m +CONFIG_SERIO_GPIO_PS2=m +CONFIG_USERIO=m +CONFIG_GAMEPORT=m +CONFIG_GAMEPORT_NS558=m +CONFIG_GAMEPORT_L4=m +CONFIG_GAMEPORT_EMU10K1=m +CONFIG_GAMEPORT_FM801=m + +# +# Character devices +# +CONFIG_TTY=y +CONFIG_VT=y +CONFIG_CONSOLE_TRANSLATIONS=y +CONFIG_VT_CONSOLE=y +CONFIG_VT_CONSOLE_SLEEP=y +CONFIG_HW_CONSOLE=y +CONFIG_VT_HW_CONSOLE_BINDING=y +CONFIG_UNIX98_PTYS=y +CONFIG_LEGACY_PTYS=y +CONFIG_LEGACY_PTY_COUNT=256 +CONFIG_SERIAL_NONSTANDARD=y +CONFIG_ROCKETPORT=m +CONFIG_CYCLADES=m +CONFIG_CYZ_INTR=y +CONFIG_MOXA_INTELLIO=m +CONFIG_MOXA_SMARTIO=m +CONFIG_SYNCLINK=m +CONFIG_SYNCLINKMP=m +CONFIG_SYNCLINK_GT=m +CONFIG_NOZOMI=m +CONFIG_ISI=m +CONFIG_N_HDLC=m +CONFIG_N_GSM=m +CONFIG_TRACE_ROUTER=m +CONFIG_TRACE_SINK=m +CONFIG_DEVMEM=y +# CONFIG_DEVKMEM is not set + +# +# Serial drivers +# +CONFIG_SERIAL_EARLYCON=y +CONFIG_SERIAL_8250=y +# CONFIG_SERIAL_8250_DEPRECATED_OPTIONS is not set +CONFIG_SERIAL_8250_PNP=y +CONFIG_SERIAL_8250_FINTEK=y +CONFIG_SERIAL_8250_CONSOLE=y +CONFIG_SERIAL_8250_DMA=y +CONFIG_SERIAL_8250_PCI=y +CONFIG_SERIAL_8250_EXAR=y +CONFIG_SERIAL_8250_CS=m +CONFIG_SERIAL_8250_MEN_MCB=m +CONFIG_SERIAL_8250_NR_UARTS=4 +CONFIG_SERIAL_8250_RUNTIME_UARTS=4 +CONFIG_SERIAL_8250_EXTENDED=y +CONFIG_SERIAL_8250_MANY_PORTS=y +CONFIG_SERIAL_8250_SHARE_IRQ=y +CONFIG_SERIAL_8250_DETECT_IRQ=y +CONFIG_SERIAL_8250_RSA=y +CONFIG_SERIAL_8250_DW=m +CONFIG_SERIAL_8250_RT288X=y +CONFIG_SERIAL_8250_LPSS=y +CONFIG_SERIAL_8250_MID=y +CONFIG_SERIAL_8250_MOXA=m + +# +# Non-8250 serial port support +# +CONFIG_SERIAL_MAX3100=m +CONFIG_SERIAL_MAX310X=y +CONFIG_SERIAL_UARTLITE=m +CONFIG_SERIAL_UARTLITE_NR_UARTS=1 +CONFIG_SERIAL_CORE=y +CONFIG_SERIAL_CORE_CONSOLE=y +CONFIG_SERIAL_JSM=m +CONFIG_SERIAL_SCCNXP=m +CONFIG_SERIAL_SC16IS7XX_CORE=m +CONFIG_SERIAL_SC16IS7XX=m +CONFIG_SERIAL_SC16IS7XX_I2C=y +CONFIG_SERIAL_SC16IS7XX_SPI=y +CONFIG_SERIAL_ALTERA_JTAGUART=m +CONFIG_SERIAL_ALTERA_UART=m +CONFIG_SERIAL_ALTERA_UART_MAXPORTS=4 +CONFIG_SERIAL_ALTERA_UART_BAUDRATE=115200 +CONFIG_SERIAL_IFX6X60=m +CONFIG_SERIAL_ARC=m +CONFIG_SERIAL_ARC_NR_PORTS=1 +CONFIG_SERIAL_RP2=m +CONFIG_SERIAL_RP2_NR_UARTS=32 +CONFIG_SERIAL_FSL_LPUART=m +CONFIG_SERIAL_MEN_Z135=m +CONFIG_SERIAL_DEV_BUS=m +CONFIG_PRINTER=m +CONFIG_LP_CONSOLE=y +CONFIG_PPDEV=m +CONFIG_HVC_DRIVER=y +CONFIG_VIRTIO_CONSOLE=m +CONFIG_IPMI_HANDLER=m +CONFIG_IPMI_DMI_DECODE=y +CONFIG_IPMI_PANIC_EVENT=y +CONFIG_IPMI_PANIC_STRING=y +CONFIG_IPMI_DEVICE_INTERFACE=m +CONFIG_IPMI_SI=m +CONFIG_IPMI_SSIF=m +CONFIG_IPMI_WATCHDOG=m +CONFIG_IPMI_POWEROFF=m +CONFIG_HW_RANDOM=m +CONFIG_HW_RANDOM_TIMERIOMEM=m +CONFIG_HW_RANDOM_INTEL=m +CONFIG_HW_RANDOM_AMD=m +CONFIG_HW_RANDOM_VIA=m +CONFIG_HW_RANDOM_VIRTIO=m +CONFIG_NVRAM=m +CONFIG_R3964=m +CONFIG_APPLICOM=m + +# +# PCMCIA character devices +# +CONFIG_SYNCLINK_CS=m +CONFIG_CARDMAN_4000=m +CONFIG_CARDMAN_4040=m +CONFIG_SCR24X=m +CONFIG_IPWIRELESS=m +CONFIG_MWAVE=m +CONFIG_RAW_DRIVER=m +CONFIG_MAX_RAW_DEVS=256 +CONFIG_HPET=y +CONFIG_HPET_MMAP=y +CONFIG_HPET_MMAP_DEFAULT=y +CONFIG_HANGCHECK_TIMER=m +CONFIG_TCG_TPM=m +CONFIG_HW_RANDOM_TPM=y +CONFIG_TCG_TIS_CORE=m +CONFIG_TCG_TIS=m +CONFIG_TCG_TIS_SPI=m +CONFIG_TCG_TIS_I2C_ATMEL=m +CONFIG_TCG_TIS_I2C_INFINEON=m +CONFIG_TCG_TIS_I2C_NUVOTON=m +CONFIG_TCG_NSC=m +CONFIG_TCG_ATMEL=m +CONFIG_TCG_INFINEON=m +CONFIG_TCG_CRB=m +CONFIG_TCG_VTPM_PROXY=m +CONFIG_TCG_TIS_ST33ZP24=m +CONFIG_TCG_TIS_ST33ZP24_I2C=m +CONFIG_TCG_TIS_ST33ZP24_SPI=m +CONFIG_TELCLOCK=m +CONFIG_DEVPORT=y +CONFIG_XILLYBUS=m +CONFIG_XILLYBUS_PCIE=m +# CONFIG_RANDOM_TRUST_CPU is not set + +# +# I2C support +# +CONFIG_I2C=m +CONFIG_I2C_BOARDINFO=y +CONFIG_I2C_COMPAT=y +CONFIG_I2C_CHARDEV=m +CONFIG_I2C_MUX=m + +# +# Multiplexer I2C Chip support +# +CONFIG_I2C_MUX_GPIO=m +CONFIG_I2C_MUX_LTC4306=m +CONFIG_I2C_MUX_PCA9541=m +CONFIG_I2C_MUX_PCA954x=m +CONFIG_I2C_MUX_REG=m +CONFIG_I2C_MUX_MLXCPLD=m +CONFIG_I2C_HELPER_AUTO=y +CONFIG_I2C_SMBUS=m +CONFIG_I2C_ALGOBIT=m +CONFIG_I2C_ALGOPCA=m + +# +# I2C Hardware Bus support +# + +# +# PC SMBus host controller drivers +# +CONFIG_I2C_ALI1535=m +CONFIG_I2C_ALI1563=m +CONFIG_I2C_ALI15X3=m +CONFIG_I2C_AMD756=m +CONFIG_I2C_AMD756_S4882=m +CONFIG_I2C_AMD8111=m +CONFIG_I2C_I801=m +CONFIG_I2C_ISCH=m +CONFIG_I2C_ISMT=m +CONFIG_I2C_PIIX4=m +CONFIG_I2C_NFORCE2=m +CONFIG_I2C_NFORCE2_S4985=m +CONFIG_I2C_SIS5595=m +CONFIG_I2C_SIS630=m +CONFIG_I2C_SIS96X=m +CONFIG_I2C_VIA=m +CONFIG_I2C_VIAPRO=m + +# +# ACPI drivers +# +CONFIG_I2C_SCMI=m + +# +# I2C system bus drivers (mostly embedded / system-on-chip) +# +CONFIG_I2C_CBUS_GPIO=m +CONFIG_I2C_DESIGNWARE_CORE=m +CONFIG_I2C_DESIGNWARE_PLATFORM=m +# CONFIG_I2C_DESIGNWARE_SLAVE is not set +CONFIG_I2C_DESIGNWARE_PCI=m +# CONFIG_I2C_DESIGNWARE_BAYTRAIL is not set +CONFIG_I2C_EMEV2=m +CONFIG_I2C_GPIO=m +# CONFIG_I2C_GPIO_FAULT_INJECTOR is not set +CONFIG_I2C_KEMPLD=m +CONFIG_I2C_OCORES=m +CONFIG_I2C_PCA_PLATFORM=m +CONFIG_I2C_SIMTEC=m +CONFIG_I2C_XILINX=m + +# +# External I2C/SMBus adapter drivers +# +CONFIG_I2C_DIOLAN_U2C=m +CONFIG_I2C_DLN2=m +CONFIG_I2C_PARPORT=m +CONFIG_I2C_PARPORT_LIGHT=m +CONFIG_I2C_ROBOTFUZZ_OSIF=m +CONFIG_I2C_TAOS_EVM=m +CONFIG_I2C_TINY_USB=m +CONFIG_I2C_VIPERBOARD=m + +# +# Other I2C/SMBus bus drivers +# +CONFIG_I2C_MLXCPLD=m +CONFIG_I2C_CROS_EC_TUNNEL=m +# CONFIG_I2C_STUB is not set +CONFIG_I2C_SLAVE=y +CONFIG_I2C_SLAVE_EEPROM=m +# CONFIG_I2C_DEBUG_CORE is not set +# CONFIG_I2C_DEBUG_ALGO is not set +# CONFIG_I2C_DEBUG_BUS is not set +CONFIG_SPI=y +# CONFIG_SPI_DEBUG is not set +CONFIG_SPI_MASTER=y +CONFIG_SPI_MEM=y + +# +# SPI Master Controller Drivers +# +CONFIG_SPI_ALTERA=m +CONFIG_SPI_AXI_SPI_ENGINE=m +CONFIG_SPI_BITBANG=m +CONFIG_SPI_BUTTERFLY=m +CONFIG_SPI_CADENCE=m +CONFIG_SPI_DESIGNWARE=m +CONFIG_SPI_DW_PCI=m +CONFIG_SPI_DW_MID_DMA=y +CONFIG_SPI_DW_MMIO=m +CONFIG_SPI_DLN2=m +CONFIG_SPI_GPIO=m +CONFIG_SPI_LM70_LLP=m +CONFIG_SPI_OC_TINY=m +CONFIG_SPI_PXA2XX=m +CONFIG_SPI_PXA2XX_PCI=m +CONFIG_SPI_ROCKCHIP=m +CONFIG_SPI_SC18IS602=m +CONFIG_SPI_XCOMM=m +CONFIG_SPI_XILINX=m +CONFIG_SPI_ZYNQMP_GQSPI=m + +# +# SPI Protocol Masters +# +CONFIG_SPI_SPIDEV=m +CONFIG_SPI_LOOPBACK_TEST=m +CONFIG_SPI_TLE62X0=m +CONFIG_SPI_SLAVE=y +CONFIG_SPI_SLAVE_TIME=m +CONFIG_SPI_SLAVE_SYSTEM_CONTROL=m +CONFIG_SPMI=m +CONFIG_HSI=m +CONFIG_HSI_BOARDINFO=y + +# +# HSI controllers +# + +# +# HSI clients +# +CONFIG_HSI_CHAR=m +CONFIG_PPS=y +# CONFIG_PPS_DEBUG is not set +# CONFIG_NTP_PPS is not set + +# +# PPS clients support +# +# CONFIG_PPS_CLIENT_KTIMER is not set +CONFIG_PPS_CLIENT_LDISC=m +CONFIG_PPS_CLIENT_PARPORT=m +CONFIG_PPS_CLIENT_GPIO=m + +# +# PPS generators support +# + +# +# PTP clock support +# +CONFIG_PTP_1588_CLOCK=y +CONFIG_DP83640_PHY=m +CONFIG_PTP_1588_CLOCK_KVM=m +CONFIG_PINCTRL=y +CONFIG_PINMUX=y +CONFIG_PINCONF=y +CONFIG_GENERIC_PINCONF=y +# CONFIG_DEBUG_PINCTRL is not set +CONFIG_PINCTRL_AMD=m +CONFIG_PINCTRL_MCP23S08=m +CONFIG_PINCTRL_BAYTRAIL=y +CONFIG_PINCTRL_CHERRYVIEW=m +CONFIG_PINCTRL_INTEL=m +CONFIG_PINCTRL_BROXTON=m +CONFIG_PINCTRL_CANNONLAKE=m +CONFIG_PINCTRL_CEDARFORK=m +CONFIG_PINCTRL_DENVERTON=m +CONFIG_PINCTRL_GEMINILAKE=m +# CONFIG_PINCTRL_ICELAKE is not set +CONFIG_PINCTRL_LEWISBURG=m +CONFIG_PINCTRL_SUNRISEPOINT=m +CONFIG_GPIOLIB=y +CONFIG_GPIOLIB_FASTPATH_LIMIT=512 +CONFIG_GPIO_ACPI=y +CONFIG_GPIOLIB_IRQCHIP=y +# CONFIG_DEBUG_GPIO is not set +CONFIG_GPIO_SYSFS=y +CONFIG_GPIO_GENERIC=m +CONFIG_GPIO_MAX730X=m + +# +# Memory mapped GPIO drivers +# +CONFIG_GPIO_AMDPT=m +CONFIG_GPIO_DWAPB=m +CONFIG_GPIO_EXAR=m +CONFIG_GPIO_GENERIC_PLATFORM=m +CONFIG_GPIO_ICH=m +CONFIG_GPIO_LYNXPOINT=y +CONFIG_GPIO_MB86S7X=m +CONFIG_GPIO_MENZ127=m +CONFIG_GPIO_MOCKUP=m +CONFIG_GPIO_VX855=m + +# +# Port-mapped I/O GPIO drivers +# +CONFIG_GPIO_F7188X=m +CONFIG_GPIO_IT87=m +CONFIG_GPIO_SCH=m +CONFIG_GPIO_SCH311X=m +CONFIG_GPIO_WINBOND=m +CONFIG_GPIO_WS16C48=m + +# +# I2C GPIO expanders +# +CONFIG_GPIO_ADP5588=m +CONFIG_GPIO_MAX7300=m +CONFIG_GPIO_MAX732X=m +CONFIG_GPIO_PCA953X=m +CONFIG_GPIO_PCF857X=m +CONFIG_GPIO_TPIC2810=m + +# +# MFD GPIO expanders +# +CONFIG_GPIO_ARIZONA=m +CONFIG_GPIO_BD9571MWV=m +CONFIG_GPIO_DA9052=m +CONFIG_GPIO_DLN2=m +CONFIG_GPIO_JANZ_TTL=m +CONFIG_GPIO_KEMPLD=m +CONFIG_GPIO_LP3943=m +CONFIG_GPIO_LP873X=m +CONFIG_GPIO_TPS65086=m +CONFIG_GPIO_TPS65912=m +CONFIG_GPIO_UCB1400=m +CONFIG_GPIO_WHISKEY_COVE=m +CONFIG_GPIO_WM831X=m +CONFIG_GPIO_WM8994=m + +# +# PCI GPIO expanders +# +CONFIG_GPIO_AMD8111=m +CONFIG_GPIO_ML_IOH=m +CONFIG_GPIO_PCI_IDIO_16=m +CONFIG_GPIO_PCIE_IDIO_24=m +CONFIG_GPIO_RDC321X=m + +# +# SPI GPIO expanders +# +CONFIG_GPIO_MAX3191X=m +CONFIG_GPIO_MAX7301=m +CONFIG_GPIO_MC33880=m +CONFIG_GPIO_PISOSR=m +CONFIG_GPIO_XRA1403=m + +# +# USB GPIO expanders +# +CONFIG_GPIO_VIPERBOARD=m +CONFIG_W1=m +CONFIG_W1_CON=y + +# +# 1-wire Bus Masters +# +CONFIG_W1_MASTER_MATROX=m +CONFIG_W1_MASTER_DS2490=m +CONFIG_W1_MASTER_DS2482=m +CONFIG_W1_MASTER_DS1WM=m +CONFIG_W1_MASTER_GPIO=m + +# +# 1-wire Slaves +# +CONFIG_W1_SLAVE_THERM=m +CONFIG_W1_SLAVE_SMEM=m +# CONFIG_W1_SLAVE_DS2405 is not set +CONFIG_W1_SLAVE_DS2408=m +# CONFIG_W1_SLAVE_DS2408_READBACK is not set +CONFIG_W1_SLAVE_DS2413=m +CONFIG_W1_SLAVE_DS2406=m +CONFIG_W1_SLAVE_DS2423=m +CONFIG_W1_SLAVE_DS2805=m +CONFIG_W1_SLAVE_DS2431=m +CONFIG_W1_SLAVE_DS2433=m +CONFIG_W1_SLAVE_DS2433_CRC=y +CONFIG_W1_SLAVE_DS2438=m +CONFIG_W1_SLAVE_DS2780=m +CONFIG_W1_SLAVE_DS2781=m +CONFIG_W1_SLAVE_DS28E04=m +CONFIG_W1_SLAVE_DS28E17=m +CONFIG_POWER_AVS=y +CONFIG_POWER_RESET=y +# CONFIG_POWER_RESET_RESTART is not set +CONFIG_POWER_SUPPLY=y +# CONFIG_POWER_SUPPLY_DEBUG is not set +CONFIG_PDA_POWER=m +CONFIG_GENERIC_ADC_BATTERY=m +CONFIG_WM831X_BACKUP=m +CONFIG_WM831X_POWER=m +# CONFIG_TEST_POWER is not set +# CONFIG_CHARGER_ADP5061 is not set +CONFIG_BATTERY_DS2760=m +CONFIG_BATTERY_DS2780=m +CONFIG_BATTERY_DS2781=m +CONFIG_BATTERY_DS2782=m +CONFIG_BATTERY_SBS=m +CONFIG_CHARGER_SBS=m +CONFIG_MANAGER_SBS=m +CONFIG_BATTERY_BQ27XXX=m +CONFIG_BATTERY_BQ27XXX_I2C=m +CONFIG_BATTERY_BQ27XXX_HDQ=m +# CONFIG_BATTERY_BQ27XXX_DT_UPDATES_NVM is not set +CONFIG_BATTERY_DA9052=m +CONFIG_CHARGER_DA9150=m +CONFIG_BATTERY_DA9150=m +CONFIG_CHARGER_AXP20X=m +CONFIG_BATTERY_AXP20X=m +CONFIG_AXP20X_POWER=m +CONFIG_AXP288_CHARGER=m +CONFIG_AXP288_FUEL_GAUGE=m +CONFIG_BATTERY_MAX17040=m +CONFIG_BATTERY_MAX17042=m +CONFIG_BATTERY_MAX1721X=m +CONFIG_CHARGER_PCF50633=m +CONFIG_CHARGER_ISP1704=m +CONFIG_CHARGER_MAX8903=m +CONFIG_CHARGER_LP8727=m +CONFIG_CHARGER_GPIO=m +CONFIG_CHARGER_MANAGER=y +CONFIG_CHARGER_LTC3651=m +CONFIG_CHARGER_MAX14577=m +CONFIG_CHARGER_MAX77693=m +CONFIG_CHARGER_BQ2415X=m +CONFIG_CHARGER_BQ24190=m +CONFIG_CHARGER_BQ24257=m +CONFIG_CHARGER_BQ24735=m +CONFIG_CHARGER_BQ25890=m +CONFIG_CHARGER_SMB347=m +CONFIG_BATTERY_GAUGE_LTC2941=m +CONFIG_BATTERY_RT5033=m +CONFIG_CHARGER_RT9455=m +# CONFIG_CHARGER_CROS_USBPD is not set +CONFIG_HWMON=m +CONFIG_HWMON_VID=m +# CONFIG_HWMON_DEBUG_CHIP is not set + +# +# Native drivers +# +CONFIG_SENSORS_ABITUGURU=m +CONFIG_SENSORS_ABITUGURU3=m +CONFIG_SENSORS_AD7314=m +CONFIG_SENSORS_AD7414=m +CONFIG_SENSORS_AD7418=m +CONFIG_SENSORS_ADM1021=m +CONFIG_SENSORS_ADM1025=m +CONFIG_SENSORS_ADM1026=m +CONFIG_SENSORS_ADM1029=m +CONFIG_SENSORS_ADM1031=m +CONFIG_SENSORS_ADM9240=m +CONFIG_SENSORS_ADT7X10=m +CONFIG_SENSORS_ADT7310=m +CONFIG_SENSORS_ADT7410=m +CONFIG_SENSORS_ADT7411=m +CONFIG_SENSORS_ADT7462=m +CONFIG_SENSORS_ADT7470=m +CONFIG_SENSORS_ADT7475=m +CONFIG_SENSORS_ASC7621=m +CONFIG_SENSORS_K8TEMP=m +CONFIG_SENSORS_K10TEMP=m +CONFIG_SENSORS_FAM15H_POWER=m +CONFIG_SENSORS_APPLESMC=m +CONFIG_SENSORS_ASB100=m +CONFIG_SENSORS_ASPEED=m +CONFIG_SENSORS_ATXP1=m +CONFIG_SENSORS_DS620=m +CONFIG_SENSORS_DS1621=m +CONFIG_SENSORS_DELL_SMM=m +CONFIG_SENSORS_DA9052_ADC=m +CONFIG_SENSORS_I5K_AMB=m +CONFIG_SENSORS_F71805F=m +CONFIG_SENSORS_F71882FG=m +CONFIG_SENSORS_F75375S=m +CONFIG_SENSORS_MC13783_ADC=m +CONFIG_SENSORS_FSCHMD=m +CONFIG_SENSORS_FTSTEUTATES=m +CONFIG_SENSORS_GL518SM=m +CONFIG_SENSORS_GL520SM=m +CONFIG_SENSORS_G760A=m +CONFIG_SENSORS_G762=m +CONFIG_SENSORS_HIH6130=m +CONFIG_SENSORS_IBMAEM=m +CONFIG_SENSORS_IBMPEX=m +CONFIG_SENSORS_IIO_HWMON=m +CONFIG_SENSORS_I5500=m +CONFIG_SENSORS_CORETEMP=m +CONFIG_SENSORS_IT87=m +CONFIG_SENSORS_JC42=m +CONFIG_SENSORS_POWR1220=m +CONFIG_SENSORS_LINEAGE=m +CONFIG_SENSORS_LTC2945=m +CONFIG_SENSORS_LTC2990=m +CONFIG_SENSORS_LTC4151=m +CONFIG_SENSORS_LTC4215=m +CONFIG_SENSORS_LTC4222=m +CONFIG_SENSORS_LTC4245=m +CONFIG_SENSORS_LTC4260=m +CONFIG_SENSORS_LTC4261=m +CONFIG_SENSORS_MAX1111=m +CONFIG_SENSORS_MAX16065=m +CONFIG_SENSORS_MAX1619=m +CONFIG_SENSORS_MAX1668=m +CONFIG_SENSORS_MAX197=m +CONFIG_SENSORS_MAX31722=m +CONFIG_SENSORS_MAX6621=m +CONFIG_SENSORS_MAX6639=m +CONFIG_SENSORS_MAX6642=m +CONFIG_SENSORS_MAX6650=m +CONFIG_SENSORS_MAX6697=m +CONFIG_SENSORS_MAX31790=m +CONFIG_SENSORS_MCP3021=m +# CONFIG_SENSORS_MLXREG_FAN is not set +CONFIG_SENSORS_TC654=m +CONFIG_SENSORS_MENF21BMC_HWMON=m +CONFIG_SENSORS_ADCXX=m +CONFIG_SENSORS_LM63=m +CONFIG_SENSORS_LM70=m +CONFIG_SENSORS_LM73=m +CONFIG_SENSORS_LM75=m +CONFIG_SENSORS_LM77=m +CONFIG_SENSORS_LM78=m +CONFIG_SENSORS_LM80=m +CONFIG_SENSORS_LM83=m +CONFIG_SENSORS_LM85=m +CONFIG_SENSORS_LM87=m +CONFIG_SENSORS_LM90=m +CONFIG_SENSORS_LM92=m +CONFIG_SENSORS_LM93=m +CONFIG_SENSORS_LM95234=m +CONFIG_SENSORS_LM95241=m +CONFIG_SENSORS_LM95245=m +CONFIG_SENSORS_PC87360=m +CONFIG_SENSORS_PC87427=m +CONFIG_SENSORS_NTC_THERMISTOR=m +CONFIG_SENSORS_NCT6683=m +CONFIG_SENSORS_NCT6775=m +CONFIG_SENSORS_NCT7802=m +CONFIG_SENSORS_NCT7904=m +# CONFIG_SENSORS_NPCM7XX is not set +CONFIG_SENSORS_PCF8591=m +CONFIG_PMBUS=m +CONFIG_SENSORS_PMBUS=m +CONFIG_SENSORS_ADM1275=m +CONFIG_SENSORS_IBM_CFFPS=m +CONFIG_SENSORS_IR35221=m +CONFIG_SENSORS_LM25066=m +CONFIG_SENSORS_LTC2978=m +CONFIG_SENSORS_LTC2978_REGULATOR=y +CONFIG_SENSORS_LTC3815=m +CONFIG_SENSORS_MAX16064=m +CONFIG_SENSORS_MAX20751=m +CONFIG_SENSORS_MAX31785=m +CONFIG_SENSORS_MAX34440=m +CONFIG_SENSORS_MAX8688=m +CONFIG_SENSORS_TPS40422=m +CONFIG_SENSORS_TPS53679=m +CONFIG_SENSORS_UCD9000=m +CONFIG_SENSORS_UCD9200=m +CONFIG_SENSORS_ZL6100=m +CONFIG_SENSORS_SHT15=m +CONFIG_SENSORS_SHT21=m +CONFIG_SENSORS_SHT3x=m +CONFIG_SENSORS_SHTC1=m +CONFIG_SENSORS_SIS5595=m +CONFIG_SENSORS_DME1737=m +CONFIG_SENSORS_EMC1403=m +CONFIG_SENSORS_EMC2103=m +CONFIG_SENSORS_EMC6W201=m +CONFIG_SENSORS_SMSC47M1=m +CONFIG_SENSORS_SMSC47M192=m +CONFIG_SENSORS_SMSC47B397=m +CONFIG_SENSORS_SCH56XX_COMMON=m +CONFIG_SENSORS_SCH5627=m +CONFIG_SENSORS_SCH5636=m +CONFIG_SENSORS_STTS751=m +CONFIG_SENSORS_SMM665=m +CONFIG_SENSORS_ADC128D818=m +CONFIG_SENSORS_ADS1015=m +CONFIG_SENSORS_ADS7828=m +CONFIG_SENSORS_ADS7871=m +CONFIG_SENSORS_AMC6821=m +CONFIG_SENSORS_INA209=m +CONFIG_SENSORS_INA2XX=m +CONFIG_SENSORS_INA3221=m +CONFIG_SENSORS_TC74=m +CONFIG_SENSORS_THMC50=m +CONFIG_SENSORS_TMP102=m +CONFIG_SENSORS_TMP103=m +CONFIG_SENSORS_TMP108=m +CONFIG_SENSORS_TMP401=m +CONFIG_SENSORS_TMP421=m +CONFIG_SENSORS_VIA_CPUTEMP=m +CONFIG_SENSORS_VIA686A=m +CONFIG_SENSORS_VT1211=m +CONFIG_SENSORS_VT8231=m +CONFIG_SENSORS_W83773G=m +CONFIG_SENSORS_W83781D=m +CONFIG_SENSORS_W83791D=m +CONFIG_SENSORS_W83792D=m +CONFIG_SENSORS_W83793=m +CONFIG_SENSORS_W83795=m +# CONFIG_SENSORS_W83795_FANCTRL is not set +CONFIG_SENSORS_W83L785TS=m +CONFIG_SENSORS_W83L786NG=m +CONFIG_SENSORS_W83627HF=m +CONFIG_SENSORS_W83627EHF=m +CONFIG_SENSORS_WM831X=m +CONFIG_SENSORS_XGENE=m + +# +# ACPI drivers +# +CONFIG_SENSORS_ACPI_POWER=m +CONFIG_SENSORS_ATK0110=m +CONFIG_THERMAL=y +# CONFIG_THERMAL_STATISTICS is not set +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 +CONFIG_THERMAL_WRITABLE_TRIPS=y +CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y +# CONFIG_THERMAL_DEFAULT_GOV_FAIR_SHARE is not set +# CONFIG_THERMAL_DEFAULT_GOV_USER_SPACE is not set +# CONFIG_THERMAL_DEFAULT_GOV_POWER_ALLOCATOR is not set +CONFIG_THERMAL_GOV_FAIR_SHARE=y +CONFIG_THERMAL_GOV_STEP_WISE=y +CONFIG_THERMAL_GOV_BANG_BANG=y +CONFIG_THERMAL_GOV_USER_SPACE=y +# CONFIG_THERMAL_GOV_POWER_ALLOCATOR is not set +CONFIG_CLOCK_THERMAL=y +CONFIG_DEVFREQ_THERMAL=y +# CONFIG_THERMAL_EMULATION is not set +CONFIG_INTEL_POWERCLAMP=m +CONFIG_X86_PKG_TEMP_THERMAL=m +CONFIG_INTEL_SOC_DTS_IOSF_CORE=m +CONFIG_INTEL_SOC_DTS_THERMAL=m + +# +# ACPI INT340X thermal drivers +# +CONFIG_INT340X_THERMAL=m +CONFIG_ACPI_THERMAL_REL=m +CONFIG_INT3406_THERMAL=m +CONFIG_INTEL_BXT_PMIC_THERMAL=m +CONFIG_INTEL_PCH_THERMAL=m +CONFIG_GENERIC_ADC_THERMAL=m +CONFIG_WATCHDOG=y +CONFIG_WATCHDOG_CORE=y +# CONFIG_WATCHDOG_NOWAYOUT is not set +CONFIG_WATCHDOG_HANDLE_BOOT_ENABLED=y +# CONFIG_WATCHDOG_SYSFS is not set + +# +# Watchdog Device Drivers +# +CONFIG_SOFT_WATCHDOG=m +# CONFIG_SOFT_WATCHDOG_PRETIMEOUT is not set +CONFIG_DA9052_WATCHDOG=m +CONFIG_DA9063_WATCHDOG=m +CONFIG_DA9062_WATCHDOG=m +CONFIG_MENF21BMC_WATCHDOG=m +# CONFIG_MENZ069_WATCHDOG is not set +CONFIG_WDAT_WDT=m +CONFIG_WM831X_WATCHDOG=m +CONFIG_XILINX_WATCHDOG=m +CONFIG_ZIIRAVE_WATCHDOG=m +CONFIG_RAVE_SP_WATCHDOG=m +CONFIG_CADENCE_WATCHDOG=m +CONFIG_DW_WATCHDOG=m +CONFIG_MAX63XX_WATCHDOG=m +CONFIG_RETU_WATCHDOG=m +CONFIG_ACQUIRE_WDT=m +CONFIG_ADVANTECH_WDT=m +CONFIG_ALIM1535_WDT=m +CONFIG_ALIM7101_WDT=m +CONFIG_EBC_C384_WDT=m +CONFIG_F71808E_WDT=m +# CONFIG_SP5100_TCO is not set +CONFIG_SBC_FITPC2_WATCHDOG=m +CONFIG_EUROTECH_WDT=m +CONFIG_IB700_WDT=m +CONFIG_IBMASR=m +CONFIG_WAFER_WDT=m +CONFIG_I6300ESB_WDT=m +CONFIG_IE6XX_WDT=m +CONFIG_ITCO_WDT=m +CONFIG_ITCO_VENDOR_SUPPORT=y +CONFIG_IT8712F_WDT=m +CONFIG_IT87_WDT=m +CONFIG_HP_WATCHDOG=m +CONFIG_KEMPLD_WDT=m +CONFIG_HPWDT_NMI_DECODING=y +CONFIG_SC1200_WDT=m +CONFIG_PC87413_WDT=m +CONFIG_NV_TCO=m +CONFIG_60XX_WDT=m +CONFIG_CPU5_WDT=m +CONFIG_SMSC_SCH311X_WDT=m +CONFIG_SMSC37B787_WDT=m +CONFIG_VIA_WDT=m +CONFIG_W83627HF_WDT=m +CONFIG_W83877F_WDT=m +CONFIG_W83977F_WDT=m +CONFIG_MACHZ_WDT=m +CONFIG_SBC_EPX_C3_WATCHDOG=m +CONFIG_INTEL_MEI_WDT=m +CONFIG_NI903X_WDT=m +CONFIG_NIC7018_WDT=m +CONFIG_MEN_A21_WDT=m + +# +# PCI-based Watchdog Cards +# +CONFIG_PCIPCWATCHDOG=m +CONFIG_WDTPCI=m + +# +# USB-based Watchdog Cards +# +CONFIG_USBPCWATCHDOG=m + +# +# Watchdog Pretimeout Governors +# +CONFIG_WATCHDOG_PRETIMEOUT_GOV=y +# CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_NOOP is not set +CONFIG_WATCHDOG_PRETIMEOUT_DEFAULT_GOV_PANIC=y +CONFIG_WATCHDOG_PRETIMEOUT_GOV_NOOP=m +CONFIG_WATCHDOG_PRETIMEOUT_GOV_PANIC=y +CONFIG_SSB_POSSIBLE=y +CONFIG_SSB=m +CONFIG_SSB_SPROM=y +CONFIG_SSB_BLOCKIO=y +CONFIG_SSB_PCIHOST_POSSIBLE=y +CONFIG_SSB_PCIHOST=y +CONFIG_SSB_B43_PCI_BRIDGE=y +CONFIG_SSB_PCMCIAHOST_POSSIBLE=y +CONFIG_SSB_PCMCIAHOST=y +CONFIG_SSB_SDIOHOST_POSSIBLE=y +CONFIG_SSB_SDIOHOST=y +CONFIG_SSB_DRIVER_PCICORE_POSSIBLE=y +CONFIG_SSB_DRIVER_PCICORE=y +CONFIG_SSB_DRIVER_GPIO=y +CONFIG_BCMA_POSSIBLE=y +CONFIG_BCMA=m +CONFIG_BCMA_BLOCKIO=y +CONFIG_BCMA_HOST_PCI_POSSIBLE=y +CONFIG_BCMA_HOST_PCI=y +CONFIG_BCMA_HOST_SOC=y +CONFIG_BCMA_DRIVER_PCI=y +CONFIG_BCMA_SFLASH=y +CONFIG_BCMA_DRIVER_GMAC_CMN=y +CONFIG_BCMA_DRIVER_GPIO=y +# CONFIG_BCMA_DEBUG is not set + +# +# Multifunction device drivers +# +CONFIG_MFD_CORE=y +CONFIG_MFD_BCM590XX=m +CONFIG_MFD_BD9571MWV=m +CONFIG_MFD_AXP20X=m +CONFIG_MFD_AXP20X_I2C=m +CONFIG_MFD_CROS_EC=m +# CONFIG_MFD_CROS_EC_CHARDEV is not set +# CONFIG_MFD_MADERA is not set +CONFIG_PMIC_DA9052=y +CONFIG_MFD_DA9052_SPI=y +CONFIG_MFD_DA9062=m +CONFIG_MFD_DA9063=m +CONFIG_MFD_DA9150=m +CONFIG_MFD_DLN2=m +CONFIG_MFD_MC13XXX=m +CONFIG_MFD_MC13XXX_SPI=m +CONFIG_MFD_MC13XXX_I2C=m +CONFIG_HTC_PASIC3=m +CONFIG_MFD_INTEL_QUARK_I2C_GPIO=m +CONFIG_LPC_ICH=m +CONFIG_LPC_SCH=m +CONFIG_INTEL_SOC_PMIC_BXTWC=m +CONFIG_INTEL_SOC_PMIC_CHTDC_TI=m +CONFIG_MFD_INTEL_LPSS=m +CONFIG_MFD_INTEL_LPSS_ACPI=m +CONFIG_MFD_INTEL_LPSS_PCI=m +CONFIG_MFD_JANZ_CMODIO=m +CONFIG_MFD_KEMPLD=m +CONFIG_MFD_88PM800=m +CONFIG_MFD_88PM805=m +CONFIG_MFD_MAX14577=m +CONFIG_MFD_MAX77693=m +CONFIG_MFD_MAX8907=m +CONFIG_MFD_MT6397=m +CONFIG_MFD_MENF21BMC=m +CONFIG_EZX_PCAP=y +CONFIG_MFD_VIPERBOARD=m +CONFIG_MFD_RETU=m +CONFIG_MFD_PCF50633=m +CONFIG_PCF50633_ADC=m +CONFIG_PCF50633_GPIO=m +CONFIG_UCB1400_CORE=m +CONFIG_MFD_RDC321X=m +CONFIG_MFD_RT5033=m +CONFIG_MFD_SI476X_CORE=m +CONFIG_MFD_SM501=m +CONFIG_MFD_SM501_GPIO=y +CONFIG_MFD_SKY81452=m +CONFIG_ABX500_CORE=y +CONFIG_MFD_SYSCON=y +CONFIG_MFD_TI_AM335X_TSCADC=m +CONFIG_MFD_LP3943=m +CONFIG_MFD_TI_LMU=m +CONFIG_TPS6105X=m +CONFIG_TPS65010=m +CONFIG_TPS6507X=m +CONFIG_MFD_TPS65086=m +CONFIG_MFD_TI_LP873X=m +CONFIG_MFD_TPS65912=y +CONFIG_MFD_TPS65912_I2C=m +CONFIG_MFD_TPS65912_SPI=y +CONFIG_MFD_WL1273_CORE=m +CONFIG_MFD_LM3533=m +CONFIG_MFD_VX855=m +CONFIG_MFD_ARIZONA=y +CONFIG_MFD_ARIZONA_I2C=m +CONFIG_MFD_ARIZONA_SPI=m +CONFIG_MFD_CS47L24=y +CONFIG_MFD_WM5102=y +CONFIG_MFD_WM5110=y +CONFIG_MFD_WM8997=y +CONFIG_MFD_WM8998=y +CONFIG_MFD_WM831X=y +CONFIG_MFD_WM831X_SPI=y +CONFIG_MFD_WM8994=m +CONFIG_RAVE_SP_CORE=m +CONFIG_REGULATOR=y +# CONFIG_REGULATOR_DEBUG is not set +CONFIG_REGULATOR_FIXED_VOLTAGE=m +CONFIG_REGULATOR_VIRTUAL_CONSUMER=m +CONFIG_REGULATOR_USERSPACE_CONSUMER=m +CONFIG_REGULATOR_88PG86X=m +CONFIG_REGULATOR_88PM800=m +CONFIG_REGULATOR_ACT8865=m +CONFIG_REGULATOR_AD5398=m +CONFIG_REGULATOR_ANATOP=m +CONFIG_REGULATOR_ARIZONA_LDO1=m +CONFIG_REGULATOR_ARIZONA_MICSUPP=m +CONFIG_REGULATOR_AXP20X=m +CONFIG_REGULATOR_BCM590XX=m +CONFIG_REGULATOR_BD9571MWV=m +CONFIG_REGULATOR_DA9052=m +CONFIG_REGULATOR_DA9062=m +CONFIG_REGULATOR_DA9063=m +CONFIG_REGULATOR_DA9210=m +CONFIG_REGULATOR_DA9211=m +CONFIG_REGULATOR_FAN53555=m +CONFIG_REGULATOR_GPIO=m +CONFIG_REGULATOR_ISL9305=m +CONFIG_REGULATOR_ISL6271A=m +CONFIG_REGULATOR_LM363X=m +CONFIG_REGULATOR_LP3971=m +CONFIG_REGULATOR_LP3972=m +CONFIG_REGULATOR_LP872X=m +CONFIG_REGULATOR_LP8755=m +CONFIG_REGULATOR_LTC3589=m +CONFIG_REGULATOR_LTC3676=m +CONFIG_REGULATOR_MAX14577=m +CONFIG_REGULATOR_MAX1586=m +CONFIG_REGULATOR_MAX8649=m +CONFIG_REGULATOR_MAX8660=m +CONFIG_REGULATOR_MAX8907=m +CONFIG_REGULATOR_MAX8952=m +CONFIG_REGULATOR_MAX77693=m +CONFIG_REGULATOR_MC13XXX_CORE=m +CONFIG_REGULATOR_MC13783=m +CONFIG_REGULATOR_MC13892=m +CONFIG_REGULATOR_MT6311=m +CONFIG_REGULATOR_MT6323=m +CONFIG_REGULATOR_MT6397=m +CONFIG_REGULATOR_PCAP=m +CONFIG_REGULATOR_PCF50633=m +CONFIG_REGULATOR_PFUZE100=m +CONFIG_REGULATOR_PV88060=m +CONFIG_REGULATOR_PV88080=m +CONFIG_REGULATOR_PV88090=m +CONFIG_REGULATOR_PWM=m +CONFIG_REGULATOR_QCOM_SPMI=m +CONFIG_REGULATOR_RT5033=m +CONFIG_REGULATOR_SKY81452=m +CONFIG_REGULATOR_TPS51632=m +CONFIG_REGULATOR_TPS6105X=m +CONFIG_REGULATOR_TPS62360=m +CONFIG_REGULATOR_TPS65023=m +CONFIG_REGULATOR_TPS6507X=m +CONFIG_REGULATOR_TPS65086=m +CONFIG_REGULATOR_TPS65132=m +CONFIG_REGULATOR_TPS6524X=m +CONFIG_REGULATOR_TPS65912=m +CONFIG_REGULATOR_WM831X=m +CONFIG_REGULATOR_WM8994=m +CONFIG_CEC_CORE=m +CONFIG_CEC_NOTIFIER=y +CONFIG_CEC_PIN=y +CONFIG_RC_CORE=m +CONFIG_RC_MAP=m +CONFIG_LIRC=y +CONFIG_RC_DECODERS=y +CONFIG_IR_NEC_DECODER=m +CONFIG_IR_RC5_DECODER=m +CONFIG_IR_RC6_DECODER=m +CONFIG_IR_JVC_DECODER=m +CONFIG_IR_SONY_DECODER=m +CONFIG_IR_SANYO_DECODER=m +CONFIG_IR_SHARP_DECODER=m +CONFIG_IR_MCE_KBD_DECODER=m +CONFIG_IR_XMP_DECODER=m +CONFIG_IR_IMON_DECODER=m +CONFIG_RC_DEVICES=y +CONFIG_RC_ATI_REMOTE=m +CONFIG_IR_ENE=m +CONFIG_IR_IMON=m +CONFIG_IR_IMON_RAW=m +CONFIG_IR_MCEUSB=m +CONFIG_IR_ITE_CIR=m +CONFIG_IR_FINTEK=m +CONFIG_IR_NUVOTON=m +CONFIG_IR_REDRAT3=m +CONFIG_IR_STREAMZAP=m +CONFIG_IR_WINBOND_CIR=m +CONFIG_IR_IGORPLUGUSB=m +CONFIG_IR_IGUANA=m +CONFIG_IR_TTUSBIR=m +CONFIG_RC_LOOPBACK=m +CONFIG_IR_SERIAL=m +CONFIG_IR_SERIAL_TRANSMITTER=y +CONFIG_IR_SIR=m +CONFIG_MEDIA_SUPPORT=m + +# +# Multimedia core support +# +CONFIG_MEDIA_CAMERA_SUPPORT=y +CONFIG_MEDIA_ANALOG_TV_SUPPORT=y +CONFIG_MEDIA_DIGITAL_TV_SUPPORT=y +CONFIG_MEDIA_RADIO_SUPPORT=y +CONFIG_MEDIA_SDR_SUPPORT=y +CONFIG_MEDIA_CEC_SUPPORT=y +CONFIG_MEDIA_CEC_RC=y +# CONFIG_CEC_PIN_ERROR_INJ is not set +CONFIG_MEDIA_CONTROLLER=y +CONFIG_MEDIA_CONTROLLER_DVB=y +CONFIG_VIDEO_DEV=m +CONFIG_VIDEO_V4L2_SUBDEV_API=y +CONFIG_VIDEO_V4L2=m +# CONFIG_VIDEO_ADV_DEBUG is not set +# CONFIG_VIDEO_FIXED_MINOR_RANGES is not set +CONFIG_VIDEO_PCI_SKELETON=m +CONFIG_VIDEO_TUNER=m +CONFIG_V4L2_MEM2MEM_DEV=m +CONFIG_V4L2_FLASH_LED_CLASS=m +CONFIG_V4L2_FWNODE=m +CONFIG_VIDEOBUF_GEN=m +CONFIG_VIDEOBUF_DMA_SG=m +CONFIG_VIDEOBUF_VMALLOC=m +CONFIG_DVB_CORE=m +# CONFIG_DVB_MMAP is not set +CONFIG_DVB_NET=y +CONFIG_TTPCI_EEPROM=m +CONFIG_DVB_MAX_ADAPTERS=8 +CONFIG_DVB_DYNAMIC_MINORS=y +# CONFIG_DVB_DEMUX_SECTION_LOSS_LOG is not set +# CONFIG_DVB_ULE_DEBUG is not set + +# +# Media drivers +# +CONFIG_MEDIA_USB_SUPPORT=y + +# +# Webcam devices +# +CONFIG_USB_VIDEO_CLASS=m +CONFIG_USB_VIDEO_CLASS_INPUT_EVDEV=y +CONFIG_USB_GSPCA=m +CONFIG_USB_M5602=m +CONFIG_USB_STV06XX=m +CONFIG_USB_GL860=m +CONFIG_USB_GSPCA_BENQ=m +CONFIG_USB_GSPCA_CONEX=m +CONFIG_USB_GSPCA_CPIA1=m +CONFIG_USB_GSPCA_DTCS033=m +CONFIG_USB_GSPCA_ETOMS=m +CONFIG_USB_GSPCA_FINEPIX=m +CONFIG_USB_GSPCA_JEILINJ=m +CONFIG_USB_GSPCA_JL2005BCD=m +CONFIG_USB_GSPCA_KINECT=m +CONFIG_USB_GSPCA_KONICA=m +CONFIG_USB_GSPCA_MARS=m +CONFIG_USB_GSPCA_MR97310A=m +CONFIG_USB_GSPCA_NW80X=m +CONFIG_USB_GSPCA_OV519=m +CONFIG_USB_GSPCA_OV534=m +CONFIG_USB_GSPCA_OV534_9=m +CONFIG_USB_GSPCA_PAC207=m +CONFIG_USB_GSPCA_PAC7302=m +CONFIG_USB_GSPCA_PAC7311=m +CONFIG_USB_GSPCA_SE401=m +CONFIG_USB_GSPCA_SN9C2028=m +CONFIG_USB_GSPCA_SN9C20X=m +CONFIG_USB_GSPCA_SONIXB=m +CONFIG_USB_GSPCA_SONIXJ=m +CONFIG_USB_GSPCA_SPCA500=m +CONFIG_USB_GSPCA_SPCA501=m +CONFIG_USB_GSPCA_SPCA505=m +CONFIG_USB_GSPCA_SPCA506=m +CONFIG_USB_GSPCA_SPCA508=m +CONFIG_USB_GSPCA_SPCA561=m +CONFIG_USB_GSPCA_SPCA1528=m +CONFIG_USB_GSPCA_SQ905=m +CONFIG_USB_GSPCA_SQ905C=m +CONFIG_USB_GSPCA_SQ930X=m +CONFIG_USB_GSPCA_STK014=m +CONFIG_USB_GSPCA_STK1135=m +CONFIG_USB_GSPCA_STV0680=m +CONFIG_USB_GSPCA_SUNPLUS=m +CONFIG_USB_GSPCA_T613=m +CONFIG_USB_GSPCA_TOPRO=m +CONFIG_USB_GSPCA_TOUPTEK=m +CONFIG_USB_GSPCA_TV8532=m +CONFIG_USB_GSPCA_VC032X=m +CONFIG_USB_GSPCA_VICAM=m +CONFIG_USB_GSPCA_XIRLINK_CIT=m +CONFIG_USB_GSPCA_ZC3XX=m +CONFIG_USB_PWC=m +# CONFIG_USB_PWC_DEBUG is not set +CONFIG_USB_PWC_INPUT_EVDEV=y +CONFIG_VIDEO_CPIA2=m +CONFIG_USB_ZR364XX=m +CONFIG_USB_STKWEBCAM=m +CONFIG_USB_S2255=m +CONFIG_VIDEO_USBTV=m + +# +# Analog TV USB devices +# +CONFIG_VIDEO_PVRUSB2=m +CONFIG_VIDEO_PVRUSB2_SYSFS=y +CONFIG_VIDEO_PVRUSB2_DVB=y +# CONFIG_VIDEO_PVRUSB2_DEBUGIFC is not set +CONFIG_VIDEO_HDPVR=m +CONFIG_VIDEO_USBVISION=m +CONFIG_VIDEO_STK1160_COMMON=m +CONFIG_VIDEO_STK1160=m +CONFIG_VIDEO_GO7007=m +CONFIG_VIDEO_GO7007_USB=m +CONFIG_VIDEO_GO7007_LOADER=m +CONFIG_VIDEO_GO7007_USB_S2250_BOARD=m + +# +# Analog/digital TV USB devices +# +CONFIG_VIDEO_AU0828=m +CONFIG_VIDEO_AU0828_V4L2=y +CONFIG_VIDEO_AU0828_RC=y +CONFIG_VIDEO_CX231XX=m +CONFIG_VIDEO_CX231XX_RC=y +CONFIG_VIDEO_CX231XX_ALSA=m +CONFIG_VIDEO_CX231XX_DVB=m +CONFIG_VIDEO_TM6000=m +CONFIG_VIDEO_TM6000_ALSA=m +CONFIG_VIDEO_TM6000_DVB=m + +# +# Digital TV USB devices +# +CONFIG_DVB_USB=m +# CONFIG_DVB_USB_DEBUG is not set +CONFIG_DVB_USB_DIB3000MC=m +CONFIG_DVB_USB_A800=m +CONFIG_DVB_USB_DIBUSB_MB=m +CONFIG_DVB_USB_DIBUSB_MB_FAULTY=y +CONFIG_DVB_USB_DIBUSB_MC=m +CONFIG_DVB_USB_DIB0700=m +CONFIG_DVB_USB_UMT_010=m +CONFIG_DVB_USB_CXUSB=m +CONFIG_DVB_USB_M920X=m +CONFIG_DVB_USB_DIGITV=m +CONFIG_DVB_USB_VP7045=m +CONFIG_DVB_USB_VP702X=m +CONFIG_DVB_USB_GP8PSK=m +CONFIG_DVB_USB_NOVA_T_USB2=m +CONFIG_DVB_USB_TTUSB2=m +CONFIG_DVB_USB_DTT200U=m +CONFIG_DVB_USB_OPERA1=m +CONFIG_DVB_USB_AF9005=m +CONFIG_DVB_USB_AF9005_REMOTE=m +CONFIG_DVB_USB_PCTV452E=m +CONFIG_DVB_USB_DW2102=m +CONFIG_DVB_USB_CINERGY_T2=m +CONFIG_DVB_USB_DTV5100=m +CONFIG_DVB_USB_AZ6027=m +CONFIG_DVB_USB_TECHNISAT_USB2=m +CONFIG_DVB_USB_V2=m +CONFIG_DVB_USB_AF9015=m +CONFIG_DVB_USB_AF9035=m +CONFIG_DVB_USB_ANYSEE=m +CONFIG_DVB_USB_AU6610=m +CONFIG_DVB_USB_AZ6007=m +CONFIG_DVB_USB_CE6230=m +CONFIG_DVB_USB_EC168=m +CONFIG_DVB_USB_GL861=m +CONFIG_DVB_USB_LME2510=m +CONFIG_DVB_USB_MXL111SF=m +CONFIG_DVB_USB_RTL28XXU=m +CONFIG_DVB_USB_DVBSKY=m +CONFIG_DVB_USB_ZD1301=m +CONFIG_DVB_TTUSB_BUDGET=m +CONFIG_DVB_TTUSB_DEC=m +CONFIG_SMS_USB_DRV=m +CONFIG_DVB_B2C2_FLEXCOP_USB=m +# CONFIG_DVB_B2C2_FLEXCOP_USB_DEBUG is not set +CONFIG_DVB_AS102=m + +# +# Webcam, TV (analog/digital) USB devices +# +CONFIG_VIDEO_EM28XX=m +CONFIG_VIDEO_EM28XX_V4L2=m +CONFIG_VIDEO_EM28XX_ALSA=m +CONFIG_VIDEO_EM28XX_DVB=m +CONFIG_VIDEO_EM28XX_RC=m + +# +# Software defined radio USB devices +# +CONFIG_USB_AIRSPY=m +CONFIG_USB_HACKRF=m +CONFIG_USB_MSI2500=m + +# +# USB HDMI CEC adapters +# +CONFIG_USB_PULSE8_CEC=m +CONFIG_USB_RAINSHADOW_CEC=m +CONFIG_MEDIA_PCI_SUPPORT=y + +# +# Media capture support +# +CONFIG_VIDEO_MEYE=m +CONFIG_VIDEO_SOLO6X10=m +CONFIG_VIDEO_TW5864=m +CONFIG_VIDEO_TW68=m +CONFIG_VIDEO_TW686X=m + +# +# Media capture/analog TV support +# +CONFIG_VIDEO_IVTV=m +# CONFIG_VIDEO_IVTV_DEPRECATED_IOCTLS is not set +CONFIG_VIDEO_IVTV_ALSA=m +CONFIG_VIDEO_FB_IVTV=m +CONFIG_VIDEO_HEXIUM_GEMINI=m +CONFIG_VIDEO_HEXIUM_ORION=m +CONFIG_VIDEO_MXB=m +CONFIG_VIDEO_DT3155=m + +# +# Media capture/analog/hybrid TV support +# +CONFIG_VIDEO_CX18=m +CONFIG_VIDEO_CX18_ALSA=m +CONFIG_VIDEO_CX23885=m +CONFIG_MEDIA_ALTERA_CI=m +CONFIG_VIDEO_CX25821=m +CONFIG_VIDEO_CX25821_ALSA=m +CONFIG_VIDEO_CX88=m +CONFIG_VIDEO_CX88_ALSA=m +CONFIG_VIDEO_CX88_BLACKBIRD=m +CONFIG_VIDEO_CX88_DVB=m +CONFIG_VIDEO_CX88_ENABLE_VP3054=y +CONFIG_VIDEO_CX88_VP3054=m +CONFIG_VIDEO_CX88_MPEG=m +CONFIG_VIDEO_BT848=m +CONFIG_DVB_BT8XX=m +CONFIG_VIDEO_SAA7134=m +CONFIG_VIDEO_SAA7134_ALSA=m +CONFIG_VIDEO_SAA7134_RC=y +CONFIG_VIDEO_SAA7134_DVB=m +CONFIG_VIDEO_SAA7134_GO7007=m +CONFIG_VIDEO_SAA7164=m +# CONFIG_VIDEO_COBALT is not set + +# +# Media digital TV PCI Adapters +# +CONFIG_DVB_AV7110_IR=y +CONFIG_DVB_AV7110=m +CONFIG_DVB_AV7110_OSD=y +CONFIG_DVB_BUDGET_CORE=m +CONFIG_DVB_BUDGET=m +CONFIG_DVB_BUDGET_CI=m +CONFIG_DVB_BUDGET_AV=m +CONFIG_DVB_BUDGET_PATCH=m +CONFIG_DVB_B2C2_FLEXCOP_PCI=m +# CONFIG_DVB_B2C2_FLEXCOP_PCI_DEBUG is not set +CONFIG_DVB_PLUTO2=m +CONFIG_DVB_DM1105=m +CONFIG_DVB_PT1=m +CONFIG_DVB_PT3=m +CONFIG_MANTIS_CORE=m +CONFIG_DVB_MANTIS=m +CONFIG_DVB_HOPPER=m +CONFIG_DVB_NGENE=m +CONFIG_DVB_DDBRIDGE=m +# CONFIG_DVB_DDBRIDGE_MSIENABLE is not set +CONFIG_DVB_SMIPCIE=m +CONFIG_DVB_NETUP_UNIDVB=m +CONFIG_VIDEO_IPU3_CIO2=m +CONFIG_V4L_PLATFORM_DRIVERS=y +CONFIG_VIDEO_CAFE_CCIC=m +CONFIG_VIDEO_CADENCE=y +CONFIG_VIDEO_CADENCE_CSI2RX=m +CONFIG_VIDEO_CADENCE_CSI2TX=m +CONFIG_SOC_CAMERA=m +CONFIG_SOC_CAMERA_PLATFORM=m +CONFIG_V4L_MEM2MEM_DRIVERS=y +CONFIG_VIDEO_MEM2MEM_DEINTERLACE=m +CONFIG_VIDEO_SH_VEU=m +# CONFIG_V4L_TEST_DRIVERS is not set +CONFIG_DVB_PLATFORM_DRIVERS=y +CONFIG_CEC_PLATFORM_DRIVERS=y +# CONFIG_VIDEO_CROS_EC_CEC is not set +CONFIG_CEC_GPIO=m +CONFIG_SDR_PLATFORM_DRIVERS=y + +# +# Supported MMC/SDIO adapters +# +CONFIG_SMS_SDIO_DRV=m +CONFIG_RADIO_ADAPTERS=y +CONFIG_RADIO_TEA575X=m +CONFIG_RADIO_SI470X=m +CONFIG_USB_SI470X=m +CONFIG_I2C_SI470X=m +CONFIG_RADIO_SI4713=m +CONFIG_USB_SI4713=m +CONFIG_PLATFORM_SI4713=m +CONFIG_I2C_SI4713=m +CONFIG_RADIO_SI476X=m +CONFIG_USB_MR800=m +CONFIG_USB_DSBR=m +CONFIG_RADIO_MAXIRADIO=m +CONFIG_RADIO_SHARK=m +CONFIG_RADIO_SHARK2=m +CONFIG_USB_KEENE=m +CONFIG_USB_RAREMONO=m +CONFIG_USB_MA901=m +CONFIG_RADIO_TEA5764=m +CONFIG_RADIO_SAA7706H=m +CONFIG_RADIO_TEF6862=m +CONFIG_RADIO_WL1273=m + +# +# Texas Instruments WL128x FM driver (ST based) +# +CONFIG_RADIO_WL128X=m + +# +# Supported FireWire (IEEE 1394) Adapters +# +CONFIG_DVB_FIREDTV=m +CONFIG_DVB_FIREDTV_INPUT=y +CONFIG_MEDIA_COMMON_OPTIONS=y + +# +# common driver options +# +CONFIG_VIDEO_CX2341X=m +CONFIG_VIDEO_TVEEPROM=m +CONFIG_CYPRESS_FIRMWARE=m +CONFIG_VIDEOBUF2_CORE=m +CONFIG_VIDEOBUF2_V4L2=m +CONFIG_VIDEOBUF2_MEMOPS=m +CONFIG_VIDEOBUF2_DMA_CONTIG=m +CONFIG_VIDEOBUF2_VMALLOC=m +CONFIG_VIDEOBUF2_DMA_SG=m +CONFIG_VIDEOBUF2_DVB=m +CONFIG_DVB_B2C2_FLEXCOP=m +CONFIG_VIDEO_SAA7146=m +CONFIG_VIDEO_SAA7146_VV=m +CONFIG_SMS_SIANO_MDTV=m +CONFIG_SMS_SIANO_RC=y +# CONFIG_SMS_SIANO_DEBUGFS is not set + +# +# Media ancillary drivers (tuners, sensors, i2c, spi, frontends) +# +CONFIG_MEDIA_SUBDRV_AUTOSELECT=y +CONFIG_MEDIA_ATTACH=y +CONFIG_VIDEO_IR_I2C=m + +# +# Audio decoders, processors and mixers +# +CONFIG_VIDEO_TVAUDIO=m +CONFIG_VIDEO_TDA7432=m +CONFIG_VIDEO_TDA9840=m +CONFIG_VIDEO_TEA6415C=m +CONFIG_VIDEO_TEA6420=m +CONFIG_VIDEO_MSP3400=m +CONFIG_VIDEO_CS3308=m +CONFIG_VIDEO_CS5345=m +CONFIG_VIDEO_CS53L32A=m +CONFIG_VIDEO_UDA1342=m +CONFIG_VIDEO_WM8775=m +CONFIG_VIDEO_WM8739=m +CONFIG_VIDEO_VP27SMPX=m +CONFIG_VIDEO_SONY_BTF_MPX=m + +# +# RDS decoders +# +CONFIG_VIDEO_SAA6588=m + +# +# Video decoders +# +CONFIG_VIDEO_BT819=m +CONFIG_VIDEO_BT856=m +CONFIG_VIDEO_BT866=m +CONFIG_VIDEO_KS0127=m +CONFIG_VIDEO_SAA7110=m +CONFIG_VIDEO_SAA711X=m +CONFIG_VIDEO_TVP5150=m +CONFIG_VIDEO_TW2804=m +CONFIG_VIDEO_TW9903=m +CONFIG_VIDEO_TW9906=m +CONFIG_VIDEO_VPX3220=m + +# +# Video and audio decoders +# +CONFIG_VIDEO_SAA717X=m +CONFIG_VIDEO_CX25840=m + +# +# Video encoders +# +CONFIG_VIDEO_SAA7127=m +CONFIG_VIDEO_SAA7185=m +CONFIG_VIDEO_ADV7170=m +CONFIG_VIDEO_ADV7175=m + +# +# Camera sensor devices +# +CONFIG_VIDEO_OV2640=m +CONFIG_VIDEO_OV7640=m +CONFIG_VIDEO_OV7670=m +CONFIG_VIDEO_MT9M111=m +CONFIG_VIDEO_MT9V011=m + +# +# Flash devices +# + +# +# Video improvement chips +# +CONFIG_VIDEO_UPD64031A=m +CONFIG_VIDEO_UPD64083=m + +# +# Audio/Video compression chips +# +CONFIG_VIDEO_SAA6752HS=m + +# +# SDR tuner chips +# + +# +# Miscellaneous helper chips +# +CONFIG_VIDEO_M52790=m + +# +# Sensors used on soc_camera driver +# + +# +# soc_camera sensor drivers +# +CONFIG_SOC_CAMERA_MT9M001=m +CONFIG_SOC_CAMERA_MT9M111=m +CONFIG_SOC_CAMERA_MT9T112=m +CONFIG_SOC_CAMERA_MT9V022=m +CONFIG_SOC_CAMERA_OV5642=m +CONFIG_SOC_CAMERA_OV772X=m +CONFIG_SOC_CAMERA_OV9640=m +CONFIG_SOC_CAMERA_OV9740=m +CONFIG_SOC_CAMERA_RJ54N1=m +CONFIG_SOC_CAMERA_TW9910=m + +# +# Media SPI Adapters +# +CONFIG_CXD2880_SPI_DRV=m +CONFIG_MEDIA_TUNER=m +CONFIG_MEDIA_TUNER_SIMPLE=m +CONFIG_MEDIA_TUNER_TDA18250=m +CONFIG_MEDIA_TUNER_TDA8290=m +CONFIG_MEDIA_TUNER_TDA827X=m +CONFIG_MEDIA_TUNER_TDA18271=m +CONFIG_MEDIA_TUNER_TDA9887=m +CONFIG_MEDIA_TUNER_TEA5761=m +CONFIG_MEDIA_TUNER_TEA5767=m +CONFIG_MEDIA_TUNER_MSI001=m +CONFIG_MEDIA_TUNER_MT20XX=m +CONFIG_MEDIA_TUNER_MT2060=m +CONFIG_MEDIA_TUNER_MT2063=m +CONFIG_MEDIA_TUNER_MT2266=m +CONFIG_MEDIA_TUNER_MT2131=m +CONFIG_MEDIA_TUNER_QT1010=m +CONFIG_MEDIA_TUNER_XC2028=m +CONFIG_MEDIA_TUNER_XC5000=m +CONFIG_MEDIA_TUNER_XC4000=m +CONFIG_MEDIA_TUNER_MXL5005S=m +CONFIG_MEDIA_TUNER_MXL5007T=m +CONFIG_MEDIA_TUNER_MC44S803=m +CONFIG_MEDIA_TUNER_MAX2165=m +CONFIG_MEDIA_TUNER_TDA18218=m +CONFIG_MEDIA_TUNER_FC0011=m +CONFIG_MEDIA_TUNER_FC0012=m +CONFIG_MEDIA_TUNER_FC0013=m +CONFIG_MEDIA_TUNER_TDA18212=m +CONFIG_MEDIA_TUNER_E4000=m +CONFIG_MEDIA_TUNER_FC2580=m +CONFIG_MEDIA_TUNER_M88RS6000T=m +CONFIG_MEDIA_TUNER_TUA9001=m +CONFIG_MEDIA_TUNER_SI2157=m +CONFIG_MEDIA_TUNER_IT913X=m +CONFIG_MEDIA_TUNER_R820T=m +CONFIG_MEDIA_TUNER_MXL301RF=m +CONFIG_MEDIA_TUNER_QM1D1C0042=m +CONFIG_MEDIA_TUNER_QM1D1B0004=m + +# +# Multistandard (satellite) frontends +# +CONFIG_DVB_STB0899=m +CONFIG_DVB_STB6100=m +CONFIG_DVB_STV090x=m +CONFIG_DVB_STV0910=m +CONFIG_DVB_STV6110x=m +CONFIG_DVB_STV6111=m +CONFIG_DVB_MXL5XX=m +CONFIG_DVB_M88DS3103=m + +# +# Multistandard (cable + terrestrial) frontends +# +CONFIG_DVB_DRXK=m +CONFIG_DVB_TDA18271C2DD=m +CONFIG_DVB_SI2165=m +CONFIG_DVB_MN88472=m +CONFIG_DVB_MN88473=m + +# +# DVB-S (satellite) frontends +# +CONFIG_DVB_CX24110=m +CONFIG_DVB_CX24123=m +CONFIG_DVB_MT312=m +CONFIG_DVB_ZL10036=m +CONFIG_DVB_ZL10039=m +CONFIG_DVB_S5H1420=m +CONFIG_DVB_STV0288=m +CONFIG_DVB_STB6000=m +CONFIG_DVB_STV0299=m +CONFIG_DVB_STV6110=m +CONFIG_DVB_STV0900=m +CONFIG_DVB_TDA8083=m +CONFIG_DVB_TDA10086=m +CONFIG_DVB_TDA8261=m +CONFIG_DVB_VES1X93=m +CONFIG_DVB_TUNER_ITD1000=m +CONFIG_DVB_TUNER_CX24113=m +CONFIG_DVB_TDA826X=m +CONFIG_DVB_TUA6100=m +CONFIG_DVB_CX24116=m +CONFIG_DVB_CX24117=m +CONFIG_DVB_CX24120=m +CONFIG_DVB_SI21XX=m +CONFIG_DVB_TS2020=m +CONFIG_DVB_DS3000=m +CONFIG_DVB_MB86A16=m +CONFIG_DVB_TDA10071=m + +# +# DVB-T (terrestrial) frontends +# +CONFIG_DVB_SP8870=m +CONFIG_DVB_SP887X=m +CONFIG_DVB_CX22700=m +CONFIG_DVB_CX22702=m +CONFIG_DVB_DRXD=m +CONFIG_DVB_L64781=m +CONFIG_DVB_TDA1004X=m +CONFIG_DVB_NXT6000=m +CONFIG_DVB_MT352=m +CONFIG_DVB_ZL10353=m +CONFIG_DVB_DIB3000MB=m +CONFIG_DVB_DIB3000MC=m +CONFIG_DVB_DIB7000M=m +CONFIG_DVB_DIB7000P=m +CONFIG_DVB_TDA10048=m +CONFIG_DVB_AF9013=m +CONFIG_DVB_EC100=m +CONFIG_DVB_STV0367=m +CONFIG_DVB_CXD2820R=m +CONFIG_DVB_CXD2841ER=m +CONFIG_DVB_RTL2830=m +CONFIG_DVB_RTL2832=m +CONFIG_DVB_RTL2832_SDR=m +CONFIG_DVB_SI2168=m +CONFIG_DVB_AS102_FE=m +CONFIG_DVB_ZD1301_DEMOD=m +CONFIG_DVB_GP8PSK_FE=m + +# +# DVB-C (cable) frontends +# +CONFIG_DVB_VES1820=m +CONFIG_DVB_TDA10021=m +CONFIG_DVB_TDA10023=m +CONFIG_DVB_STV0297=m + +# +# ATSC (North American/Korean Terrestrial/Cable DTV) frontends +# +CONFIG_DVB_NXT200X=m +CONFIG_DVB_OR51211=m +CONFIG_DVB_OR51132=m +CONFIG_DVB_BCM3510=m +CONFIG_DVB_LGDT330X=m +CONFIG_DVB_LGDT3305=m +CONFIG_DVB_LGDT3306A=m +CONFIG_DVB_LG2160=m +CONFIG_DVB_S5H1409=m +CONFIG_DVB_AU8522=m +CONFIG_DVB_AU8522_DTV=m +CONFIG_DVB_AU8522_V4L=m +CONFIG_DVB_S5H1411=m + +# +# ISDB-T (terrestrial) frontends +# +CONFIG_DVB_S921=m +CONFIG_DVB_DIB8000=m +CONFIG_DVB_MB86A20S=m + +# +# ISDB-S (satellite) & ISDB-T (terrestrial) frontends +# +CONFIG_DVB_TC90522=m + +# +# Digital terrestrial only tuners/PLL +# +CONFIG_DVB_PLL=m +CONFIG_DVB_TUNER_DIB0070=m +CONFIG_DVB_TUNER_DIB0090=m + +# +# SEC control devices for DVB-S +# +CONFIG_DVB_DRX39XYJ=m +CONFIG_DVB_LNBH25=m +CONFIG_DVB_LNBP21=m +CONFIG_DVB_LNBP22=m +CONFIG_DVB_ISL6405=m +CONFIG_DVB_ISL6421=m +CONFIG_DVB_ISL6423=m +CONFIG_DVB_A8293=m +CONFIG_DVB_LGS8GXX=m +CONFIG_DVB_ATBM8830=m +CONFIG_DVB_TDA665x=m +CONFIG_DVB_IX2505V=m +CONFIG_DVB_M88RS2000=m +CONFIG_DVB_AF9033=m +CONFIG_DVB_HORUS3A=m +CONFIG_DVB_ASCOT2E=m +CONFIG_DVB_HELENE=m + +# +# Common Interface (EN50221) controller drivers +# +CONFIG_DVB_CXD2099=m +CONFIG_DVB_SP2=m + +# +# Tools to develop new frontends +# +CONFIG_DVB_DUMMY_FE=m + +# +# Graphics support +# +CONFIG_AGP=m +CONFIG_AGP_AMD64=m +CONFIG_AGP_INTEL=m +CONFIG_AGP_SIS=m +CONFIG_AGP_VIA=m +CONFIG_INTEL_GTT=m +CONFIG_VGA_ARB=y +CONFIG_VGA_ARB_MAX_GPUS=16 +CONFIG_VGA_SWITCHEROO=y +CONFIG_DRM=m +CONFIG_DRM_MIPI_DSI=y +CONFIG_DRM_DP_AUX_CHARDEV=y +CONFIG_DRM_DEBUG_SELFTEST=m +CONFIG_DRM_KMS_HELPER=m +CONFIG_DRM_KMS_FB_HELPER=y +CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 +# CONFIG_DRM_LOAD_EDID_FIRMWARE is not set +# CONFIG_DRM_DP_CEC is not set +CONFIG_DRM_TTM=m +CONFIG_DRM_GEM_CMA_HELPER=y +CONFIG_DRM_KMS_CMA_HELPER=y +CONFIG_DRM_VM=y +CONFIG_DRM_SCHED=m + +# +# I2C encoder or helper chips +# +CONFIG_DRM_I2C_CH7006=m +CONFIG_DRM_I2C_SIL164=m +CONFIG_DRM_I2C_NXP_TDA998X=m +CONFIG_DRM_I2C_NXP_TDA9950=m +CONFIG_DRM_RADEON=m +# CONFIG_DRM_RADEON_USERPTR is not set +CONFIG_DRM_AMDGPU=m +CONFIG_DRM_AMDGPU_SI=y +CONFIG_DRM_AMDGPU_CIK=y +CONFIG_DRM_AMDGPU_USERPTR=y +# CONFIG_DRM_AMDGPU_GART_DEBUGFS is not set + +# +# ACP (Audio CoProcessor) Configuration +# +CONFIG_DRM_AMD_ACP=y + +# +# Display Engine Configuration +# +CONFIG_DRM_AMD_DC=y +CONFIG_DRM_AMD_DC_DCN1_0=y +# CONFIG_DEBUG_KERNEL_DC is not set + +# +# AMD Library routines +# +CONFIG_CHASH=m +# CONFIG_CHASH_STATS is not set +# CONFIG_CHASH_SELFTEST is not set +CONFIG_DRM_NOUVEAU=m +CONFIG_NOUVEAU_DEBUG=5 +CONFIG_NOUVEAU_DEBUG_DEFAULT=3 +# CONFIG_NOUVEAU_DEBUG_MMU is not set +CONFIG_DRM_NOUVEAU_BACKLIGHT=y +CONFIG_DRM_I915=m +# CONFIG_DRM_I915_ALPHA_SUPPORT is not set +CONFIG_DRM_I915_CAPTURE_ERROR=y +CONFIG_DRM_I915_COMPRESS_ERROR=y +CONFIG_DRM_I915_USERPTR=y +CONFIG_DRM_I915_GVT=y +CONFIG_DRM_I915_GVT_KVMGT=m +CONFIG_DRM_VGEM=m +# CONFIG_DRM_VKMS is not set +CONFIG_DRM_VMWGFX=m +CONFIG_DRM_VMWGFX_FBCON=y +CONFIG_DRM_GMA500=m +CONFIG_DRM_GMA600=y +CONFIG_DRM_GMA3600=y +CONFIG_DRM_UDL=m +CONFIG_DRM_AST=m +CONFIG_DRM_MGAG200=m +CONFIG_DRM_CIRRUS_QEMU=m +CONFIG_DRM_QXL=m +CONFIG_DRM_BOCHS=m +CONFIG_DRM_VIRTIO_GPU=m +CONFIG_DRM_PANEL=y + +# +# Display Panels +# +CONFIG_DRM_PANEL_RASPBERRYPI_TOUCHSCREEN=m +CONFIG_DRM_BRIDGE=y +CONFIG_DRM_PANEL_BRIDGE=y + +# +# Display Interface Bridges +# +CONFIG_DRM_ANALOGIX_ANX78XX=m +CONFIG_HSA_AMD=m +CONFIG_DRM_HISI_HIBMC=m +CONFIG_DRM_TINYDRM=m +CONFIG_TINYDRM_MIPI_DBI=m +CONFIG_TINYDRM_ILI9225=m +# CONFIG_TINYDRM_ILI9341 is not set +CONFIG_TINYDRM_MI0283QT=m +CONFIG_TINYDRM_REPAPER=m +CONFIG_TINYDRM_ST7586=m +CONFIG_TINYDRM_ST7735R=m +# CONFIG_DRM_LEGACY is not set +CONFIG_DRM_PANEL_ORIENTATION_QUIRKS=y +CONFIG_DRM_LIB_RANDOM=y + +# +# Frame buffer Devices +# +CONFIG_FB=y +CONFIG_FIRMWARE_EDID=y +CONFIG_FB_CMDLINE=y +CONFIG_FB_NOTIFY=y +CONFIG_FB_BOOT_VESA_SUPPORT=y +CONFIG_FB_CFB_FILLRECT=y +CONFIG_FB_CFB_COPYAREA=y +CONFIG_FB_CFB_IMAGEBLIT=y +CONFIG_FB_SYS_FILLRECT=m +CONFIG_FB_SYS_COPYAREA=m +CONFIG_FB_SYS_IMAGEBLIT=m +CONFIG_FB_FOREIGN_ENDIAN=y +CONFIG_FB_BOTH_ENDIAN=y +# CONFIG_FB_BIG_ENDIAN is not set +# CONFIG_FB_LITTLE_ENDIAN is not set +CONFIG_FB_SYS_FOPS=m +CONFIG_FB_DEFERRED_IO=y +CONFIG_FB_BACKLIGHT=y +CONFIG_FB_MODE_HELPERS=y +CONFIG_FB_TILEBLITTING=y + +# +# Frame buffer hardware drivers +# +# CONFIG_FB_CIRRUS is not set +# CONFIG_FB_PM2 is not set +# CONFIG_FB_CYBER2000 is not set +# CONFIG_FB_ARC is not set +# CONFIG_FB_ASILIANT is not set +# CONFIG_FB_IMSTT is not set +# CONFIG_FB_VGA16 is not set +# CONFIG_FB_UVESA is not set +CONFIG_FB_VESA=y +CONFIG_FB_EFI=y +# CONFIG_FB_N411 is not set +# CONFIG_FB_HGA is not set +# CONFIG_FB_OPENCORES is not set +# CONFIG_FB_S1D13XXX is not set +# CONFIG_FB_NVIDIA is not set +# CONFIG_FB_RIVA is not set +# CONFIG_FB_I740 is not set +# CONFIG_FB_LE80578 is not set +# CONFIG_FB_MATROX is not set +# CONFIG_FB_RADEON is not set +# CONFIG_FB_ATY128 is not set +# CONFIG_FB_ATY is not set +# CONFIG_FB_S3 is not set +# CONFIG_FB_SAVAGE is not set +# CONFIG_FB_SIS is not set +# CONFIG_FB_VIA is not set +# CONFIG_FB_NEOMAGIC is not set +# CONFIG_FB_KYRO is not set +# CONFIG_FB_3DFX is not set +# CONFIG_FB_VOODOO1 is not set +# CONFIG_FB_VT8623 is not set +# CONFIG_FB_TRIDENT is not set +# CONFIG_FB_ARK is not set +# CONFIG_FB_PM3 is not set +# CONFIG_FB_CARMINE is not set +# CONFIG_FB_SM501 is not set +# CONFIG_FB_SMSCUFX is not set +# CONFIG_FB_UDL is not set +# CONFIG_FB_IBM_GXT4500 is not set +# CONFIG_FB_VIRTUAL is not set +# CONFIG_FB_METRONOME is not set +# CONFIG_FB_MB862XX is not set +# CONFIG_FB_BROADSHEET is not set +# CONFIG_FB_HYPERV is not set +CONFIG_FB_SIMPLE=y +# CONFIG_FB_SM712 is not set +CONFIG_BACKLIGHT_LCD_SUPPORT=y +CONFIG_LCD_CLASS_DEVICE=m +CONFIG_LCD_L4F00242T03=m +CONFIG_LCD_LMS283GF05=m +CONFIG_LCD_LTV350QV=m +CONFIG_LCD_ILI922X=m +CONFIG_LCD_ILI9320=m +CONFIG_LCD_TDO24M=m +CONFIG_LCD_VGG2432A4=m +CONFIG_LCD_PLATFORM=m +CONFIG_LCD_S6E63M0=m +CONFIG_LCD_LD9040=m +CONFIG_LCD_AMS369FG06=m +CONFIG_LCD_LMS501KF03=m +CONFIG_LCD_HX8357=m +CONFIG_LCD_OTM3225A=m +CONFIG_BACKLIGHT_CLASS_DEVICE=y +CONFIG_BACKLIGHT_GENERIC=m +CONFIG_BACKLIGHT_LM3533=m +CONFIG_BACKLIGHT_PWM=m +CONFIG_BACKLIGHT_DA9052=m +CONFIG_BACKLIGHT_APPLE=m +CONFIG_BACKLIGHT_PM8941_WLED=m +CONFIG_BACKLIGHT_SAHARA=m +CONFIG_BACKLIGHT_WM831X=m +CONFIG_BACKLIGHT_ADP8860=m +CONFIG_BACKLIGHT_ADP8870=m +CONFIG_BACKLIGHT_PCF50633=m +CONFIG_BACKLIGHT_LM3630A=m +CONFIG_BACKLIGHT_LM3639=m +CONFIG_BACKLIGHT_LP855X=m +CONFIG_BACKLIGHT_SKY81452=m +CONFIG_BACKLIGHT_GPIO=m +CONFIG_BACKLIGHT_LV5207LP=m +CONFIG_BACKLIGHT_BD6107=m +CONFIG_BACKLIGHT_ARCXCNN=m +CONFIG_BACKLIGHT_RAVE_SP=m +CONFIG_HDMI=y + +# +# Console display driver support +# +CONFIG_VGA_CONSOLE=y +CONFIG_VGACON_SOFT_SCROLLBACK=y +CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=64 +# CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT is not set +CONFIG_DUMMY_CONSOLE=y +CONFIG_DUMMY_CONSOLE_COLUMNS=80 +CONFIG_DUMMY_CONSOLE_ROWS=25 +CONFIG_FRAMEBUFFER_CONSOLE=y +CONFIG_FRAMEBUFFER_CONSOLE_DETECT_PRIMARY=y +CONFIG_FRAMEBUFFER_CONSOLE_ROTATION=y +# CONFIG_FRAMEBUFFER_CONSOLE_DEFERRED_TAKEOVER is not set +CONFIG_LOGO=y +CONFIG_LOGO_LINUX_MONO=y +CONFIG_LOGO_LINUX_VGA16=y +CONFIG_LOGO_LINUX_CLUT224=y +CONFIG_SOUND=m +CONFIG_SOUND_OSS_CORE=y +CONFIG_SOUND_OSS_CORE_PRECLAIM=y +CONFIG_SND=m +CONFIG_SND_TIMER=m +CONFIG_SND_PCM=m +CONFIG_SND_PCM_ELD=y +CONFIG_SND_PCM_IEC958=y +CONFIG_SND_DMAENGINE_PCM=m +CONFIG_SND_HWDEP=m +CONFIG_SND_SEQ_DEVICE=m +CONFIG_SND_RAWMIDI=m +CONFIG_SND_COMPRESS_OFFLOAD=m +CONFIG_SND_JACK=y +CONFIG_SND_JACK_INPUT_DEV=y +CONFIG_SND_OSSEMUL=y +CONFIG_SND_MIXER_OSS=m +CONFIG_SND_PCM_OSS=m +CONFIG_SND_PCM_OSS_PLUGINS=y +CONFIG_SND_PCM_TIMER=y +CONFIG_SND_HRTIMER=m +CONFIG_SND_DYNAMIC_MINORS=y +CONFIG_SND_MAX_CARDS=32 +CONFIG_SND_SUPPORT_OLD_API=y +CONFIG_SND_PROC_FS=y +CONFIG_SND_VERBOSE_PROCFS=y +# CONFIG_SND_VERBOSE_PRINTK is not set +# CONFIG_SND_DEBUG is not set +CONFIG_SND_VMASTER=y +CONFIG_SND_DMA_SGBUF=y +CONFIG_SND_SEQUENCER=m +CONFIG_SND_SEQ_DUMMY=m +CONFIG_SND_SEQUENCER_OSS=m +CONFIG_SND_SEQ_HRTIMER_DEFAULT=y +CONFIG_SND_SEQ_MIDI_EVENT=m +CONFIG_SND_SEQ_MIDI=m +CONFIG_SND_SEQ_MIDI_EMUL=m +CONFIG_SND_SEQ_VIRMIDI=m +CONFIG_SND_MPU401_UART=m +CONFIG_SND_OPL3_LIB=m +CONFIG_SND_OPL3_LIB_SEQ=m +CONFIG_SND_VX_LIB=m +CONFIG_SND_AC97_CODEC=m +CONFIG_SND_DRIVERS=y +# CONFIG_SND_PCSP is not set +CONFIG_SND_DUMMY=m +CONFIG_SND_ALOOP=m +CONFIG_SND_VIRMIDI=m +CONFIG_SND_MTPAV=m +CONFIG_SND_MTS64=m +CONFIG_SND_SERIAL_U16550=m +CONFIG_SND_MPU401=m +CONFIG_SND_PORTMAN2X4=m +CONFIG_SND_AC97_POWER_SAVE=y +CONFIG_SND_AC97_POWER_SAVE_DEFAULT=0 +CONFIG_SND_SB_COMMON=m +CONFIG_SND_PCI=y +CONFIG_SND_AD1889=m +CONFIG_SND_ALS300=m +CONFIG_SND_ALS4000=m +CONFIG_SND_ALI5451=m +CONFIG_SND_ASIHPI=m +CONFIG_SND_ATIIXP=m +CONFIG_SND_ATIIXP_MODEM=m +CONFIG_SND_AU8810=m +CONFIG_SND_AU8820=m +CONFIG_SND_AU8830=m +CONFIG_SND_AW2=m +CONFIG_SND_AZT3328=m +CONFIG_SND_BT87X=m +CONFIG_SND_BT87X_OVERCLOCK=y +CONFIG_SND_CA0106=m +CONFIG_SND_CMIPCI=m +CONFIG_SND_OXYGEN_LIB=m +CONFIG_SND_OXYGEN=m +CONFIG_SND_CS4281=m +CONFIG_SND_CS46XX=m +CONFIG_SND_CS46XX_NEW_DSP=y +CONFIG_SND_CTXFI=m +CONFIG_SND_DARLA20=m +CONFIG_SND_GINA20=m +CONFIG_SND_LAYLA20=m +CONFIG_SND_DARLA24=m +CONFIG_SND_GINA24=m +CONFIG_SND_LAYLA24=m +CONFIG_SND_MONA=m +CONFIG_SND_MIA=m +CONFIG_SND_ECHO3G=m +CONFIG_SND_INDIGO=m +CONFIG_SND_INDIGOIO=m +CONFIG_SND_INDIGODJ=m +CONFIG_SND_INDIGOIOX=m +CONFIG_SND_INDIGODJX=m +CONFIG_SND_EMU10K1=m +CONFIG_SND_EMU10K1_SEQ=m +CONFIG_SND_EMU10K1X=m +CONFIG_SND_ENS1370=m +CONFIG_SND_ENS1371=m +CONFIG_SND_ES1938=m +CONFIG_SND_ES1968=m +CONFIG_SND_ES1968_INPUT=y +CONFIG_SND_ES1968_RADIO=y +CONFIG_SND_FM801=m +CONFIG_SND_FM801_TEA575X_BOOL=y +CONFIG_SND_HDSP=m +CONFIG_SND_HDSPM=m +CONFIG_SND_ICE1712=m +CONFIG_SND_ICE1724=m +CONFIG_SND_INTEL8X0=m +CONFIG_SND_INTEL8X0M=m +CONFIG_SND_KORG1212=m +CONFIG_SND_LOLA=m +CONFIG_SND_LX6464ES=m +CONFIG_SND_MAESTRO3=m +CONFIG_SND_MAESTRO3_INPUT=y +CONFIG_SND_MIXART=m +CONFIG_SND_NM256=m +CONFIG_SND_PCXHR=m +CONFIG_SND_RIPTIDE=m +CONFIG_SND_RME32=m +CONFIG_SND_RME96=m +CONFIG_SND_RME9652=m +CONFIG_SND_SONICVIBES=m +CONFIG_SND_TRIDENT=m +CONFIG_SND_VIA82XX=m +CONFIG_SND_VIA82XX_MODEM=m +CONFIG_SND_VIRTUOSO=m +CONFIG_SND_VX222=m +CONFIG_SND_YMFPCI=m + +# +# HD-Audio +# +CONFIG_SND_HDA=m +CONFIG_SND_HDA_INTEL=m +CONFIG_SND_HDA_HWDEP=y +CONFIG_SND_HDA_RECONFIG=y +CONFIG_SND_HDA_INPUT_BEEP=y +CONFIG_SND_HDA_INPUT_BEEP_MODE=1 +CONFIG_SND_HDA_PATCH_LOADER=y +CONFIG_SND_HDA_CODEC_REALTEK=m +CONFIG_SND_HDA_CODEC_ANALOG=m +CONFIG_SND_HDA_CODEC_SIGMATEL=m +CONFIG_SND_HDA_CODEC_VIA=m +CONFIG_SND_HDA_CODEC_HDMI=m +CONFIG_SND_HDA_CODEC_CIRRUS=m +CONFIG_SND_HDA_CODEC_CONEXANT=m +CONFIG_SND_HDA_CODEC_CA0110=m +CONFIG_SND_HDA_CODEC_CA0132=m +CONFIG_SND_HDA_CODEC_CA0132_DSP=y +CONFIG_SND_HDA_CODEC_CMEDIA=m +CONFIG_SND_HDA_CODEC_SI3054=m +CONFIG_SND_HDA_GENERIC=m +CONFIG_SND_HDA_POWER_SAVE_DEFAULT=0 +CONFIG_SND_HDA_CORE=m +CONFIG_SND_HDA_DSP_LOADER=y +CONFIG_SND_HDA_COMPONENT=y +CONFIG_SND_HDA_I915=y +CONFIG_SND_HDA_EXT_CORE=m +CONFIG_SND_HDA_PREALLOC_SIZE=64 +CONFIG_SND_SPI=y +CONFIG_SND_USB=y +CONFIG_SND_USB_AUDIO=m +CONFIG_SND_USB_UA101=m +CONFIG_SND_USB_USX2Y=m +CONFIG_SND_USB_CAIAQ=m +CONFIG_SND_USB_CAIAQ_INPUT=y +CONFIG_SND_USB_US122L=m +CONFIG_SND_USB_6FIRE=m +CONFIG_SND_USB_HIFACE=m +CONFIG_SND_BCD2000=m +CONFIG_SND_USB_LINE6=m +CONFIG_SND_USB_POD=m +CONFIG_SND_USB_PODHD=m +CONFIG_SND_USB_TONEPORT=m +CONFIG_SND_USB_VARIAX=m +CONFIG_SND_FIREWIRE=y +CONFIG_SND_FIREWIRE_LIB=m +CONFIG_SND_DICE=m +CONFIG_SND_OXFW=m +CONFIG_SND_ISIGHT=m +CONFIG_SND_FIREWORKS=m +CONFIG_SND_BEBOB=m +CONFIG_SND_FIREWIRE_DIGI00X=m +CONFIG_SND_FIREWIRE_TASCAM=m +CONFIG_SND_FIREWIRE_MOTU=m +CONFIG_SND_FIREFACE=m +CONFIG_SND_PCMCIA=y +CONFIG_SND_VXPOCKET=m +CONFIG_SND_PDAUDIOCF=m +CONFIG_SND_SOC=m +CONFIG_SND_SOC_GENERIC_DMAENGINE_PCM=y +CONFIG_SND_SOC_COMPRESS=y +CONFIG_SND_SOC_TOPOLOGY=y +CONFIG_SND_SOC_ACPI=m +CONFIG_SND_SOC_AMD_ACP=m +CONFIG_SND_SOC_AMD_CZ_DA7219MX98357_MACH=m +CONFIG_SND_SOC_AMD_CZ_RT5645_MACH=m +CONFIG_SND_ATMEL_SOC=m +CONFIG_SND_DESIGNWARE_I2S=m +CONFIG_SND_DESIGNWARE_PCM=y + +# +# SoC Audio for Freescale CPUs +# + +# +# Common SoC Audio options for Freescale CPUs: +# +CONFIG_SND_SOC_FSL_ASRC=m +CONFIG_SND_SOC_FSL_SAI=m +CONFIG_SND_SOC_FSL_SSI=m +CONFIG_SND_SOC_FSL_SPDIF=m +CONFIG_SND_SOC_FSL_ESAI=m +CONFIG_SND_SOC_IMX_AUDMUX=m +CONFIG_SND_I2S_HI6210_I2S=m +CONFIG_SND_SOC_IMG=y +CONFIG_SND_SOC_IMG_I2S_IN=m +CONFIG_SND_SOC_IMG_I2S_OUT=m +CONFIG_SND_SOC_IMG_PARALLEL_OUT=m +CONFIG_SND_SOC_IMG_SPDIF_IN=m +CONFIG_SND_SOC_IMG_SPDIF_OUT=m +CONFIG_SND_SOC_IMG_PISTACHIO_INTERNAL_DAC=m +CONFIG_SND_SOC_INTEL_SST_TOPLEVEL=y +CONFIG_SND_SST_IPC=m +CONFIG_SND_SST_IPC_PCI=m +CONFIG_SND_SST_IPC_ACPI=m +CONFIG_SND_SOC_INTEL_SST_ACPI=m +CONFIG_SND_SOC_INTEL_SST=m +CONFIG_SND_SOC_INTEL_SST_FIRMWARE=m +CONFIG_SND_SOC_INTEL_HASWELL=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_PCI=m +CONFIG_SND_SST_ATOM_HIFI2_PLATFORM_ACPI=m +CONFIG_SND_SOC_INTEL_SKYLAKE_SSP_CLK=m +CONFIG_SND_SOC_INTEL_SKYLAKE=m +CONFIG_SND_SOC_ACPI_INTEL_MATCH=m +CONFIG_SND_SOC_INTEL_MACH=y +CONFIG_SND_SOC_INTEL_HASWELL_MACH=m +CONFIG_SND_SOC_INTEL_BDW_RT5677_MACH=m +CONFIG_SND_SOC_INTEL_BROADWELL_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5640_MACH=m +CONFIG_SND_SOC_INTEL_BYTCR_RT5651_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5672_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_RT5645_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_MAX98090_TI_MACH=m +CONFIG_SND_SOC_INTEL_CHT_BSW_NAU8824_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_DA7213_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_ES8316_MACH=m +CONFIG_SND_SOC_INTEL_BYT_CHT_NOCODEC_MACH=m +CONFIG_SND_SOC_INTEL_SKL_RT286_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_SSM4567_MACH=m +CONFIG_SND_SOC_INTEL_SKL_NAU88L25_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_DA7219_MAX98357A_MACH=m +CONFIG_SND_SOC_INTEL_BXT_RT298_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_RT5663_RT5514_MAX98927_MACH=m +CONFIG_SND_SOC_INTEL_KBL_DA7219_MAX98357A_MACH=m +# CONFIG_SND_SOC_INTEL_GLK_RT5682_MAX98357A_MACH is not set + +# +# STMicroelectronics STM32 SOC audio support +# +CONFIG_SND_SOC_XTFPGA_I2S=m +CONFIG_ZX_TDM=m +CONFIG_SND_SOC_I2C_AND_SPI=m + +# +# CODEC drivers +# +# CONFIG_SND_SOC_AC97_CODEC is not set +CONFIG_SND_SOC_ADAU_UTILS=m +CONFIG_SND_SOC_ADAU1701=m +CONFIG_SND_SOC_ADAU17X1=m +CONFIG_SND_SOC_ADAU1761=m +CONFIG_SND_SOC_ADAU1761_I2C=m +CONFIG_SND_SOC_ADAU1761_SPI=m +CONFIG_SND_SOC_ADAU7002=m +CONFIG_SND_SOC_AK4104=m +CONFIG_SND_SOC_AK4458=m +CONFIG_SND_SOC_AK4554=m +CONFIG_SND_SOC_AK4613=m +CONFIG_SND_SOC_AK4642=m +CONFIG_SND_SOC_AK5386=m +CONFIG_SND_SOC_AK5558=m +CONFIG_SND_SOC_ALC5623=m +CONFIG_SND_SOC_BD28623=m +# CONFIG_SND_SOC_BT_SCO is not set +CONFIG_SND_SOC_CS35L32=m +CONFIG_SND_SOC_CS35L33=m +CONFIG_SND_SOC_CS35L34=m +CONFIG_SND_SOC_CS35L35=m +CONFIG_SND_SOC_CS42L42=m +CONFIG_SND_SOC_CS42L51=m +CONFIG_SND_SOC_CS42L51_I2C=m +CONFIG_SND_SOC_CS42L52=m +CONFIG_SND_SOC_CS42L56=m +CONFIG_SND_SOC_CS42L73=m +CONFIG_SND_SOC_CS4265=m +CONFIG_SND_SOC_CS4270=m +CONFIG_SND_SOC_CS4271=m +CONFIG_SND_SOC_CS4271_I2C=m +CONFIG_SND_SOC_CS4271_SPI=m +CONFIG_SND_SOC_CS42XX8=m +CONFIG_SND_SOC_CS42XX8_I2C=m +CONFIG_SND_SOC_CS43130=m +CONFIG_SND_SOC_CS4349=m +CONFIG_SND_SOC_CS53L30=m +CONFIG_SND_SOC_DA7213=m +CONFIG_SND_SOC_DA7219=m +CONFIG_SND_SOC_DMIC=m +CONFIG_SND_SOC_HDMI_CODEC=m +CONFIG_SND_SOC_ES7134=m +# CONFIG_SND_SOC_ES7241 is not set +CONFIG_SND_SOC_ES8316=m +CONFIG_SND_SOC_ES8328=m +CONFIG_SND_SOC_ES8328_I2C=m +CONFIG_SND_SOC_ES8328_SPI=m +CONFIG_SND_SOC_GTM601=m +CONFIG_SND_SOC_HDAC_HDMI=m +CONFIG_SND_SOC_INNO_RK3036=m +CONFIG_SND_SOC_MAX98090=m +CONFIG_SND_SOC_MAX98357A=m +CONFIG_SND_SOC_MAX98504=m +CONFIG_SND_SOC_MAX9867=m +CONFIG_SND_SOC_MAX98927=m +CONFIG_SND_SOC_MAX98373=m +CONFIG_SND_SOC_MAX9860=m +CONFIG_SND_SOC_MSM8916_WCD_ANALOG=m +CONFIG_SND_SOC_MSM8916_WCD_DIGITAL=m +CONFIG_SND_SOC_PCM1681=m +CONFIG_SND_SOC_PCM1789=m +CONFIG_SND_SOC_PCM1789_I2C=m +CONFIG_SND_SOC_PCM179X=m +CONFIG_SND_SOC_PCM179X_I2C=m +CONFIG_SND_SOC_PCM179X_SPI=m +CONFIG_SND_SOC_PCM186X=m +CONFIG_SND_SOC_PCM186X_I2C=m +CONFIG_SND_SOC_PCM186X_SPI=m +CONFIG_SND_SOC_PCM3168A=m +CONFIG_SND_SOC_PCM3168A_I2C=m +CONFIG_SND_SOC_PCM3168A_SPI=m +CONFIG_SND_SOC_PCM512x=m +CONFIG_SND_SOC_PCM512x_I2C=m +CONFIG_SND_SOC_PCM512x_SPI=m +CONFIG_SND_SOC_RL6231=m +CONFIG_SND_SOC_RL6347A=m +CONFIG_SND_SOC_RT286=m +CONFIG_SND_SOC_RT298=m +CONFIG_SND_SOC_RT5514=m +CONFIG_SND_SOC_RT5514_SPI=m +CONFIG_SND_SOC_RT5616=m +CONFIG_SND_SOC_RT5631=m +CONFIG_SND_SOC_RT5640=m +CONFIG_SND_SOC_RT5645=m +CONFIG_SND_SOC_RT5651=m +CONFIG_SND_SOC_RT5663=m +CONFIG_SND_SOC_RT5670=m +CONFIG_SND_SOC_RT5677=m +CONFIG_SND_SOC_RT5677_SPI=m +CONFIG_SND_SOC_SGTL5000=m +CONFIG_SND_SOC_SI476X=m +CONFIG_SND_SOC_SIGMADSP=m +CONFIG_SND_SOC_SIGMADSP_I2C=m +CONFIG_SND_SOC_SIGMADSP_REGMAP=m +# CONFIG_SND_SOC_SIMPLE_AMPLIFIER is not set +CONFIG_SND_SOC_SIRF_AUDIO_CODEC=m +CONFIG_SND_SOC_SPDIF=m +CONFIG_SND_SOC_SSM2305=m +CONFIG_SND_SOC_SSM2602=m +CONFIG_SND_SOC_SSM2602_SPI=m +CONFIG_SND_SOC_SSM2602_I2C=m +CONFIG_SND_SOC_SSM4567=m +CONFIG_SND_SOC_STA32X=m +CONFIG_SND_SOC_STA350=m +CONFIG_SND_SOC_STI_SAS=m +CONFIG_SND_SOC_TAS2552=m +CONFIG_SND_SOC_TAS5086=m +CONFIG_SND_SOC_TAS571X=m +CONFIG_SND_SOC_TAS5720=m +CONFIG_SND_SOC_TAS6424=m +CONFIG_SND_SOC_TDA7419=m +CONFIG_SND_SOC_TFA9879=m +CONFIG_SND_SOC_TLV320AIC23=m +CONFIG_SND_SOC_TLV320AIC23_I2C=m +CONFIG_SND_SOC_TLV320AIC23_SPI=m +CONFIG_SND_SOC_TLV320AIC31XX=m +CONFIG_SND_SOC_TLV320AIC32X4=m +CONFIG_SND_SOC_TLV320AIC32X4_I2C=m +CONFIG_SND_SOC_TLV320AIC32X4_SPI=m +CONFIG_SND_SOC_TLV320AIC3X=m +CONFIG_SND_SOC_TS3A227E=m +CONFIG_SND_SOC_TSCS42XX=m +CONFIG_SND_SOC_TSCS454=m +CONFIG_SND_SOC_WM8510=m +CONFIG_SND_SOC_WM8523=m +CONFIG_SND_SOC_WM8524=m +CONFIG_SND_SOC_WM8580=m +CONFIG_SND_SOC_WM8711=m +CONFIG_SND_SOC_WM8728=m +CONFIG_SND_SOC_WM8731=m +CONFIG_SND_SOC_WM8737=m +CONFIG_SND_SOC_WM8741=m +CONFIG_SND_SOC_WM8750=m +CONFIG_SND_SOC_WM8753=m +CONFIG_SND_SOC_WM8770=m +CONFIG_SND_SOC_WM8776=m +CONFIG_SND_SOC_WM8782=m +CONFIG_SND_SOC_WM8804=m +CONFIG_SND_SOC_WM8804_I2C=m +CONFIG_SND_SOC_WM8804_SPI=m +CONFIG_SND_SOC_WM8903=m +CONFIG_SND_SOC_WM8960=m +CONFIG_SND_SOC_WM8962=m +CONFIG_SND_SOC_WM8974=m +CONFIG_SND_SOC_WM8978=m +CONFIG_SND_SOC_WM8985=m +CONFIG_SND_SOC_ZX_AUD96P22=m +CONFIG_SND_SOC_MAX9759=m +CONFIG_SND_SOC_MT6351=m +CONFIG_SND_SOC_NAU8540=m +CONFIG_SND_SOC_NAU8810=m +CONFIG_SND_SOC_NAU8824=m +CONFIG_SND_SOC_NAU8825=m +CONFIG_SND_SOC_TPA6130A2=m +CONFIG_SND_SIMPLE_CARD_UTILS=m +CONFIG_SND_SIMPLE_CARD=m +CONFIG_SND_X86=y +CONFIG_HDMI_LPE_AUDIO=m +CONFIG_SND_SYNTH_EMUX=m +CONFIG_AC97_BUS=m + +# +# HID support +# +CONFIG_HID=m +CONFIG_HID_BATTERY_STRENGTH=y +CONFIG_HIDRAW=y +CONFIG_UHID=m +CONFIG_HID_GENERIC=m + +# +# Special HID drivers +# +CONFIG_HID_A4TECH=m +CONFIG_HID_ACCUTOUCH=m +CONFIG_HID_ACRUX=m +CONFIG_HID_ACRUX_FF=y +CONFIG_HID_APPLE=m +CONFIG_HID_APPLEIR=m +CONFIG_HID_ASUS=m +CONFIG_HID_AUREAL=m +CONFIG_HID_BELKIN=m +CONFIG_HID_BETOP_FF=m +CONFIG_HID_CHERRY=m +CONFIG_HID_CHICONY=m +CONFIG_HID_CORSAIR=m +# CONFIG_HID_COUGAR is not set +CONFIG_HID_PRODIKEYS=m +CONFIG_HID_CMEDIA=m +CONFIG_HID_CP2112=m +CONFIG_HID_CYPRESS=m +CONFIG_HID_DRAGONRISE=m +CONFIG_DRAGONRISE_FF=y +CONFIG_HID_EMS_FF=m +CONFIG_HID_ELAN=m +CONFIG_HID_ELECOM=m +CONFIG_HID_ELO=m +CONFIG_HID_EZKEY=m +CONFIG_HID_GEMBIRD=m +CONFIG_HID_GFRM=m +CONFIG_HID_HOLTEK=m +CONFIG_HOLTEK_FF=y +CONFIG_HID_GOOGLE_HAMMER=m +CONFIG_HID_GT683R=m +CONFIG_HID_KEYTOUCH=m +CONFIG_HID_KYE=m +CONFIG_HID_UCLOGIC=m +CONFIG_HID_WALTOP=m +CONFIG_HID_GYRATION=m +CONFIG_HID_ICADE=m +CONFIG_HID_ITE=m +CONFIG_HID_JABRA=m +CONFIG_HID_TWINHAN=m +CONFIG_HID_KENSINGTON=m +CONFIG_HID_LCPOWER=m +CONFIG_HID_LED=m +CONFIG_HID_LENOVO=m +CONFIG_HID_LOGITECH=m +CONFIG_HID_LOGITECH_DJ=m +CONFIG_HID_LOGITECH_HIDPP=m +CONFIG_LOGITECH_FF=y +CONFIG_LOGIRUMBLEPAD2_FF=y +CONFIG_LOGIG940_FF=y +CONFIG_LOGIWHEELS_FF=y +CONFIG_HID_MAGICMOUSE=m +CONFIG_HID_MAYFLASH=m +CONFIG_HID_REDRAGON=m +CONFIG_HID_MICROSOFT=m +CONFIG_HID_MONTEREY=m +CONFIG_HID_MULTITOUCH=m +CONFIG_HID_NTI=m +CONFIG_HID_NTRIG=m +CONFIG_HID_ORTEK=m +CONFIG_HID_PANTHERLORD=m +CONFIG_PANTHERLORD_FF=y +CONFIG_HID_PENMOUNT=m +CONFIG_HID_PETALYNX=m +CONFIG_HID_PICOLCD=m +CONFIG_HID_PICOLCD_FB=y +CONFIG_HID_PICOLCD_BACKLIGHT=y +CONFIG_HID_PICOLCD_LCD=y +CONFIG_HID_PICOLCD_LEDS=y +CONFIG_HID_PICOLCD_CIR=y +CONFIG_HID_PLANTRONICS=m +CONFIG_HID_PRIMAX=m +CONFIG_HID_RETRODE=m +CONFIG_HID_ROCCAT=m +CONFIG_HID_SAITEK=m +CONFIG_HID_SAMSUNG=m +CONFIG_HID_SONY=m +CONFIG_SONY_FF=y +CONFIG_HID_SPEEDLINK=m +CONFIG_HID_STEAM=m +CONFIG_HID_STEELSERIES=m +CONFIG_HID_SUNPLUS=m +CONFIG_HID_RMI=m +CONFIG_HID_GREENASIA=m +CONFIG_GREENASIA_FF=y +CONFIG_HID_HYPERV_MOUSE=m +CONFIG_HID_SMARTJOYPLUS=m +CONFIG_SMARTJOYPLUS_FF=y +CONFIG_HID_TIVO=m +CONFIG_HID_TOPSEED=m +CONFIG_HID_THINGM=m +CONFIG_HID_THRUSTMASTER=m +CONFIG_THRUSTMASTER_FF=y +CONFIG_HID_UDRAW_PS3=m +CONFIG_HID_WACOM=m +CONFIG_HID_WIIMOTE=m +CONFIG_HID_XINMO=m +CONFIG_HID_ZEROPLUS=m +CONFIG_ZEROPLUS_FF=y +CONFIG_HID_ZYDACRON=m +CONFIG_HID_SENSOR_HUB=m +CONFIG_HID_SENSOR_CUSTOM_SENSOR=m +CONFIG_HID_ALPS=m + +# +# USB HID support +# +CONFIG_USB_HID=m +CONFIG_HID_PID=y +CONFIG_USB_HIDDEV=y + +# +# I2C HID support +# +CONFIG_I2C_HID=m + +# +# Intel ISH HID support +# +CONFIG_INTEL_ISH_HID=m +CONFIG_USB_OHCI_LITTLE_ENDIAN=y +CONFIG_USB_SUPPORT=y +CONFIG_USB_COMMON=y +CONFIG_USB_ARCH_HAS_HCD=y +CONFIG_USB=m +CONFIG_USB_PCI=y +CONFIG_USB_ANNOUNCE_NEW_DEVICES=y + +# +# Miscellaneous USB options +# +CONFIG_USB_DEFAULT_PERSIST=y +CONFIG_USB_DYNAMIC_MINORS=y +CONFIG_USB_OTG=y +# CONFIG_USB_OTG_WHITELIST is not set +# CONFIG_USB_OTG_BLACKLIST_HUB is not set +CONFIG_USB_OTG_FSM=m +CONFIG_USB_LEDS_TRIGGER_USBPORT=m +CONFIG_USB_MON=m +CONFIG_USB_WUSB=m +CONFIG_USB_WUSB_CBAF=m +# CONFIG_USB_WUSB_CBAF_DEBUG is not set + +# +# USB Host Controller Drivers +# +CONFIG_USB_C67X00_HCD=m +CONFIG_USB_XHCI_HCD=m +CONFIG_USB_XHCI_DBGCAP=y +CONFIG_USB_XHCI_PCI=m +CONFIG_USB_XHCI_PLATFORM=m +CONFIG_USB_EHCI_HCD=m +CONFIG_USB_EHCI_ROOT_HUB_TT=y +CONFIG_USB_EHCI_TT_NEWSCHED=y +CONFIG_USB_EHCI_PCI=m +CONFIG_USB_EHCI_HCD_PLATFORM=m +CONFIG_USB_OXU210HP_HCD=m +CONFIG_USB_ISP116X_HCD=m +CONFIG_USB_FOTG210_HCD=m +CONFIG_USB_MAX3421_HCD=m +CONFIG_USB_OHCI_HCD=m +CONFIG_USB_OHCI_HCD_PCI=m +CONFIG_USB_OHCI_HCD_SSB=y +CONFIG_USB_OHCI_HCD_PLATFORM=m +CONFIG_USB_UHCI_HCD=m +CONFIG_USB_U132_HCD=m +CONFIG_USB_SL811_HCD=m +# CONFIG_USB_SL811_HCD_ISO is not set +CONFIG_USB_SL811_CS=m +CONFIG_USB_R8A66597_HCD=m +CONFIG_USB_WHCI_HCD=m +CONFIG_USB_HWA_HCD=m +CONFIG_USB_HCD_BCMA=m +CONFIG_USB_HCD_SSB=m +# CONFIG_USB_HCD_TEST_MODE is not set + +# +# USB Device Class drivers +# +CONFIG_USB_ACM=m +CONFIG_USB_PRINTER=m +CONFIG_USB_WDM=m +CONFIG_USB_TMC=m + +# +# NOTE: USB_STORAGE depends on SCSI but BLK_DEV_SD may +# + +# +# also be needed; see USB_STORAGE Help for more info +# +CONFIG_USB_STORAGE=m +# CONFIG_USB_STORAGE_DEBUG is not set +CONFIG_USB_STORAGE_REALTEK=m +CONFIG_REALTEK_AUTOPM=y +CONFIG_USB_STORAGE_DATAFAB=m +CONFIG_USB_STORAGE_FREECOM=m +CONFIG_USB_STORAGE_ISD200=m +CONFIG_USB_STORAGE_USBAT=m +CONFIG_USB_STORAGE_SDDR09=m +CONFIG_USB_STORAGE_SDDR55=m +CONFIG_USB_STORAGE_JUMPSHOT=m +CONFIG_USB_STORAGE_ALAUDA=m +CONFIG_USB_STORAGE_ONETOUCH=m +CONFIG_USB_STORAGE_KARMA=m +CONFIG_USB_STORAGE_CYPRESS_ATACB=m +CONFIG_USB_STORAGE_ENE_UB6250=m +CONFIG_USB_UAS=m + +# +# USB Imaging devices +# +CONFIG_USB_MDC800=m +CONFIG_USB_MICROTEK=m +CONFIG_USBIP_CORE=m +CONFIG_USBIP_VHCI_HCD=m +CONFIG_USBIP_VHCI_HC_PORTS=8 +CONFIG_USBIP_VHCI_NR_HCS=1 +CONFIG_USBIP_HOST=m +CONFIG_USBIP_VUDC=m +# CONFIG_USBIP_DEBUG is not set +CONFIG_USB_MUSB_HDRC=m +# CONFIG_USB_MUSB_HOST is not set +# CONFIG_USB_MUSB_GADGET is not set +CONFIG_USB_MUSB_DUAL_ROLE=y + +# +# Platform Glue Layer +# + +# +# MUSB DMA mode +# +CONFIG_MUSB_PIO_ONLY=y +CONFIG_USB_DWC3=m +# CONFIG_USB_DWC3_ULPI is not set +# CONFIG_USB_DWC3_HOST is not set +# CONFIG_USB_DWC3_GADGET is not set +CONFIG_USB_DWC3_DUAL_ROLE=y + +# +# Platform Glue Driver Support +# +CONFIG_USB_DWC3_PCI=m +CONFIG_USB_DWC3_HAPS=m +CONFIG_USB_DWC2=m +# CONFIG_USB_DWC2_HOST is not set + +# +# Gadget/Dual-role mode requires USB Gadget support to be enabled +# +# CONFIG_USB_DWC2_PERIPHERAL is not set +CONFIG_USB_DWC2_DUAL_ROLE=y +CONFIG_USB_DWC2_PCI=m +# CONFIG_USB_DWC2_DEBUG is not set +# CONFIG_USB_DWC2_TRACK_MISSED_SOFS is not set +CONFIG_USB_CHIPIDEA=m +CONFIG_USB_CHIPIDEA_PCI=m +CONFIG_USB_CHIPIDEA_UDC=y +CONFIG_USB_CHIPIDEA_HOST=y +CONFIG_USB_ISP1760=m +CONFIG_USB_ISP1760_HCD=y +CONFIG_USB_ISP1761_UDC=y +# CONFIG_USB_ISP1760_HOST_ROLE is not set +# CONFIG_USB_ISP1760_GADGET_ROLE is not set +CONFIG_USB_ISP1760_DUAL_ROLE=y + +# +# USB port drivers +# +CONFIG_USB_USS720=m +CONFIG_USB_SERIAL=m +CONFIG_USB_SERIAL_GENERIC=y +CONFIG_USB_SERIAL_SIMPLE=m +CONFIG_USB_SERIAL_AIRCABLE=m +CONFIG_USB_SERIAL_ARK3116=m +CONFIG_USB_SERIAL_BELKIN=m +CONFIG_USB_SERIAL_CH341=m +CONFIG_USB_SERIAL_WHITEHEAT=m +CONFIG_USB_SERIAL_DIGI_ACCELEPORT=m +CONFIG_USB_SERIAL_CP210X=m +CONFIG_USB_SERIAL_CYPRESS_M8=m +CONFIG_USB_SERIAL_EMPEG=m +CONFIG_USB_SERIAL_FTDI_SIO=m +CONFIG_USB_SERIAL_VISOR=m +CONFIG_USB_SERIAL_IPAQ=m +CONFIG_USB_SERIAL_IR=m +CONFIG_USB_SERIAL_EDGEPORT=m +CONFIG_USB_SERIAL_EDGEPORT_TI=m +CONFIG_USB_SERIAL_F81232=m +CONFIG_USB_SERIAL_F8153X=m +CONFIG_USB_SERIAL_GARMIN=m +CONFIG_USB_SERIAL_IPW=m +CONFIG_USB_SERIAL_IUU=m +CONFIG_USB_SERIAL_KEYSPAN_PDA=m +CONFIG_USB_SERIAL_KEYSPAN=m +CONFIG_USB_SERIAL_KLSI=m +CONFIG_USB_SERIAL_KOBIL_SCT=m +CONFIG_USB_SERIAL_MCT_U232=m +CONFIG_USB_SERIAL_METRO=m +CONFIG_USB_SERIAL_MOS7720=m +CONFIG_USB_SERIAL_MOS7715_PARPORT=y +CONFIG_USB_SERIAL_MOS7840=m +CONFIG_USB_SERIAL_MXUPORT=m +CONFIG_USB_SERIAL_NAVMAN=m +CONFIG_USB_SERIAL_PL2303=m +CONFIG_USB_SERIAL_OTI6858=m +CONFIG_USB_SERIAL_QCAUX=m +CONFIG_USB_SERIAL_QUALCOMM=m +CONFIG_USB_SERIAL_SPCP8X5=m +CONFIG_USB_SERIAL_SAFE=m +CONFIG_USB_SERIAL_SAFE_PADDED=y +CONFIG_USB_SERIAL_SIERRAWIRELESS=m +CONFIG_USB_SERIAL_SYMBOL=m +CONFIG_USB_SERIAL_TI=m +CONFIG_USB_SERIAL_CYBERJACK=m +CONFIG_USB_SERIAL_XIRCOM=m +CONFIG_USB_SERIAL_WWAN=m +CONFIG_USB_SERIAL_OPTION=m +CONFIG_USB_SERIAL_OMNINET=m +CONFIG_USB_SERIAL_OPTICON=m +CONFIG_USB_SERIAL_XSENS_MT=m +CONFIG_USB_SERIAL_WISHBONE=m +CONFIG_USB_SERIAL_SSU100=m +CONFIG_USB_SERIAL_QT2=m +# CONFIG_USB_SERIAL_UPD78F0730 is not set +# CONFIG_USB_SERIAL_DEBUG is not set + +# +# USB Miscellaneous drivers +# +CONFIG_USB_EMI62=m +CONFIG_USB_EMI26=m +CONFIG_USB_ADUTUX=m +CONFIG_USB_SEVSEG=m +CONFIG_USB_RIO500=m +CONFIG_USB_LEGOTOWER=m +CONFIG_USB_LCD=m +CONFIG_USB_CYPRESS_CY7C63=m +CONFIG_USB_CYTHERM=m +CONFIG_USB_IDMOUSE=m +CONFIG_USB_FTDI_ELAN=m +CONFIG_USB_APPLEDISPLAY=m +CONFIG_USB_SISUSBVGA=m +CONFIG_USB_SISUSBVGA_CON=y +CONFIG_USB_LD=m +CONFIG_USB_TRANCEVIBRATOR=m +CONFIG_USB_IOWARRIOR=m +CONFIG_USB_TEST=m +CONFIG_USB_EHSET_TEST_FIXTURE=m +CONFIG_USB_ISIGHTFW=m +CONFIG_USB_YUREX=m +CONFIG_USB_EZUSB_FX2=m +CONFIG_USB_HUB_USB251XB=m +CONFIG_USB_HSIC_USB3503=m +CONFIG_USB_HSIC_USB4604=m +CONFIG_USB_LINK_LAYER_TEST=m +CONFIG_USB_CHAOSKEY=m +CONFIG_USB_ATM=m +CONFIG_USB_SPEEDTOUCH=m +CONFIG_USB_CXACRU=m +CONFIG_USB_UEAGLEATM=m +CONFIG_USB_XUSBATM=m + +# +# USB Physical Layer drivers +# +CONFIG_USB_PHY=y +CONFIG_NOP_USB_XCEIV=m +CONFIG_USB_GPIO_VBUS=m +CONFIG_TAHVO_USB=m +CONFIG_TAHVO_USB_HOST_BY_DEFAULT=y +CONFIG_USB_ISP1301=m +CONFIG_USB_GADGET=m +# CONFIG_USB_GADGET_DEBUG is not set +# CONFIG_USB_GADGET_DEBUG_FILES is not set +# CONFIG_USB_GADGET_DEBUG_FS is not set +CONFIG_USB_GADGET_VBUS_DRAW=2 +CONFIG_USB_GADGET_STORAGE_NUM_BUFFERS=2 +CONFIG_U_SERIAL_CONSOLE=y + +# +# USB Peripheral Controller +# +CONFIG_USB_FOTG210_UDC=m +CONFIG_USB_GR_UDC=m +CONFIG_USB_R8A66597=m +CONFIG_USB_PXA27X=m +CONFIG_USB_MV_UDC=m +CONFIG_USB_MV_U3D=m +CONFIG_USB_SNP_CORE=m +CONFIG_USB_M66592=m +CONFIG_USB_BDC_UDC=m + +# +# Platform Support +# +CONFIG_USB_BDC_PCI=m +CONFIG_USB_AMD5536UDC=m +CONFIG_USB_NET2272=m +CONFIG_USB_NET2272_DMA=y +CONFIG_USB_NET2280=m +CONFIG_USB_GOKU=m +CONFIG_USB_EG20T=m +# CONFIG_USB_DUMMY_HCD is not set +CONFIG_USB_LIBCOMPOSITE=m +CONFIG_USB_F_ACM=m +CONFIG_USB_F_SS_LB=m +CONFIG_USB_U_SERIAL=m +CONFIG_USB_U_ETHER=m +CONFIG_USB_U_AUDIO=m +CONFIG_USB_F_SERIAL=m +CONFIG_USB_F_OBEX=m +CONFIG_USB_F_NCM=m +CONFIG_USB_F_ECM=m +CONFIG_USB_F_PHONET=m +CONFIG_USB_F_EEM=m +CONFIG_USB_F_SUBSET=m +CONFIG_USB_F_RNDIS=m +CONFIG_USB_F_MASS_STORAGE=m +CONFIG_USB_F_FS=m +CONFIG_USB_F_UAC1=m +CONFIG_USB_F_UAC2=m +CONFIG_USB_F_UVC=m +CONFIG_USB_F_MIDI=m +CONFIG_USB_F_HID=m +CONFIG_USB_F_PRINTER=m +CONFIG_USB_F_TCM=m +CONFIG_USB_CONFIGFS=m +CONFIG_USB_CONFIGFS_SERIAL=y +CONFIG_USB_CONFIGFS_ACM=y +CONFIG_USB_CONFIGFS_OBEX=y +CONFIG_USB_CONFIGFS_NCM=y +CONFIG_USB_CONFIGFS_ECM=y +CONFIG_USB_CONFIGFS_ECM_SUBSET=y +CONFIG_USB_CONFIGFS_RNDIS=y +CONFIG_USB_CONFIGFS_EEM=y +CONFIG_USB_CONFIGFS_PHONET=y +CONFIG_USB_CONFIGFS_MASS_STORAGE=y +CONFIG_USB_CONFIGFS_F_LB_SS=y +CONFIG_USB_CONFIGFS_F_FS=y +CONFIG_USB_CONFIGFS_F_UAC1=y +# CONFIG_USB_CONFIGFS_F_UAC1_LEGACY is not set +CONFIG_USB_CONFIGFS_F_UAC2=y +CONFIG_USB_CONFIGFS_F_MIDI=y +CONFIG_USB_CONFIGFS_F_HID=y +CONFIG_USB_CONFIGFS_F_UVC=y +CONFIG_USB_CONFIGFS_F_PRINTER=y +CONFIG_USB_CONFIGFS_F_TCM=y +CONFIG_USB_ZERO=m +CONFIG_USB_ZERO_HNPTEST=y +CONFIG_USB_AUDIO=m +CONFIG_GADGET_UAC1=y +# CONFIG_GADGET_UAC1_LEGACY is not set +CONFIG_USB_ETH=m +CONFIG_USB_ETH_RNDIS=y +CONFIG_USB_ETH_EEM=y +CONFIG_USB_G_NCM=m +CONFIG_USB_GADGETFS=m +CONFIG_USB_FUNCTIONFS=m +CONFIG_USB_FUNCTIONFS_ETH=y +CONFIG_USB_FUNCTIONFS_RNDIS=y +CONFIG_USB_FUNCTIONFS_GENERIC=y +CONFIG_USB_MASS_STORAGE=m +CONFIG_USB_GADGET_TARGET=m +CONFIG_USB_G_SERIAL=m +CONFIG_USB_MIDI_GADGET=m +CONFIG_USB_G_PRINTER=m +CONFIG_USB_CDC_COMPOSITE=m +CONFIG_USB_G_NOKIA=m +CONFIG_USB_G_ACM_MS=m +CONFIG_USB_G_MULTI=m +CONFIG_USB_G_MULTI_RNDIS=y +CONFIG_USB_G_MULTI_CDC=y +CONFIG_USB_G_HID=m +# CONFIG_USB_G_DBGP is not set +CONFIG_USB_G_WEBCAM=m +CONFIG_TYPEC=m +CONFIG_TYPEC_TCPM=m +CONFIG_TYPEC_TCPCI=m +CONFIG_TYPEC_RT1711H=m +CONFIG_TYPEC_FUSB302=m +CONFIG_TYPEC_UCSI=m +CONFIG_UCSI_ACPI=m +CONFIG_TYPEC_TPS6598X=m + +# +# USB Type-C Multiplexer/DeMultiplexer Switch support +# +CONFIG_TYPEC_MUX_PI3USB30532=m + +# +# USB Type-C Alternate Mode drivers +# +# CONFIG_TYPEC_DP_ALTMODE is not set +CONFIG_USB_ROLE_SWITCH=m +CONFIG_USB_ROLES_INTEL_XHCI=m +CONFIG_USB_LED_TRIG=y +CONFIG_USB_ULPI_BUS=m +CONFIG_UWB=m +CONFIG_UWB_HWA=m +CONFIG_UWB_WHCI=m +CONFIG_UWB_I1480U=m +CONFIG_MMC=m +CONFIG_MMC_BLOCK=m +CONFIG_MMC_BLOCK_MINORS=8 +CONFIG_SDIO_UART=m +# CONFIG_MMC_TEST is not set + +# +# MMC/SD/SDIO Host Controller Drivers +# +# CONFIG_MMC_DEBUG is not set +CONFIG_MMC_SDHCI=m +CONFIG_MMC_SDHCI_PCI=m +CONFIG_MMC_RICOH_MMC=y +CONFIG_MMC_SDHCI_ACPI=m +CONFIG_MMC_SDHCI_PLTFM=m +CONFIG_MMC_SDHCI_F_SDH30=m +CONFIG_MMC_WBSD=m +CONFIG_MMC_TIFM_SD=m +CONFIG_MMC_SPI=m +CONFIG_MMC_SDRICOH_CS=m +CONFIG_MMC_CB710=m +CONFIG_MMC_VIA_SDMMC=m +CONFIG_MMC_VUB300=m +CONFIG_MMC_USHC=m +CONFIG_MMC_USDHI6ROL0=m +CONFIG_MMC_REALTEK_PCI=m +CONFIG_MMC_REALTEK_USB=m +CONFIG_MMC_CQHCI=m +CONFIG_MMC_TOSHIBA_PCI=m +CONFIG_MMC_MTK=m +CONFIG_MMC_SDHCI_XENON=m +CONFIG_MEMSTICK=m +# CONFIG_MEMSTICK_DEBUG is not set + +# +# MemoryStick drivers +# +# CONFIG_MEMSTICK_UNSAFE_RESUME is not set +CONFIG_MSPRO_BLOCK=m +CONFIG_MS_BLOCK=m + +# +# MemoryStick Host Controller Drivers +# +CONFIG_MEMSTICK_TIFM_MS=m +CONFIG_MEMSTICK_JMICRON_38X=m +CONFIG_MEMSTICK_R592=m +CONFIG_MEMSTICK_REALTEK_PCI=m +CONFIG_MEMSTICK_REALTEK_USB=m +CONFIG_NEW_LEDS=y +CONFIG_LEDS_CLASS=y +CONFIG_LEDS_CLASS_FLASH=m +# CONFIG_LEDS_BRIGHTNESS_HW_CHANGED is not set + +# +# LED drivers +# +CONFIG_LEDS_APU=m +CONFIG_LEDS_AS3645A=m +CONFIG_LEDS_LM3530=m +CONFIG_LEDS_LM3533=m +CONFIG_LEDS_LM3642=m +CONFIG_LEDS_LM3601X=m +CONFIG_LEDS_MT6323=m +CONFIG_LEDS_PCA9532=m +CONFIG_LEDS_PCA9532_GPIO=y +CONFIG_LEDS_GPIO=m +CONFIG_LEDS_LP3944=m +CONFIG_LEDS_LP3952=m +CONFIG_LEDS_LP55XX_COMMON=m +CONFIG_LEDS_LP5521=m +CONFIG_LEDS_LP5523=m +CONFIG_LEDS_LP5562=m +CONFIG_LEDS_LP8501=m +CONFIG_LEDS_CLEVO_MAIL=m +CONFIG_LEDS_PCA955X=m +# CONFIG_LEDS_PCA955X_GPIO is not set +CONFIG_LEDS_PCA963X=m +CONFIG_LEDS_WM831X_STATUS=m +CONFIG_LEDS_DA9052=m +CONFIG_LEDS_DAC124S085=m +CONFIG_LEDS_PWM=m +CONFIG_LEDS_REGULATOR=m +CONFIG_LEDS_BD2802=m +CONFIG_LEDS_INTEL_SS4200=m +CONFIG_LEDS_LT3593=m +CONFIG_LEDS_MC13783=m +CONFIG_LEDS_TCA6507=m +CONFIG_LEDS_TLC591XX=m +CONFIG_LEDS_LM355x=m +CONFIG_LEDS_MENF21BMC=m + +# +# LED driver for blink(1) USB RGB LED is under Special HID drivers (HID_THINGM) +# +CONFIG_LEDS_BLINKM=m +CONFIG_LEDS_MLXCPLD=m +CONFIG_LEDS_MLXREG=m +CONFIG_LEDS_USER=m +CONFIG_LEDS_NIC78BX=m + +# +# LED Triggers +# +CONFIG_LEDS_TRIGGERS=y +CONFIG_LEDS_TRIGGER_TIMER=m +CONFIG_LEDS_TRIGGER_ONESHOT=m +CONFIG_LEDS_TRIGGER_DISK=y +# CONFIG_LEDS_TRIGGER_MTD is not set +CONFIG_LEDS_TRIGGER_HEARTBEAT=m +CONFIG_LEDS_TRIGGER_BACKLIGHT=m +CONFIG_LEDS_TRIGGER_CPU=y +CONFIG_LEDS_TRIGGER_ACTIVITY=m +CONFIG_LEDS_TRIGGER_GPIO=m +CONFIG_LEDS_TRIGGER_DEFAULT_ON=m + +# +# iptables trigger is under Netfilter config (LED target) +# +CONFIG_LEDS_TRIGGER_TRANSIENT=m +CONFIG_LEDS_TRIGGER_CAMERA=m +CONFIG_LEDS_TRIGGER_PANIC=y +CONFIG_LEDS_TRIGGER_NETDEV=m +CONFIG_ACCESSIBILITY=y +CONFIG_A11Y_BRAILLE_CONSOLE=y +CONFIG_INFINIBAND=m +CONFIG_INFINIBAND_USER_MAD=m +CONFIG_INFINIBAND_USER_ACCESS=m +# CONFIG_INFINIBAND_EXP_LEGACY_VERBS_NEW_UAPI is not set +CONFIG_INFINIBAND_USER_MEM=y +CONFIG_INFINIBAND_ON_DEMAND_PAGING=y +CONFIG_INFINIBAND_ADDR_TRANS=y +CONFIG_INFINIBAND_ADDR_TRANS_CONFIGFS=y +CONFIG_INFINIBAND_MTHCA=m +CONFIG_INFINIBAND_MTHCA_DEBUG=y +CONFIG_INFINIBAND_QIB=m +CONFIG_INFINIBAND_QIB_DCA=y +CONFIG_INFINIBAND_CXGB3=m +CONFIG_INFINIBAND_CXGB4=m +CONFIG_INFINIBAND_I40IW=m +CONFIG_MLX4_INFINIBAND=m +CONFIG_MLX5_INFINIBAND=m +CONFIG_INFINIBAND_NES=m +# CONFIG_INFINIBAND_NES_DEBUG is not set +CONFIG_INFINIBAND_OCRDMA=m +CONFIG_INFINIBAND_VMWARE_PVRDMA=m +CONFIG_INFINIBAND_USNIC=m +CONFIG_INFINIBAND_IPOIB=m +CONFIG_INFINIBAND_IPOIB_CM=y +CONFIG_INFINIBAND_IPOIB_DEBUG=y +# CONFIG_INFINIBAND_IPOIB_DEBUG_DATA is not set +CONFIG_INFINIBAND_SRP=m +CONFIG_INFINIBAND_SRPT=m +CONFIG_INFINIBAND_ISER=m +CONFIG_INFINIBAND_ISERT=m +CONFIG_INFINIBAND_OPA_VNIC=m +CONFIG_INFINIBAND_RDMAVT=m +CONFIG_RDMA_RXE=m +CONFIG_INFINIBAND_HFI1=m +# CONFIG_HFI1_DEBUG_SDMA_ORDER is not set +# CONFIG_SDMA_VERBOSITY is not set +CONFIG_INFINIBAND_QEDR=m +CONFIG_INFINIBAND_BNXT_RE=m +CONFIG_EDAC_ATOMIC_SCRUB=y +CONFIG_EDAC_SUPPORT=y +CONFIG_EDAC=y +CONFIG_EDAC_LEGACY_SYSFS=y +# CONFIG_EDAC_DEBUG is not set +CONFIG_EDAC_DECODE_MCE=m +CONFIG_EDAC_GHES=y +CONFIG_EDAC_AMD64=m +CONFIG_EDAC_AMD64_ERROR_INJECTION=y +CONFIG_EDAC_E752X=m +CONFIG_EDAC_I82975X=m +CONFIG_EDAC_I3000=m +CONFIG_EDAC_I3200=m +CONFIG_EDAC_IE31200=m +CONFIG_EDAC_X38=m +CONFIG_EDAC_I5400=m +CONFIG_EDAC_I7CORE=m +CONFIG_EDAC_I5000=m +CONFIG_EDAC_I5100=m +CONFIG_EDAC_I7300=m +CONFIG_EDAC_SBRIDGE=m +CONFIG_EDAC_SKX=m +CONFIG_EDAC_PND2=m +CONFIG_RTC_LIB=y +CONFIG_RTC_MC146818_LIB=y +CONFIG_RTC_CLASS=y +CONFIG_RTC_HCTOSYS=y +CONFIG_RTC_HCTOSYS_DEVICE="rtc0" +CONFIG_RTC_SYSTOHC=y +CONFIG_RTC_SYSTOHC_DEVICE="rtc0" +# CONFIG_RTC_DEBUG is not set +CONFIG_RTC_NVMEM=y + +# +# RTC interfaces +# +CONFIG_RTC_INTF_SYSFS=y +CONFIG_RTC_INTF_PROC=y +CONFIG_RTC_INTF_DEV=y +CONFIG_RTC_INTF_DEV_UIE_EMUL=y +# CONFIG_RTC_DRV_TEST is not set + +# +# I2C RTC drivers +# +CONFIG_RTC_DRV_88PM80X=m +CONFIG_RTC_DRV_ABB5ZES3=m +CONFIG_RTC_DRV_ABX80X=m +CONFIG_RTC_DRV_DS1307=m +# CONFIG_RTC_DRV_DS1307_CENTURY is not set +CONFIG_RTC_DRV_DS1374=m +CONFIG_RTC_DRV_DS1374_WDT=y +CONFIG_RTC_DRV_DS1672=m +CONFIG_RTC_DRV_MAX6900=m +CONFIG_RTC_DRV_MAX8907=m +CONFIG_RTC_DRV_RS5C372=m +CONFIG_RTC_DRV_ISL1208=m +CONFIG_RTC_DRV_ISL12022=m +CONFIG_RTC_DRV_X1205=m +CONFIG_RTC_DRV_PCF8523=m +CONFIG_RTC_DRV_PCF85063=m +CONFIG_RTC_DRV_PCF85363=m +CONFIG_RTC_DRV_PCF8563=m +CONFIG_RTC_DRV_PCF8583=m +CONFIG_RTC_DRV_M41T80=m +CONFIG_RTC_DRV_M41T80_WDT=y +CONFIG_RTC_DRV_BQ32K=m +CONFIG_RTC_DRV_S35390A=m +CONFIG_RTC_DRV_FM3130=m +CONFIG_RTC_DRV_RX8010=m +CONFIG_RTC_DRV_RX8581=m +CONFIG_RTC_DRV_RX8025=m +CONFIG_RTC_DRV_EM3027=m +CONFIG_RTC_DRV_RV8803=m + +# +# SPI RTC drivers +# +CONFIG_RTC_DRV_M41T93=m +CONFIG_RTC_DRV_M41T94=m +CONFIG_RTC_DRV_DS1302=m +CONFIG_RTC_DRV_DS1305=m +CONFIG_RTC_DRV_DS1343=m +CONFIG_RTC_DRV_DS1347=m +CONFIG_RTC_DRV_DS1390=m +CONFIG_RTC_DRV_MAX6916=m +CONFIG_RTC_DRV_R9701=m +CONFIG_RTC_DRV_RX4581=m +CONFIG_RTC_DRV_RX6110=m +CONFIG_RTC_DRV_RS5C348=m +CONFIG_RTC_DRV_MAX6902=m +CONFIG_RTC_DRV_PCF2123=m +CONFIG_RTC_DRV_MCP795=m +CONFIG_RTC_I2C_AND_SPI=m + +# +# SPI and I2C RTC drivers +# +CONFIG_RTC_DRV_DS3232=m +CONFIG_RTC_DRV_DS3232_HWMON=y +CONFIG_RTC_DRV_PCF2127=m +CONFIG_RTC_DRV_RV3029C2=m +CONFIG_RTC_DRV_RV3029_HWMON=y + +# +# Platform RTC drivers +# +CONFIG_RTC_DRV_CMOS=y +CONFIG_RTC_DRV_DS1286=m +CONFIG_RTC_DRV_DS1511=m +CONFIG_RTC_DRV_DS1553=m +CONFIG_RTC_DRV_DS1685_FAMILY=m +CONFIG_RTC_DRV_DS1685=y +# CONFIG_RTC_DRV_DS1689 is not set +# CONFIG_RTC_DRV_DS17285 is not set +# CONFIG_RTC_DRV_DS17485 is not set +# CONFIG_RTC_DRV_DS17885 is not set +CONFIG_RTC_DS1685_PROC_REGS=y +CONFIG_RTC_DRV_DS1742=m +CONFIG_RTC_DRV_DS2404=m +CONFIG_RTC_DRV_DA9052=m +CONFIG_RTC_DRV_DA9063=m +CONFIG_RTC_DRV_STK17TA8=m +CONFIG_RTC_DRV_M48T86=m +CONFIG_RTC_DRV_M48T35=m +CONFIG_RTC_DRV_M48T59=m +CONFIG_RTC_DRV_MSM6242=m +CONFIG_RTC_DRV_BQ4802=m +CONFIG_RTC_DRV_RP5C01=m +CONFIG_RTC_DRV_V3020=m +CONFIG_RTC_DRV_WM831X=m +CONFIG_RTC_DRV_PCF50633=m +# CONFIG_RTC_DRV_CROS_EC is not set + +# +# on-CPU RTC drivers +# +CONFIG_RTC_DRV_FTRTC010=m +CONFIG_RTC_DRV_PCAP=m +CONFIG_RTC_DRV_MC13XXX=m +CONFIG_RTC_DRV_MT6397=m + +# +# HID Sensor RTC drivers +# +CONFIG_RTC_DRV_HID_SENSOR_TIME=m +CONFIG_DMADEVICES=y +# CONFIG_DMADEVICES_DEBUG is not set + +# +# DMA Devices +# +CONFIG_DMA_ENGINE=y +CONFIG_DMA_VIRTUAL_CHANNELS=y +CONFIG_DMA_ACPI=y +CONFIG_ALTERA_MSGDMA=m +CONFIG_INTEL_IDMA64=m +CONFIG_INTEL_IOATDMA=m +CONFIG_INTEL_MIC_X100_DMA=m +CONFIG_QCOM_HIDMA_MGMT=m +CONFIG_QCOM_HIDMA=m +CONFIG_DW_DMAC_CORE=y +CONFIG_DW_DMAC=m +CONFIG_DW_DMAC_PCI=y +CONFIG_HSU_DMA=y + +# +# DMA Clients +# +CONFIG_ASYNC_TX_DMA=y +# CONFIG_DMATEST is not set +CONFIG_DMA_ENGINE_RAID=y + +# +# DMABUF options +# +CONFIG_SYNC_FILE=y +# CONFIG_SW_SYNC is not set +CONFIG_DCA=m +CONFIG_AUXDISPLAY=y +CONFIG_HD44780=m +CONFIG_KS0108=m +CONFIG_KS0108_PORT=0x378 +CONFIG_KS0108_DELAY=2 +CONFIG_CFAG12864B=m +CONFIG_CFAG12864B_RATE=20 +CONFIG_IMG_ASCII_LCD=m +CONFIG_PANEL=m +CONFIG_PANEL_PARPORT=0 +CONFIG_PANEL_PROFILE=5 +# CONFIG_PANEL_CHANGE_MESSAGE is not set +CONFIG_CHARLCD=m +CONFIG_UIO=m +CONFIG_UIO_CIF=m +CONFIG_UIO_PDRV_GENIRQ=m +CONFIG_UIO_DMEM_GENIRQ=m +CONFIG_UIO_AEC=m +CONFIG_UIO_SERCOS3=m +CONFIG_UIO_PCI_GENERIC=m +CONFIG_UIO_NETX=m +CONFIG_UIO_PRUSS=m +CONFIG_UIO_MF624=m +CONFIG_UIO_HV_GENERIC=m +CONFIG_VFIO_IOMMU_TYPE1=m +CONFIG_VFIO_VIRQFD=m +CONFIG_VFIO=m +# CONFIG_VFIO_NOIOMMU is not set +CONFIG_VFIO_PCI=m +CONFIG_VFIO_PCI_VGA=y +CONFIG_VFIO_PCI_MMAP=y +CONFIG_VFIO_PCI_INTX=y +CONFIG_VFIO_PCI_IGD=y +CONFIG_VFIO_MDEV=m +CONFIG_VFIO_MDEV_DEVICE=m +CONFIG_IRQ_BYPASS_MANAGER=m +CONFIG_VIRT_DRIVERS=y +# CONFIG_VBOXGUEST is not set +CONFIG_VIRTIO=m +CONFIG_VIRTIO_MENU=y +CONFIG_VIRTIO_PCI=m +CONFIG_VIRTIO_PCI_LEGACY=y +CONFIG_VIRTIO_BALLOON=m +CONFIG_VIRTIO_INPUT=m +CONFIG_VIRTIO_MMIO=m +CONFIG_VIRTIO_MMIO_CMDLINE_DEVICES=y + +# +# Microsoft Hyper-V guest support +# +CONFIG_HYPERV=m +CONFIG_HYPERV_TSCPAGE=y +CONFIG_HYPERV_UTILS=m +CONFIG_HYPERV_BALLOON=m +CONFIG_STAGING=y +CONFIG_PRISM2_USB=m +CONFIG_COMEDI=m +# CONFIG_COMEDI_DEBUG is not set +CONFIG_COMEDI_DEFAULT_BUF_SIZE_KB=2048 +CONFIG_COMEDI_DEFAULT_BUF_MAXSIZE_KB=20480 +CONFIG_COMEDI_MISC_DRIVERS=y +CONFIG_COMEDI_BOND=m +CONFIG_COMEDI_TEST=m +CONFIG_COMEDI_PARPORT=m +CONFIG_COMEDI_ISA_DRIVERS=y +CONFIG_COMEDI_PCL711=m +CONFIG_COMEDI_PCL724=m +CONFIG_COMEDI_PCL726=m +CONFIG_COMEDI_PCL730=m +CONFIG_COMEDI_PCL812=m +CONFIG_COMEDI_PCL816=m +CONFIG_COMEDI_PCL818=m +CONFIG_COMEDI_PCM3724=m +CONFIG_COMEDI_AMPLC_DIO200_ISA=m +CONFIG_COMEDI_AMPLC_PC236_ISA=m +CONFIG_COMEDI_AMPLC_PC263_ISA=m +CONFIG_COMEDI_RTI800=m +CONFIG_COMEDI_RTI802=m +CONFIG_COMEDI_DAC02=m +CONFIG_COMEDI_DAS16M1=m +CONFIG_COMEDI_DAS08_ISA=m +CONFIG_COMEDI_DAS16=m +CONFIG_COMEDI_DAS800=m +CONFIG_COMEDI_DAS1800=m +CONFIG_COMEDI_DAS6402=m +CONFIG_COMEDI_DT2801=m +CONFIG_COMEDI_DT2811=m +CONFIG_COMEDI_DT2814=m +CONFIG_COMEDI_DT2815=m +CONFIG_COMEDI_DT2817=m +CONFIG_COMEDI_DT282X=m +CONFIG_COMEDI_DMM32AT=m +CONFIG_COMEDI_FL512=m +CONFIG_COMEDI_AIO_AIO12_8=m +CONFIG_COMEDI_AIO_IIRO_16=m +CONFIG_COMEDI_II_PCI20KC=m +CONFIG_COMEDI_C6XDIGIO=m +CONFIG_COMEDI_MPC624=m +CONFIG_COMEDI_ADQ12B=m +CONFIG_COMEDI_NI_AT_A2150=m +CONFIG_COMEDI_NI_AT_AO=m +CONFIG_COMEDI_NI_ATMIO=m +CONFIG_COMEDI_NI_ATMIO16D=m +CONFIG_COMEDI_NI_LABPC_ISA=m +CONFIG_COMEDI_PCMAD=m +CONFIG_COMEDI_PCMDA12=m +CONFIG_COMEDI_PCMMIO=m +CONFIG_COMEDI_PCMUIO=m +CONFIG_COMEDI_MULTIQ3=m +CONFIG_COMEDI_S526=m +CONFIG_COMEDI_PCI_DRIVERS=m +CONFIG_COMEDI_8255_PCI=m +CONFIG_COMEDI_ADDI_WATCHDOG=m +CONFIG_COMEDI_ADDI_APCI_1032=m +CONFIG_COMEDI_ADDI_APCI_1500=m +CONFIG_COMEDI_ADDI_APCI_1516=m +CONFIG_COMEDI_ADDI_APCI_1564=m +CONFIG_COMEDI_ADDI_APCI_16XX=m +CONFIG_COMEDI_ADDI_APCI_2032=m +CONFIG_COMEDI_ADDI_APCI_2200=m +CONFIG_COMEDI_ADDI_APCI_3120=m +CONFIG_COMEDI_ADDI_APCI_3501=m +CONFIG_COMEDI_ADDI_APCI_3XXX=m +CONFIG_COMEDI_ADL_PCI6208=m +CONFIG_COMEDI_ADL_PCI7X3X=m +CONFIG_COMEDI_ADL_PCI8164=m +CONFIG_COMEDI_ADL_PCI9111=m +CONFIG_COMEDI_ADL_PCI9118=m +CONFIG_COMEDI_ADV_PCI1710=m +CONFIG_COMEDI_ADV_PCI1720=m +CONFIG_COMEDI_ADV_PCI1723=m +CONFIG_COMEDI_ADV_PCI1724=m +CONFIG_COMEDI_ADV_PCI1760=m +CONFIG_COMEDI_ADV_PCI_DIO=m +CONFIG_COMEDI_AMPLC_DIO200_PCI=m +CONFIG_COMEDI_AMPLC_PC236_PCI=m +CONFIG_COMEDI_AMPLC_PC263_PCI=m +CONFIG_COMEDI_AMPLC_PCI224=m +CONFIG_COMEDI_AMPLC_PCI230=m +CONFIG_COMEDI_CONTEC_PCI_DIO=m +CONFIG_COMEDI_DAS08_PCI=m +CONFIG_COMEDI_DT3000=m +CONFIG_COMEDI_DYNA_PCI10XX=m +CONFIG_COMEDI_GSC_HPDI=m +CONFIG_COMEDI_MF6X4=m +CONFIG_COMEDI_ICP_MULTI=m +CONFIG_COMEDI_DAQBOARD2000=m +CONFIG_COMEDI_JR3_PCI=m +CONFIG_COMEDI_KE_COUNTER=m +CONFIG_COMEDI_CB_PCIDAS64=m +CONFIG_COMEDI_CB_PCIDAS=m +CONFIG_COMEDI_CB_PCIDDA=m +CONFIG_COMEDI_CB_PCIMDAS=m +CONFIG_COMEDI_CB_PCIMDDA=m +CONFIG_COMEDI_ME4000=m +CONFIG_COMEDI_ME_DAQ=m +CONFIG_COMEDI_NI_6527=m +CONFIG_COMEDI_NI_65XX=m +CONFIG_COMEDI_NI_660X=m +CONFIG_COMEDI_NI_670X=m +CONFIG_COMEDI_NI_LABPC_PCI=m +CONFIG_COMEDI_NI_PCIDIO=m +CONFIG_COMEDI_NI_PCIMIO=m +CONFIG_COMEDI_RTD520=m +CONFIG_COMEDI_S626=m +CONFIG_COMEDI_MITE=m +CONFIG_COMEDI_NI_TIOCMD=m +CONFIG_COMEDI_PCMCIA_DRIVERS=m +CONFIG_COMEDI_CB_DAS16_CS=m +CONFIG_COMEDI_DAS08_CS=m +CONFIG_COMEDI_NI_DAQ_700_CS=m +CONFIG_COMEDI_NI_DAQ_DIO24_CS=m +CONFIG_COMEDI_NI_LABPC_CS=m +CONFIG_COMEDI_NI_MIO_CS=m +CONFIG_COMEDI_QUATECH_DAQP_CS=m +CONFIG_COMEDI_USB_DRIVERS=m +CONFIG_COMEDI_DT9812=m +CONFIG_COMEDI_NI_USB6501=m +CONFIG_COMEDI_USBDUX=m +CONFIG_COMEDI_USBDUXFAST=m +CONFIG_COMEDI_USBDUXSIGMA=m +CONFIG_COMEDI_VMK80XX=m +CONFIG_COMEDI_8254=m +CONFIG_COMEDI_8255=m +CONFIG_COMEDI_8255_SA=m +CONFIG_COMEDI_KCOMEDILIB=m +CONFIG_COMEDI_AMPLC_DIO200=m +CONFIG_COMEDI_AMPLC_PC236=m +CONFIG_COMEDI_DAS08=m +CONFIG_COMEDI_ISADMA=m +CONFIG_COMEDI_NI_LABPC=m +CONFIG_COMEDI_NI_LABPC_ISADMA=m +CONFIG_COMEDI_NI_TIO=m +CONFIG_RTL8192U=m +CONFIG_RTLLIB=m +CONFIG_RTLLIB_CRYPTO_CCMP=m +CONFIG_RTLLIB_CRYPTO_TKIP=m +CONFIG_RTLLIB_CRYPTO_WEP=m +CONFIG_RTL8192E=m +CONFIG_RTL8723BS=m +CONFIG_R8712U=m +CONFIG_R8188EU=m +CONFIG_88EU_AP_MODE=y +CONFIG_R8822BE=m +CONFIG_RTLWIFI_DEBUG_ST=y +CONFIG_RTS5208=m +CONFIG_VT6655=m +CONFIG_VT6656=m + +# +# IIO staging drivers +# + +# +# Accelerometers +# +CONFIG_ADIS16203=m +CONFIG_ADIS16240=m + +# +# Analog to digital converters +# +CONFIG_AD7606=m +CONFIG_AD7606_IFACE_PARALLEL=m +CONFIG_AD7606_IFACE_SPI=m +CONFIG_AD7780=m +CONFIG_AD7816=m +CONFIG_AD7192=m +CONFIG_AD7280=m + +# +# Analog digital bi-direction converters +# +CONFIG_ADT7316=m +CONFIG_ADT7316_SPI=m +CONFIG_ADT7316_I2C=m + +# +# Capacitance to digital converters +# +CONFIG_AD7150=m +CONFIG_AD7152=m +CONFIG_AD7746=m + +# +# Direct Digital Synthesis +# +CONFIG_AD9832=m +CONFIG_AD9834=m + +# +# Network Analyzer, Impedance Converters +# +CONFIG_AD5933=m + +# +# Active energy metering IC +# +CONFIG_ADE7854=m +CONFIG_ADE7854_I2C=m +CONFIG_ADE7854_SPI=m + +# +# Resolver to digital converters +# +CONFIG_AD2S90=m +CONFIG_AD2S1210=m +CONFIG_FB_SM750=m +CONFIG_FB_XGI=m + +# +# Speakup console speech +# +CONFIG_SPEAKUP=m +CONFIG_SPEAKUP_SYNTH_ACNTSA=m +CONFIG_SPEAKUP_SYNTH_APOLLO=m +CONFIG_SPEAKUP_SYNTH_AUDPTR=m +CONFIG_SPEAKUP_SYNTH_BNS=m +CONFIG_SPEAKUP_SYNTH_DECTLK=m +CONFIG_SPEAKUP_SYNTH_DECEXT=m +CONFIG_SPEAKUP_SYNTH_LTLK=m +CONFIG_SPEAKUP_SYNTH_SOFT=m +CONFIG_SPEAKUP_SYNTH_SPKOUT=m +CONFIG_SPEAKUP_SYNTH_TXPRT=m +# CONFIG_SPEAKUP_SYNTH_DUMMY is not set +CONFIG_STAGING_MEDIA=y +CONFIG_I2C_BCM2048=m +CONFIG_SOC_CAMERA_IMX074=m +CONFIG_SOC_CAMERA_MT9T031=m +CONFIG_VIDEO_ZORAN=m +CONFIG_VIDEO_ZORAN_DC30=m +CONFIG_VIDEO_ZORAN_ZR36060=m +CONFIG_VIDEO_ZORAN_BUZ=m +CONFIG_VIDEO_ZORAN_DC10=m +CONFIG_VIDEO_ZORAN_LML33=m +CONFIG_VIDEO_ZORAN_LML33R10=m +CONFIG_VIDEO_ZORAN_AVS6EYES=m + +# +# Android +# +CONFIG_LTE_GDM724X=m +CONFIG_FIREWIRE_SERIAL=m +CONFIG_FWTTY_MAX_TOTAL_PORTS=64 +CONFIG_FWTTY_MAX_CARD_PORTS=32 +CONFIG_MTD_SPINAND_MT29F=m +CONFIG_MTD_SPINAND_ONDIEECC=y +CONFIG_DGNC=m +CONFIG_GS_FPGABOOT=m +CONFIG_UNISYSSPAR=y +CONFIG_FB_TFT=m +CONFIG_FB_TFT_AGM1264K_FL=m +CONFIG_FB_TFT_BD663474=m +CONFIG_FB_TFT_HX8340BN=m +CONFIG_FB_TFT_HX8347D=m +CONFIG_FB_TFT_HX8353D=m +# CONFIG_FB_TFT_HX8357D is not set +# CONFIG_FB_TFT_ILI9163 is not set +CONFIG_FB_TFT_ILI9320=m +CONFIG_FB_TFT_ILI9325=m +CONFIG_FB_TFT_ILI9340=m +CONFIG_FB_TFT_ILI9341=m +CONFIG_FB_TFT_ILI9481=m +CONFIG_FB_TFT_ILI9486=m +CONFIG_FB_TFT_PCD8544=m +CONFIG_FB_TFT_RA8875=m +CONFIG_FB_TFT_S6D02A1=m +CONFIG_FB_TFT_S6D1121=m +CONFIG_FB_TFT_SH1106=m +CONFIG_FB_TFT_SSD1289=m +CONFIG_FB_TFT_SSD1305=m +CONFIG_FB_TFT_SSD1306=m +CONFIG_FB_TFT_SSD1331=m +CONFIG_FB_TFT_SSD1351=m +CONFIG_FB_TFT_ST7735R=m +CONFIG_FB_TFT_ST7789V=m +CONFIG_FB_TFT_TINYLCD=m +CONFIG_FB_TFT_TLS8204=m +CONFIG_FB_TFT_UC1611=m +CONFIG_FB_TFT_UC1701=m +CONFIG_FB_TFT_UPD161704=m +CONFIG_FB_TFT_WATTEROTT=m +CONFIG_FB_FLEX=m +CONFIG_FB_TFT_FBTFT_DEVICE=m +CONFIG_WILC1000=m +CONFIG_WILC1000_SDIO=m +CONFIG_WILC1000_SPI=m +# CONFIG_WILC1000_HW_OOB_INTR is not set +CONFIG_MOST=m +CONFIG_MOST_CDEV=m +CONFIG_MOST_NET=m +CONFIG_MOST_SOUND=m +CONFIG_MOST_VIDEO=m +CONFIG_MOST_I2C=m +CONFIG_MOST_USB=m +CONFIG_KS7010=m +CONFIG_GREYBUS=m +CONFIG_GREYBUS_ES2=m +CONFIG_GREYBUS_AUDIO=m +CONFIG_GREYBUS_BOOTROM=m +CONFIG_GREYBUS_FIRMWARE=m +CONFIG_GREYBUS_HID=m +CONFIG_GREYBUS_LIGHT=m +CONFIG_GREYBUS_LOG=m +CONFIG_GREYBUS_LOOPBACK=m +CONFIG_GREYBUS_POWER=m +CONFIG_GREYBUS_RAW=m +CONFIG_GREYBUS_VIBRATOR=m +CONFIG_GREYBUS_BRIDGED_PHY=m +CONFIG_GREYBUS_GPIO=m +CONFIG_GREYBUS_I2C=m +CONFIG_GREYBUS_PWM=m +CONFIG_GREYBUS_SDIO=m +CONFIG_GREYBUS_SPI=m +CONFIG_GREYBUS_UART=m +CONFIG_GREYBUS_USB=m +# CONFIG_DRM_VBOXVIDEO is not set +CONFIG_PI433=m +CONFIG_MTK_MMC=m +# CONFIG_MTK_AEE_KDUMP is not set +# CONFIG_MTK_MMC_CD_POLL is not set + +# +# Gasket devices +# +# CONFIG_STAGING_GASKET_FRAMEWORK is not set +# CONFIG_XIL_AXIS_FIFO is not set +# CONFIG_EROFS_FS is not set +CONFIG_X86_PLATFORM_DEVICES=y +CONFIG_ACER_WMI=m +CONFIG_ACER_WIRELESS=m +CONFIG_ACERHDF=m +CONFIG_ALIENWARE_WMI=m +CONFIG_ASUS_LAPTOP=m +CONFIG_DELL_SMBIOS=m +CONFIG_DELL_SMBIOS_WMI=y +CONFIG_DELL_SMBIOS_SMM=y +CONFIG_DELL_LAPTOP=m +CONFIG_DELL_WMI=m +CONFIG_DELL_WMI_DESCRIPTOR=m +CONFIG_DELL_WMI_AIO=m +CONFIG_DELL_WMI_LED=m +CONFIG_DELL_SMO8800=m +CONFIG_DELL_RBTN=m +CONFIG_FUJITSU_LAPTOP=m +CONFIG_FUJITSU_TABLET=m +CONFIG_AMILO_RFKILL=m +CONFIG_GPD_POCKET_FAN=m +CONFIG_HP_ACCEL=m +CONFIG_HP_WIRELESS=m +CONFIG_HP_WMI=m +CONFIG_MSI_LAPTOP=m +CONFIG_PANASONIC_LAPTOP=m +CONFIG_COMPAL_LAPTOP=m +CONFIG_SONY_LAPTOP=m +CONFIG_SONYPI_COMPAT=y +CONFIG_IDEAPAD_LAPTOP=m +CONFIG_SURFACE3_WMI=m +CONFIG_THINKPAD_ACPI=m +CONFIG_THINKPAD_ACPI_ALSA_SUPPORT=y +# CONFIG_THINKPAD_ACPI_DEBUGFACILITIES is not set +# CONFIG_THINKPAD_ACPI_DEBUG is not set +# CONFIG_THINKPAD_ACPI_UNSAFE_LEDS is not set +CONFIG_THINKPAD_ACPI_VIDEO=y +CONFIG_THINKPAD_ACPI_HOTKEY_POLL=y +CONFIG_SENSORS_HDAPS=m +CONFIG_INTEL_MENLOW=m +CONFIG_EEEPC_LAPTOP=m +CONFIG_ASUS_WMI=m +CONFIG_ASUS_NB_WMI=m +CONFIG_EEEPC_WMI=m +CONFIG_ASUS_WIRELESS=m +CONFIG_ACPI_WMI=m +CONFIG_WMI_BMOF=m +CONFIG_INTEL_WMI_THUNDERBOLT=m +CONFIG_MSI_WMI=m +CONFIG_PEAQ_WMI=m +CONFIG_TOPSTAR_LAPTOP=m +CONFIG_ACPI_TOSHIBA=m +CONFIG_TOSHIBA_BT_RFKILL=m +CONFIG_TOSHIBA_HAPS=m +CONFIG_TOSHIBA_WMI=m +CONFIG_ACPI_CMPC=m +CONFIG_INTEL_CHT_INT33FE=m +CONFIG_INTEL_INT0002_VGPIO=m +CONFIG_INTEL_HID_EVENT=m +CONFIG_INTEL_VBTN=m +CONFIG_INTEL_IPS=m +CONFIG_INTEL_PMC_CORE=y +CONFIG_IBM_RTL=m +CONFIG_SAMSUNG_LAPTOP=m +CONFIG_MXM_WMI=m +CONFIG_INTEL_OAKTRAIL=m +CONFIG_SAMSUNG_Q10=m +CONFIG_APPLE_GMUX=m +CONFIG_INTEL_RST=m +CONFIG_INTEL_SMARTCONNECT=m +CONFIG_PVPANIC=m +CONFIG_INTEL_PMC_IPC=m +CONFIG_INTEL_BXTWC_PMIC_TMU=m +CONFIG_SURFACE_PRO3_BUTTON=m +CONFIG_SURFACE_3_BUTTON=m +CONFIG_INTEL_PUNIT_IPC=m +CONFIG_INTEL_TELEMETRY=m +CONFIG_MLX_PLATFORM=m +# CONFIG_INTEL_TURBO_MAX_3 is not set +CONFIG_INTEL_CHTDC_TI_PWRBTN=m +# CONFIG_I2C_MULTI_INSTANTIATE is not set +CONFIG_PMC_ATOM=y +CONFIG_CHROME_PLATFORMS=y +CONFIG_CHROMEOS_LAPTOP=m +CONFIG_CHROMEOS_PSTORE=m +CONFIG_CHROMEOS_TBMC=m +# CONFIG_CROS_EC_I2C is not set +# CONFIG_CROS_EC_SPI is not set +CONFIG_CROS_EC_LPC=m +CONFIG_CROS_EC_LPC_MEC=y +CONFIG_CROS_EC_PROTO=y +CONFIG_CROS_KBD_LED_BACKLIGHT=m +CONFIG_MELLANOX_PLATFORM=y +CONFIG_MLXREG_HOTPLUG=m +# CONFIG_MLXREG_IO is not set +CONFIG_CLKDEV_LOOKUP=y +CONFIG_HAVE_CLK_PREPARE=y +CONFIG_COMMON_CLK=y + +# +# Common Clock Framework +# +CONFIG_COMMON_CLK_WM831X=m +# CONFIG_COMMON_CLK_MAX9485 is not set +CONFIG_COMMON_CLK_SI5351=m +CONFIG_COMMON_CLK_SI544=m +CONFIG_COMMON_CLK_CDCE706=m +CONFIG_COMMON_CLK_CS2000_CP=m +CONFIG_COMMON_CLK_PWM=m +CONFIG_HWSPINLOCK=y + +# +# Clock Source drivers +# +CONFIG_CLKEVT_I8253=y +CONFIG_I8253_LOCK=y +CONFIG_CLKBLD_I8253=y +CONFIG_MAILBOX=y +CONFIG_PCC=y +CONFIG_ALTERA_MBOX=m +CONFIG_IOMMU_API=y +CONFIG_IOMMU_SUPPORT=y + +# +# Generic IOMMU Pagetable Support +# +# CONFIG_IOMMU_DEBUGFS is not set +# CONFIG_IOMMU_DEFAULT_PASSTHROUGH is not set +CONFIG_IOMMU_IOVA=y +CONFIG_AMD_IOMMU=y +CONFIG_AMD_IOMMU_V2=m +CONFIG_DMAR_TABLE=y +CONFIG_INTEL_IOMMU=y +CONFIG_INTEL_IOMMU_SVM=y +# CONFIG_INTEL_IOMMU_DEFAULT_ON is not set +CONFIG_INTEL_IOMMU_FLOPPY_WA=y +CONFIG_IRQ_REMAP=y + +# +# Remoteproc drivers +# +CONFIG_REMOTEPROC=m + +# +# Rpmsg drivers +# +CONFIG_RPMSG=m +CONFIG_RPMSG_CHAR=m +CONFIG_RPMSG_QCOM_GLINK_NATIVE=m +CONFIG_RPMSG_QCOM_GLINK_RPM=m +CONFIG_RPMSG_VIRTIO=m +CONFIG_SOUNDWIRE=y + +# +# SoundWire Devices +# +CONFIG_SOUNDWIRE_BUS=m +CONFIG_SOUNDWIRE_CADENCE=m +CONFIG_SOUNDWIRE_INTEL=m + +# +# SOC (System On Chip) specific Drivers +# + +# +# Amlogic SoC drivers +# + +# +# Broadcom SoC drivers +# + +# +# NXP/Freescale QorIQ SoC drivers +# + +# +# i.MX SoC drivers +# + +# +# Qualcomm SoC drivers +# +CONFIG_SOC_TI=y + +# +# Xilinx SoC drivers +# +CONFIG_XILINX_VCU=m +CONFIG_PM_DEVFREQ=y + +# +# DEVFREQ Governors +# +CONFIG_DEVFREQ_GOV_SIMPLE_ONDEMAND=y +CONFIG_DEVFREQ_GOV_PERFORMANCE=y +CONFIG_DEVFREQ_GOV_POWERSAVE=y +CONFIG_DEVFREQ_GOV_USERSPACE=y +CONFIG_DEVFREQ_GOV_PASSIVE=m + +# +# DEVFREQ Drivers +# +CONFIG_PM_DEVFREQ_EVENT=y +CONFIG_EXTCON=y + +# +# Extcon Device Drivers +# +CONFIG_EXTCON_ADC_JACK=m +CONFIG_EXTCON_ARIZONA=m +CONFIG_EXTCON_AXP288=m +CONFIG_EXTCON_GPIO=m +# CONFIG_EXTCON_INTEL_INT3496 is not set +CONFIG_EXTCON_MAX14577=m +CONFIG_EXTCON_MAX3355=m +CONFIG_EXTCON_MAX77693=m +CONFIG_EXTCON_RT8973A=m +CONFIG_EXTCON_SM5502=m +# CONFIG_EXTCON_USB_GPIO is not set +CONFIG_EXTCON_USBC_CROS_EC=m +CONFIG_MEMORY=y +CONFIG_IIO=m +CONFIG_IIO_BUFFER=y +CONFIG_IIO_BUFFER_CB=m +CONFIG_IIO_BUFFER_HW_CONSUMER=m +CONFIG_IIO_KFIFO_BUF=m +CONFIG_IIO_TRIGGERED_BUFFER=m +CONFIG_IIO_CONFIGFS=m +CONFIG_IIO_TRIGGER=y +CONFIG_IIO_CONSUMERS_PER_TRIGGER=2 +CONFIG_IIO_SW_DEVICE=m +CONFIG_IIO_SW_TRIGGER=m +CONFIG_IIO_TRIGGERED_EVENT=m + +# +# Accelerometers +# +CONFIG_ADIS16201=m +CONFIG_ADIS16209=m +CONFIG_BMA180=m +CONFIG_BMA220=m +CONFIG_BMC150_ACCEL=m +CONFIG_BMC150_ACCEL_I2C=m +CONFIG_BMC150_ACCEL_SPI=m +CONFIG_DA280=m +CONFIG_DA311=m +CONFIG_DMARD09=m +CONFIG_DMARD10=m +CONFIG_HID_SENSOR_ACCEL_3D=m +CONFIG_IIO_CROS_EC_ACCEL_LEGACY=m +CONFIG_IIO_ST_ACCEL_3AXIS=m +CONFIG_IIO_ST_ACCEL_I2C_3AXIS=m +CONFIG_IIO_ST_ACCEL_SPI_3AXIS=m +CONFIG_KXSD9=m +CONFIG_KXSD9_SPI=m +CONFIG_KXSD9_I2C=m +CONFIG_KXCJK1013=m +CONFIG_MC3230=m +CONFIG_MMA7455=m +CONFIG_MMA7455_I2C=m +CONFIG_MMA7455_SPI=m +CONFIG_MMA7660=m +CONFIG_MMA8452=m +CONFIG_MMA9551_CORE=m +CONFIG_MMA9551=m +CONFIG_MMA9553=m +CONFIG_MXC4005=m +CONFIG_MXC6255=m +CONFIG_SCA3000=m +CONFIG_STK8312=m +CONFIG_STK8BA50=m + +# +# Analog to digital converters +# +CONFIG_AD_SIGMA_DELTA=m +CONFIG_AD7266=m +CONFIG_AD7291=m +CONFIG_AD7298=m +CONFIG_AD7476=m +CONFIG_AD7766=m +CONFIG_AD7791=m +CONFIG_AD7793=m +CONFIG_AD7887=m +CONFIG_AD7923=m +CONFIG_AD799X=m +CONFIG_AXP20X_ADC=m +CONFIG_AXP288_ADC=m +CONFIG_CC10001_ADC=m +CONFIG_DA9150_GPADC=m +CONFIG_DLN2_ADC=m +CONFIG_HI8435=m +CONFIG_HX711=m +CONFIG_INA2XX_ADC=m +CONFIG_LTC2471=m +CONFIG_LTC2485=m +CONFIG_LTC2497=m +CONFIG_MAX1027=m +CONFIG_MAX11100=m +CONFIG_MAX1118=m +CONFIG_MAX1363=m +CONFIG_MAX9611=m +CONFIG_MCP320X=m +CONFIG_MCP3422=m +CONFIG_MEN_Z188_ADC=m +CONFIG_NAU7802=m +CONFIG_QCOM_VADC_COMMON=m +CONFIG_QCOM_SPMI_IADC=m +CONFIG_QCOM_SPMI_VADC=m +CONFIG_TI_ADC081C=m +CONFIG_TI_ADC0832=m +CONFIG_TI_ADC084S021=m +CONFIG_TI_ADC12138=m +CONFIG_TI_ADC108S102=m +CONFIG_TI_ADC128S052=m +CONFIG_TI_ADC161S626=m +CONFIG_TI_ADS1015=m +CONFIG_TI_ADS7950=m +CONFIG_TI_AM335X_ADC=m +CONFIG_TI_TLC4541=m +CONFIG_VIPERBOARD_ADC=m + +# +# Analog Front Ends +# + +# +# Amplifiers +# +CONFIG_AD8366=m + +# +# Chemical Sensors +# +CONFIG_ATLAS_PH_SENSOR=m +# CONFIG_BME680 is not set +CONFIG_CCS811=m +CONFIG_IAQCORE=m +CONFIG_VZ89X=m +CONFIG_IIO_CROS_EC_SENSORS_CORE=m +CONFIG_IIO_CROS_EC_SENSORS=m + +# +# Hid Sensor IIO Common +# +CONFIG_HID_SENSOR_IIO_COMMON=m +CONFIG_HID_SENSOR_IIO_TRIGGER=m +CONFIG_IIO_MS_SENSORS_I2C=m + +# +# SSP Sensor Common +# +CONFIG_IIO_SSP_SENSORS_COMMONS=m +CONFIG_IIO_SSP_SENSORHUB=m +CONFIG_IIO_ST_SENSORS_I2C=m +CONFIG_IIO_ST_SENSORS_SPI=m +CONFIG_IIO_ST_SENSORS_CORE=m + +# +# Counters +# + +# +# Digital to analog converters +# +CONFIG_AD5064=m +CONFIG_AD5360=m +CONFIG_AD5380=m +CONFIG_AD5421=m +CONFIG_AD5446=m +CONFIG_AD5449=m +CONFIG_AD5592R_BASE=m +CONFIG_AD5592R=m +CONFIG_AD5593R=m +CONFIG_AD5504=m +CONFIG_AD5624R_SPI=m +CONFIG_LTC2632=m +CONFIG_AD5686=m +CONFIG_AD5686_SPI=m +CONFIG_AD5696_I2C=m +CONFIG_AD5755=m +# CONFIG_AD5758 is not set +CONFIG_AD5761=m +CONFIG_AD5764=m +CONFIG_AD5791=m +CONFIG_AD7303=m +CONFIG_AD8801=m +CONFIG_DS4424=m +CONFIG_M62332=m +CONFIG_MAX517=m +CONFIG_MCP4725=m +CONFIG_MCP4922=m +CONFIG_TI_DAC082S085=m +CONFIG_TI_DAC5571=m + +# +# IIO dummy driver +# +# CONFIG_IIO_SIMPLE_DUMMY is not set + +# +# Frequency Synthesizers DDS/PLL +# + +# +# Clock Generator/Distribution +# +CONFIG_AD9523=m + +# +# Phase-Locked Loop (PLL) frequency synthesizers +# +CONFIG_ADF4350=m + +# +# Digital gyroscope sensors +# +CONFIG_ADIS16080=m +CONFIG_ADIS16130=m +CONFIG_ADIS16136=m +CONFIG_ADIS16260=m +CONFIG_ADXRS450=m +CONFIG_BMG160=m +CONFIG_BMG160_I2C=m +CONFIG_BMG160_SPI=m +CONFIG_HID_SENSOR_GYRO_3D=m +CONFIG_MPU3050=m +CONFIG_MPU3050_I2C=m +CONFIG_IIO_ST_GYRO_3AXIS=m +CONFIG_IIO_ST_GYRO_I2C_3AXIS=m +CONFIG_IIO_ST_GYRO_SPI_3AXIS=m +CONFIG_ITG3200=m + +# +# Health Sensors +# + +# +# Heart Rate Monitors +# +CONFIG_AFE4403=m +CONFIG_AFE4404=m +CONFIG_MAX30100=m +CONFIG_MAX30102=m + +# +# Humidity sensors +# +CONFIG_AM2315=m +CONFIG_DHT11=m +CONFIG_HDC100X=m +CONFIG_HID_SENSOR_HUMIDITY=m +CONFIG_HTS221=m +CONFIG_HTS221_I2C=m +CONFIG_HTS221_SPI=m +CONFIG_HTU21=m +CONFIG_SI7005=m +CONFIG_SI7020=m + +# +# Inertial measurement units +# +CONFIG_ADIS16400=m +CONFIG_ADIS16480=m +CONFIG_BMI160=m +CONFIG_BMI160_I2C=m +CONFIG_BMI160_SPI=m +CONFIG_KMX61=m +CONFIG_INV_MPU6050_IIO=m +CONFIG_INV_MPU6050_I2C=m +CONFIG_INV_MPU6050_SPI=m +CONFIG_IIO_ST_LSM6DSX=m +CONFIG_IIO_ST_LSM6DSX_I2C=m +CONFIG_IIO_ST_LSM6DSX_SPI=m +CONFIG_IIO_ADIS_LIB=m +CONFIG_IIO_ADIS_LIB_BUFFER=y + +# +# Light sensors +# +# CONFIG_ACPI_ALS is not set +CONFIG_ADJD_S311=m +CONFIG_AL3320A=m +CONFIG_APDS9300=m +CONFIG_APDS9960=m +CONFIG_BH1750=m +CONFIG_BH1780=m +CONFIG_CM32181=m +CONFIG_CM3232=m +CONFIG_CM3323=m +CONFIG_CM36651=m +CONFIG_IIO_CROS_EC_LIGHT_PROX=m +CONFIG_GP2AP020A00F=m +CONFIG_SENSORS_ISL29018=m +CONFIG_SENSORS_ISL29028=m +CONFIG_ISL29125=m +CONFIG_HID_SENSOR_ALS=m +CONFIG_HID_SENSOR_PROX=m +CONFIG_JSA1212=m +CONFIG_RPR0521=m +CONFIG_SENSORS_LM3533=m +CONFIG_LTR501=m +CONFIG_LV0104CS=m +CONFIG_MAX44000=m +CONFIG_OPT3001=m +CONFIG_PA12203001=m +# CONFIG_SI1133 is not set +CONFIG_SI1145=m +CONFIG_STK3310=m +CONFIG_ST_UVIS25=m +CONFIG_ST_UVIS25_I2C=m +CONFIG_ST_UVIS25_SPI=m +CONFIG_TCS3414=m +CONFIG_TCS3472=m +CONFIG_SENSORS_TSL2563=m +CONFIG_TSL2583=m +CONFIG_TSL2772=m +CONFIG_TSL4531=m +CONFIG_US5182D=m +CONFIG_VCNL4000=m +CONFIG_VEML6070=m +CONFIG_VL6180=m +CONFIG_ZOPT2201=m + +# +# Magnetometer sensors +# +CONFIG_AK8975=m +CONFIG_AK09911=m +CONFIG_BMC150_MAGN=m +CONFIG_BMC150_MAGN_I2C=m +CONFIG_BMC150_MAGN_SPI=m +CONFIG_MAG3110=m +CONFIG_HID_SENSOR_MAGNETOMETER_3D=m +CONFIG_MMC35240=m +CONFIG_IIO_ST_MAGN_3AXIS=m +CONFIG_IIO_ST_MAGN_I2C_3AXIS=m +CONFIG_IIO_ST_MAGN_SPI_3AXIS=m +CONFIG_SENSORS_HMC5843=m +CONFIG_SENSORS_HMC5843_I2C=m +CONFIG_SENSORS_HMC5843_SPI=m + +# +# Multiplexers +# + +# +# Inclinometer sensors +# +CONFIG_HID_SENSOR_INCLINOMETER_3D=m +CONFIG_HID_SENSOR_DEVICE_ROTATION=m + +# +# Triggers - standalone +# +CONFIG_IIO_HRTIMER_TRIGGER=m +CONFIG_IIO_INTERRUPT_TRIGGER=m +CONFIG_IIO_TIGHTLOOP_TRIGGER=m +CONFIG_IIO_SYSFS_TRIGGER=m + +# +# Digital potentiometers +# +CONFIG_AD5272=m +CONFIG_DS1803=m +CONFIG_MAX5481=m +CONFIG_MAX5487=m +CONFIG_MCP4018=m +CONFIG_MCP4131=m +CONFIG_MCP4531=m +CONFIG_TPL0102=m + +# +# Digital potentiostats +# +CONFIG_LMP91000=m + +# +# Pressure sensors +# +CONFIG_ABP060MG=m +CONFIG_BMP280=m +CONFIG_BMP280_I2C=m +CONFIG_BMP280_SPI=m +CONFIG_IIO_CROS_EC_BARO=m +CONFIG_HID_SENSOR_PRESS=m +CONFIG_HP03=m +CONFIG_MPL115=m +CONFIG_MPL115_I2C=m +CONFIG_MPL115_SPI=m +CONFIG_MPL3115=m +CONFIG_MS5611=m +CONFIG_MS5611_I2C=m +CONFIG_MS5611_SPI=m +CONFIG_MS5637=m +CONFIG_IIO_ST_PRESS=m +CONFIG_IIO_ST_PRESS_I2C=m +CONFIG_IIO_ST_PRESS_SPI=m +CONFIG_T5403=m +CONFIG_HP206C=m +CONFIG_ZPA2326=m +CONFIG_ZPA2326_I2C=m +CONFIG_ZPA2326_SPI=m + +# +# Lightning sensors +# +CONFIG_AS3935=m + +# +# Proximity and distance sensors +# +# CONFIG_ISL29501 is not set +CONFIG_LIDAR_LITE_V2=m +CONFIG_RFD77402=m +CONFIG_SRF04=m +CONFIG_SX9500=m +CONFIG_SRF08=m + +# +# Resolver to digital converters +# +CONFIG_AD2S1200=m + +# +# Temperature sensors +# +CONFIG_MAXIM_THERMOCOUPLE=m +CONFIG_HID_SENSOR_TEMP=m +CONFIG_MLX90614=m +CONFIG_MLX90632=m +CONFIG_TMP006=m +CONFIG_TMP007=m +CONFIG_TSYS01=m +CONFIG_TSYS02D=m +CONFIG_NTB=m +CONFIG_NTB_AMD=m +CONFIG_NTB_IDT=m +CONFIG_NTB_INTEL=m +CONFIG_NTB_SWITCHTEC=m +CONFIG_NTB_PINGPONG=m +CONFIG_NTB_TOOL=m +CONFIG_NTB_PERF=m +CONFIG_NTB_TRANSPORT=m +CONFIG_VME_BUS=y + +# +# VME Bridge Drivers +# +CONFIG_VME_CA91CX42=m +CONFIG_VME_TSI148=m +CONFIG_VME_FAKE=m + +# +# VME Board Drivers +# +CONFIG_VMIVME_7805=m + +# +# VME Device Drivers +# +CONFIG_VME_USER=m +CONFIG_PWM=y +CONFIG_PWM_SYSFS=y +CONFIG_PWM_CROS_EC=m +CONFIG_PWM_LP3943=m +CONFIG_PWM_LPSS=m +CONFIG_PWM_LPSS_PCI=m +CONFIG_PWM_LPSS_PLATFORM=m +CONFIG_PWM_PCA9685=m + +# +# IRQ chip support +# +CONFIG_ARM_GIC_MAX_NR=1 +CONFIG_IPACK_BUS=m +CONFIG_BOARD_TPCI200=m +CONFIG_SERIAL_IPOCTAL=m +CONFIG_RESET_CONTROLLER=y +CONFIG_RESET_TI_SYSCON=m +CONFIG_FMC=m +CONFIG_FMC_FAKEDEV=m +CONFIG_FMC_TRIVIAL=m +CONFIG_FMC_WRITE_EEPROM=m +CONFIG_FMC_CHARDEV=m + +# +# PHY Subsystem +# +CONFIG_GENERIC_PHY=y +CONFIG_BCM_KONA_USB2_PHY=m +CONFIG_PHY_PXA_28NM_HSIC=m +CONFIG_PHY_PXA_28NM_USB2=m +CONFIG_PHY_CPCAP_USB=m +CONFIG_PHY_QCOM_USB_HS=m +CONFIG_PHY_QCOM_USB_HSIC=m +CONFIG_PHY_SAMSUNG_USB2=m +CONFIG_PHY_TUSB1210=m +CONFIG_POWERCAP=y +CONFIG_INTEL_RAPL=m +# CONFIG_IDLE_INJECT is not set +CONFIG_MCB=m +CONFIG_MCB_PCI=m +CONFIG_MCB_LPC=m + +# +# Performance monitor support +# +CONFIG_RAS=y +CONFIG_RAS_CEC=y +CONFIG_THUNDERBOLT=m + +# +# Android +# +# CONFIG_ANDROID is not set +CONFIG_LIBNVDIMM=m +CONFIG_BLK_DEV_PMEM=m +CONFIG_ND_BLK=m +CONFIG_ND_CLAIM=y +CONFIG_ND_BTT=m +CONFIG_BTT=y +CONFIG_DAX_DRIVER=y +CONFIG_DAX=y +CONFIG_DEV_DAX=m +CONFIG_NVMEM=y +CONFIG_RAVE_SP_EEPROM=m + +# +# HW tracing support +# +CONFIG_STM=m +# CONFIG_STM_DUMMY is not set +CONFIG_STM_SOURCE_CONSOLE=m +CONFIG_STM_SOURCE_HEARTBEAT=m +# CONFIG_STM_SOURCE_FTRACE is not set +CONFIG_INTEL_TH=m +CONFIG_INTEL_TH_PCI=m +CONFIG_INTEL_TH_ACPI=m +CONFIG_INTEL_TH_GTH=m +CONFIG_INTEL_TH_STH=m +CONFIG_INTEL_TH_MSU=m +CONFIG_INTEL_TH_PTI=m +# CONFIG_INTEL_TH_DEBUG is not set +CONFIG_FPGA=m +CONFIG_ALTERA_PR_IP_CORE=m +CONFIG_FPGA_MGR_ALTERA_PS_SPI=m +CONFIG_FPGA_MGR_ALTERA_CVP=m +CONFIG_FPGA_MGR_XILINX_SPI=m +CONFIG_FPGA_MGR_MACHXO2_SPI=m +CONFIG_FPGA_BRIDGE=m +CONFIG_XILINX_PR_DECOUPLER=m +CONFIG_FPGA_REGION=m +# CONFIG_FPGA_DFL is not set +CONFIG_PM_OPP=y +# CONFIG_UNISYS_VISORBUS is not set +CONFIG_SIOX=m +CONFIG_SIOX_BUS_GPIO=m +CONFIG_SLIMBUS=m +CONFIG_SLIM_QCOM_CTRL=m + +# +# File systems +# +CONFIG_DCACHE_WORD_ACCESS=y +CONFIG_FS_IOMAP=y +CONFIG_EXT2_FS=m +CONFIG_EXT2_FS_XATTR=y +CONFIG_EXT2_FS_POSIX_ACL=y +CONFIG_EXT2_FS_SECURITY=y +CONFIG_EXT3_FS=m +CONFIG_EXT3_FS_POSIX_ACL=y +CONFIG_EXT3_FS_SECURITY=y +CONFIG_EXT4_FS=m +CONFIG_EXT4_FS_POSIX_ACL=y +CONFIG_EXT4_FS_SECURITY=y +CONFIG_EXT4_ENCRYPTION=y +CONFIG_EXT4_FS_ENCRYPTION=y +# CONFIG_EXT4_DEBUG is not set +CONFIG_JBD2=m +# CONFIG_JBD2_DEBUG is not set +CONFIG_FS_MBCACHE=m +CONFIG_REISERFS_FS=m +# CONFIG_REISERFS_CHECK is not set +CONFIG_REISERFS_PROC_INFO=y +CONFIG_REISERFS_FS_XATTR=y +CONFIG_REISERFS_FS_POSIX_ACL=y +CONFIG_REISERFS_FS_SECURITY=y +CONFIG_JFS_FS=m +CONFIG_JFS_POSIX_ACL=y +CONFIG_JFS_SECURITY=y +# CONFIG_JFS_DEBUG is not set +CONFIG_JFS_STATISTICS=y +CONFIG_XFS_FS=m +CONFIG_XFS_QUOTA=y +CONFIG_XFS_POSIX_ACL=y +CONFIG_XFS_RT=y +CONFIG_XFS_ONLINE_SCRUB=y +# CONFIG_XFS_ONLINE_REPAIR is not set +# CONFIG_XFS_WARN is not set +# CONFIG_XFS_DEBUG is not set +CONFIG_GFS2_FS=m +CONFIG_GFS2_FS_LOCKING_DLM=y +CONFIG_OCFS2_FS=m +CONFIG_OCFS2_FS_O2CB=m +CONFIG_OCFS2_FS_USERSPACE_CLUSTER=m +CONFIG_OCFS2_FS_STATS=y +CONFIG_OCFS2_DEBUG_MASKLOG=y +CONFIG_OCFS2_DEBUG_FS=y +CONFIG_BTRFS_FS=m +CONFIG_BTRFS_FS_POSIX_ACL=y +# CONFIG_BTRFS_FS_CHECK_INTEGRITY is not set +# CONFIG_BTRFS_FS_RUN_SANITY_TESTS is not set +# CONFIG_BTRFS_DEBUG is not set +# CONFIG_BTRFS_ASSERT is not set +# CONFIG_BTRFS_FS_REF_VERIFY is not set +CONFIG_NILFS2_FS=m +CONFIG_F2FS_FS=m +CONFIG_F2FS_STAT_FS=y +CONFIG_F2FS_FS_XATTR=y +CONFIG_F2FS_FS_POSIX_ACL=y +CONFIG_F2FS_FS_SECURITY=y +CONFIG_F2FS_CHECK_FS=y +CONFIG_F2FS_FS_ENCRYPTION=y +# CONFIG_F2FS_IO_TRACE is not set +# CONFIG_F2FS_FAULT_INJECTION is not set +CONFIG_FS_DAX=y +CONFIG_FS_POSIX_ACL=y +CONFIG_EXPORTFS=y +CONFIG_EXPORTFS_BLOCK_OPS=y +CONFIG_FILE_LOCKING=y +CONFIG_MANDATORY_FILE_LOCKING=y +CONFIG_FS_ENCRYPTION=m +CONFIG_FSNOTIFY=y +CONFIG_DNOTIFY=y +CONFIG_INOTIFY_USER=y +CONFIG_FANOTIFY=y +# CONFIG_FANOTIFY_ACCESS_PERMISSIONS is not set +CONFIG_QUOTA=y +CONFIG_QUOTA_NETLINK_INTERFACE=y +# CONFIG_PRINT_QUOTA_WARNING is not set +# CONFIG_QUOTA_DEBUG is not set +CONFIG_QUOTA_TREE=m +CONFIG_QFMT_V1=m +CONFIG_QFMT_V2=m +CONFIG_QUOTACTL=y +CONFIG_QUOTACTL_COMPAT=y +CONFIG_AUTOFS4_FS=m +CONFIG_AUTOFS_FS=m +CONFIG_FUSE_FS=m +CONFIG_CUSE=m +CONFIG_OVERLAY_FS=m +# CONFIG_OVERLAY_FS_REDIRECT_DIR is not set +CONFIG_OVERLAY_FS_REDIRECT_ALWAYS_FOLLOW=y +CONFIG_OVERLAY_FS_INDEX=y +CONFIG_OVERLAY_FS_NFS_EXPORT=y +# CONFIG_OVERLAY_FS_XINO_AUTO is not set +# CONFIG_OVERLAY_FS_METACOPY is not set + +# +# Caches +# +CONFIG_FSCACHE=m +CONFIG_FSCACHE_STATS=y +# CONFIG_FSCACHE_HISTOGRAM is not set +# CONFIG_FSCACHE_DEBUG is not set +# CONFIG_FSCACHE_OBJECT_LIST is not set +CONFIG_CACHEFILES=m +# CONFIG_CACHEFILES_DEBUG is not set +# CONFIG_CACHEFILES_HISTOGRAM is not set + +# +# CD-ROM/DVD Filesystems +# +CONFIG_ISO9660_FS=m +CONFIG_JOLIET=y +CONFIG_ZISOFS=y +CONFIG_UDF_FS=m + +# +# DOS/FAT/NT Filesystems +# +CONFIG_FAT_FS=m +CONFIG_MSDOS_FS=m +CONFIG_VFAT_FS=m +CONFIG_FAT_DEFAULT_CODEPAGE=437 +CONFIG_FAT_DEFAULT_IOCHARSET="utf8" +# CONFIG_FAT_DEFAULT_UTF8 is not set +CONFIG_NTFS_FS=m +# CONFIG_NTFS_DEBUG is not set +CONFIG_NTFS_RW=y + +# +# Pseudo filesystems +# +CONFIG_PROC_FS=y +CONFIG_PROC_KCORE=y +CONFIG_PROC_SYSCTL=y +CONFIG_PROC_PAGE_MONITOR=y +# CONFIG_PROC_CHILDREN is not set +CONFIG_KERNFS=y +CONFIG_SYSFS=y +CONFIG_TMPFS=y +CONFIG_TMPFS_POSIX_ACL=y +CONFIG_TMPFS_XATTR=y +CONFIG_HUGETLBFS=y +CONFIG_HUGETLB_PAGE=y +CONFIG_MEMFD_CREATE=y +CONFIG_ARCH_HAS_GIGANTIC_PAGE=y +CONFIG_CONFIGFS_FS=y +CONFIG_EFIVAR_FS=m +CONFIG_MISC_FILESYSTEMS=y +CONFIG_ORANGEFS_FS=m +CONFIG_ADFS_FS=m +# CONFIG_ADFS_FS_RW is not set +CONFIG_AFFS_FS=m +CONFIG_ECRYPT_FS=m +CONFIG_ECRYPT_FS_MESSAGING=y +CONFIG_HFS_FS=m +CONFIG_HFSPLUS_FS=m +CONFIG_BEFS_FS=m +# CONFIG_BEFS_DEBUG is not set +CONFIG_BFS_FS=m +CONFIG_EFS_FS=m +CONFIG_JFFS2_FS=m +CONFIG_JFFS2_FS_DEBUG=0 +CONFIG_JFFS2_FS_WRITEBUFFER=y +CONFIG_JFFS2_FS_WBUF_VERIFY=y +CONFIG_JFFS2_SUMMARY=y +CONFIG_JFFS2_FS_XATTR=y +CONFIG_JFFS2_FS_POSIX_ACL=y +CONFIG_JFFS2_FS_SECURITY=y +CONFIG_JFFS2_COMPRESSION_OPTIONS=y +CONFIG_JFFS2_ZLIB=y +CONFIG_JFFS2_LZO=y +CONFIG_JFFS2_RTIME=y +CONFIG_JFFS2_RUBIN=y +# CONFIG_JFFS2_CMODE_NONE is not set +CONFIG_JFFS2_CMODE_PRIORITY=y +# CONFIG_JFFS2_CMODE_SIZE is not set +# CONFIG_JFFS2_CMODE_FAVOURLZO is not set +CONFIG_UBIFS_FS=m +CONFIG_UBIFS_FS_ADVANCED_COMPR=y +CONFIG_UBIFS_FS_LZO=y +CONFIG_UBIFS_FS_ZLIB=y +CONFIG_UBIFS_ATIME_SUPPORT=y +CONFIG_UBIFS_FS_XATTR=y +CONFIG_UBIFS_FS_ENCRYPTION=y +CONFIG_UBIFS_FS_SECURITY=y +CONFIG_CRAMFS=m +CONFIG_CRAMFS_BLOCKDEV=y +CONFIG_CRAMFS_MTD=y +CONFIG_SQUASHFS=m +# CONFIG_SQUASHFS_FILE_CACHE is not set +CONFIG_SQUASHFS_FILE_DIRECT=y +# CONFIG_SQUASHFS_DECOMP_SINGLE is not set +# CONFIG_SQUASHFS_DECOMP_MULTI is not set +CONFIG_SQUASHFS_DECOMP_MULTI_PERCPU=y +CONFIG_SQUASHFS_XATTR=y +CONFIG_SQUASHFS_ZLIB=y +CONFIG_SQUASHFS_LZ4=y +CONFIG_SQUASHFS_LZO=y +CONFIG_SQUASHFS_XZ=y +CONFIG_SQUASHFS_ZSTD=y +CONFIG_SQUASHFS_4K_DEVBLK_SIZE=y +CONFIG_SQUASHFS_EMBEDDED=y +CONFIG_SQUASHFS_FRAGMENT_CACHE_SIZE=3 +CONFIG_VXFS_FS=m +CONFIG_MINIX_FS=m +CONFIG_OMFS_FS=m +CONFIG_HPFS_FS=m +CONFIG_QNX4FS_FS=m +CONFIG_QNX6FS_FS=m +# CONFIG_QNX6FS_DEBUG is not set +CONFIG_ROMFS_FS=m +# CONFIG_ROMFS_BACKED_BY_BLOCK is not set +# CONFIG_ROMFS_BACKED_BY_MTD is not set +CONFIG_ROMFS_BACKED_BY_BOTH=y +CONFIG_ROMFS_ON_BLOCK=y +CONFIG_ROMFS_ON_MTD=y +CONFIG_PSTORE=y +CONFIG_PSTORE_DEFLATE_COMPRESS=y +CONFIG_PSTORE_LZO_COMPRESS=y +CONFIG_PSTORE_LZ4_COMPRESS=y +CONFIG_PSTORE_LZ4HC_COMPRESS=y +CONFIG_PSTORE_842_COMPRESS=y +CONFIG_PSTORE_ZSTD_COMPRESS=y +CONFIG_PSTORE_COMPRESS=y +# CONFIG_PSTORE_DEFLATE_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_LZO_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_LZ4_COMPRESS_DEFAULT=y +# CONFIG_PSTORE_LZ4HC_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_842_COMPRESS_DEFAULT is not set +# CONFIG_PSTORE_ZSTD_COMPRESS_DEFAULT is not set +CONFIG_PSTORE_COMPRESS_DEFAULT="lz4" +# CONFIG_PSTORE_CONSOLE is not set +CONFIG_PSTORE_PMSG=y +# CONFIG_PSTORE_FTRACE is not set +CONFIG_PSTORE_RAM=m +CONFIG_SYSV_FS=m +CONFIG_UFS_FS=m +# CONFIG_UFS_FS_WRITE is not set +# CONFIG_UFS_DEBUG is not set +CONFIG_EXOFS_FS=m +# CONFIG_EXOFS_DEBUG is not set +CONFIG_ORE=m +CONFIG_NETWORK_FILESYSTEMS=y +CONFIG_NFS_FS=m +CONFIG_NFS_V2=m +CONFIG_NFS_V3=m +CONFIG_NFS_V3_ACL=y +CONFIG_NFS_V4=m +CONFIG_NFS_SWAP=y +CONFIG_NFS_V4_1=y +CONFIG_NFS_V4_2=y +CONFIG_PNFS_FILE_LAYOUT=m +CONFIG_PNFS_BLOCK=m +CONFIG_PNFS_FLEXFILE_LAYOUT=m +CONFIG_NFS_V4_1_IMPLEMENTATION_ID_DOMAIN="kernel.org" +CONFIG_NFS_V4_1_MIGRATION=y +CONFIG_NFS_V4_SECURITY_LABEL=y +CONFIG_NFS_FSCACHE=y +# CONFIG_NFS_USE_LEGACY_DNS is not set +CONFIG_NFS_USE_KERNEL_DNS=y +CONFIG_NFSD=m +CONFIG_NFSD_V2_ACL=y +CONFIG_NFSD_V3=y +CONFIG_NFSD_V3_ACL=y +CONFIG_NFSD_V4=y +CONFIG_NFSD_PNFS=y +CONFIG_NFSD_BLOCKLAYOUT=y +CONFIG_NFSD_SCSILAYOUT=y +CONFIG_NFSD_FLEXFILELAYOUT=y +# CONFIG_NFSD_V4_SECURITY_LABEL is not set +# CONFIG_NFSD_FAULT_INJECTION is not set +CONFIG_GRACE_PERIOD=m +CONFIG_LOCKD=m +CONFIG_LOCKD_V4=y +CONFIG_NFS_ACL_SUPPORT=m +CONFIG_NFS_COMMON=y +CONFIG_SUNRPC=m +CONFIG_SUNRPC_GSS=m +CONFIG_SUNRPC_BACKCHANNEL=y +CONFIG_SUNRPC_SWAP=y +CONFIG_RPCSEC_GSS_KRB5=m +# CONFIG_SUNRPC_DEBUG is not set +CONFIG_SUNRPC_XPRT_RDMA=m +CONFIG_CEPH_FS=m +CONFIG_CEPH_FSCACHE=y +CONFIG_CEPH_FS_POSIX_ACL=y +CONFIG_CIFS=m +CONFIG_CIFS_STATS2=y +CONFIG_CIFS_ALLOW_INSECURE_LEGACY=y +CONFIG_CIFS_WEAK_PW_HASH=y +CONFIG_CIFS_UPCALL=y +CONFIG_CIFS_XATTR=y +CONFIG_CIFS_POSIX=y +CONFIG_CIFS_ACL=y +# CONFIG_CIFS_DEBUG is not set +CONFIG_CIFS_DFS_UPCALL=y +# CONFIG_CIFS_SMB_DIRECT is not set +CONFIG_CIFS_FSCACHE=y +CONFIG_CODA_FS=m +CONFIG_AFS_FS=m +# CONFIG_AFS_DEBUG is not set +CONFIG_AFS_FSCACHE=y +CONFIG_9P_FS=m +CONFIG_9P_FSCACHE=y +CONFIG_9P_FS_POSIX_ACL=y +CONFIG_9P_FS_SECURITY=y +CONFIG_NLS=y +CONFIG_NLS_DEFAULT="utf8" +CONFIG_NLS_CODEPAGE_437=m +CONFIG_NLS_CODEPAGE_737=m +CONFIG_NLS_CODEPAGE_775=m +CONFIG_NLS_CODEPAGE_850=m +CONFIG_NLS_CODEPAGE_852=m +CONFIG_NLS_CODEPAGE_855=m +CONFIG_NLS_CODEPAGE_857=m +CONFIG_NLS_CODEPAGE_860=m +CONFIG_NLS_CODEPAGE_861=m +CONFIG_NLS_CODEPAGE_862=m +CONFIG_NLS_CODEPAGE_863=m +CONFIG_NLS_CODEPAGE_864=m +CONFIG_NLS_CODEPAGE_865=m +CONFIG_NLS_CODEPAGE_866=m +CONFIG_NLS_CODEPAGE_869=m +CONFIG_NLS_CODEPAGE_936=m +CONFIG_NLS_CODEPAGE_950=m +CONFIG_NLS_CODEPAGE_932=m +CONFIG_NLS_CODEPAGE_949=m +CONFIG_NLS_CODEPAGE_874=m +CONFIG_NLS_ISO8859_8=m +CONFIG_NLS_CODEPAGE_1250=m +CONFIG_NLS_CODEPAGE_1251=m +CONFIG_NLS_ASCII=m +CONFIG_NLS_ISO8859_1=m +CONFIG_NLS_ISO8859_2=m +CONFIG_NLS_ISO8859_3=m +CONFIG_NLS_ISO8859_4=m +CONFIG_NLS_ISO8859_5=m +CONFIG_NLS_ISO8859_6=m +CONFIG_NLS_ISO8859_7=m +CONFIG_NLS_ISO8859_9=m +CONFIG_NLS_ISO8859_13=m +CONFIG_NLS_ISO8859_14=m +CONFIG_NLS_ISO8859_15=m +CONFIG_NLS_KOI8_R=m +CONFIG_NLS_KOI8_U=m +CONFIG_NLS_MAC_ROMAN=m +CONFIG_NLS_MAC_CELTIC=m +CONFIG_NLS_MAC_CENTEURO=m +CONFIG_NLS_MAC_CROATIAN=m +CONFIG_NLS_MAC_CYRILLIC=m +CONFIG_NLS_MAC_GAELIC=m +CONFIG_NLS_MAC_GREEK=m +CONFIG_NLS_MAC_ICELAND=m +CONFIG_NLS_MAC_INUIT=m +CONFIG_NLS_MAC_ROMANIAN=m +CONFIG_NLS_MAC_TURKISH=m +CONFIG_NLS_UTF8=m +CONFIG_DLM=m +# CONFIG_DLM_DEBUG is not set + +# +# Security options +# +CONFIG_KEYS=y +CONFIG_KEYS_COMPAT=y +CONFIG_PERSISTENT_KEYRINGS=y +# CONFIG_BIG_KEYS is not set +CONFIG_TRUSTED_KEYS=m +CONFIG_ENCRYPTED_KEYS=m +# CONFIG_KEY_DH_OPERATIONS is not set +CONFIG_SECURITY_DMESG_RESTRICT=y +CONFIG_SECURITY_PERF_EVENTS_RESTRICT=y +CONFIG_SECURITY_TIOCSTI_RESTRICT=y +CONFIG_SECURITY=y +CONFIG_SECURITYFS=y +CONFIG_SECURITY_NETWORK=y +CONFIG_PAGE_TABLE_ISOLATION=y +# CONFIG_SECURITY_INFINIBAND is not set +# CONFIG_SECURITY_NETWORK_XFRM is not set +CONFIG_SECURITY_PATH=y +CONFIG_INTEL_TXT=y +CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y +CONFIG_HARDENED_USERCOPY=y +CONFIG_HARDENED_USERCOPY_FALLBACK=y +CONFIG_FORTIFY_SOURCE=y +CONFIG_PAGE_SANITIZE=y +CONFIG_PAGE_SANITIZE_VERIFY=y +# CONFIG_STATIC_USERMODEHELPER is not set +# CONFIG_SECURITY_SELINUX is not set +# CONFIG_SECURITY_SMACK is not set +# CONFIG_SECURITY_TOMOYO is not set +CONFIG_SECURITY_APPARMOR=y +CONFIG_SECURITY_APPARMOR_BOOTPARAM_VALUE=1 +CONFIG_SECURITY_APPARMOR_HASH=y +CONFIG_SECURITY_APPARMOR_HASH_DEFAULT=y +# CONFIG_SECURITY_APPARMOR_DEBUG is not set +# CONFIG_SECURITY_LOADPIN is not set +CONFIG_SECURITY_YAMA=y +# CONFIG_INTEGRITY is not set +CONFIG_DEFAULT_SECURITY_APPARMOR=y +# CONFIG_DEFAULT_SECURITY_DAC is not set +CONFIG_DEFAULT_SECURITY="apparmor" +CONFIG_XOR_BLOCKS=m +CONFIG_ASYNC_CORE=m +CONFIG_ASYNC_MEMCPY=m +CONFIG_ASYNC_XOR=m +CONFIG_ASYNC_PQ=m +CONFIG_ASYNC_RAID6_RECOV=m +CONFIG_CRYPTO=y + +# +# Crypto core or helper +# +CONFIG_CRYPTO_ALGAPI=y +CONFIG_CRYPTO_ALGAPI2=y +CONFIG_CRYPTO_AEAD=m +CONFIG_CRYPTO_AEAD2=y +CONFIG_CRYPTO_BLKCIPHER=y +CONFIG_CRYPTO_BLKCIPHER2=y +CONFIG_CRYPTO_HASH=y +CONFIG_CRYPTO_HASH2=y +CONFIG_CRYPTO_RNG=m +CONFIG_CRYPTO_RNG2=y +CONFIG_CRYPTO_RNG_DEFAULT=m +CONFIG_CRYPTO_AKCIPHER2=y +CONFIG_CRYPTO_AKCIPHER=y +CONFIG_CRYPTO_KPP2=y +CONFIG_CRYPTO_KPP=m +CONFIG_CRYPTO_ACOMP2=y +CONFIG_CRYPTO_RSA=y +CONFIG_CRYPTO_DH=m +CONFIG_CRYPTO_ECDH=m +CONFIG_CRYPTO_MANAGER=y +CONFIG_CRYPTO_MANAGER2=y +CONFIG_CRYPTO_USER=m +CONFIG_CRYPTO_MANAGER_DISABLE_TESTS=y +CONFIG_CRYPTO_GF128MUL=m +CONFIG_CRYPTO_NULL=m +CONFIG_CRYPTO_NULL2=y +CONFIG_CRYPTO_PCRYPT=m +CONFIG_CRYPTO_WORKQUEUE=y +CONFIG_CRYPTO_CRYPTD=m +CONFIG_CRYPTO_MCRYPTD=m +CONFIG_CRYPTO_AUTHENC=m +CONFIG_CRYPTO_TEST=m +CONFIG_CRYPTO_SIMD=m +CONFIG_CRYPTO_GLUE_HELPER_X86=m +CONFIG_CRYPTO_ENGINE=m + +# +# Authenticated Encryption with Associated Data +# +CONFIG_CRYPTO_CCM=m +CONFIG_CRYPTO_GCM=m +CONFIG_CRYPTO_CHACHA20POLY1305=m +CONFIG_CRYPTO_AEGIS128=m +CONFIG_CRYPTO_AEGIS128L=m +CONFIG_CRYPTO_AEGIS256=m +CONFIG_CRYPTO_AEGIS128_AESNI_SSE2=m +CONFIG_CRYPTO_AEGIS128L_AESNI_SSE2=m +CONFIG_CRYPTO_AEGIS256_AESNI_SSE2=m +CONFIG_CRYPTO_MORUS640=m +CONFIG_CRYPTO_MORUS640_GLUE=m +CONFIG_CRYPTO_MORUS640_SSE2=m +CONFIG_CRYPTO_MORUS1280=m +CONFIG_CRYPTO_MORUS1280_GLUE=m +CONFIG_CRYPTO_MORUS1280_SSE2=m +CONFIG_CRYPTO_MORUS1280_AVX2=m +CONFIG_CRYPTO_SEQIV=m +CONFIG_CRYPTO_ECHAINIV=m + +# +# Block modes +# +CONFIG_CRYPTO_CBC=m +CONFIG_CRYPTO_CFB=m +CONFIG_CRYPTO_CTR=m +CONFIG_CRYPTO_CTS=m +CONFIG_CRYPTO_ECB=y +CONFIG_CRYPTO_LRW=m +CONFIG_CRYPTO_PCBC=m +CONFIG_CRYPTO_XTS=m +CONFIG_CRYPTO_KEYWRAP=m + +# +# Hash modes +# +CONFIG_CRYPTO_CMAC=m +CONFIG_CRYPTO_HMAC=m +CONFIG_CRYPTO_XCBC=m +CONFIG_CRYPTO_VMAC=m + +# +# Digest +# +CONFIG_CRYPTO_CRC32C=m +CONFIG_CRYPTO_CRC32C_INTEL=m +CONFIG_CRYPTO_CRC32=m +CONFIG_CRYPTO_CRC32_PCLMUL=m +CONFIG_CRYPTO_CRCT10DIF=y +CONFIG_CRYPTO_CRCT10DIF_PCLMUL=m +CONFIG_CRYPTO_GHASH=m +CONFIG_CRYPTO_POLY1305=m +CONFIG_CRYPTO_POLY1305_X86_64=m +CONFIG_CRYPTO_MD4=m +CONFIG_CRYPTO_MD5=y +CONFIG_CRYPTO_MICHAEL_MIC=m +CONFIG_CRYPTO_RMD128=m +CONFIG_CRYPTO_RMD160=m +CONFIG_CRYPTO_RMD256=m +CONFIG_CRYPTO_RMD320=m +CONFIG_CRYPTO_SHA1=y +CONFIG_CRYPTO_SHA1_SSSE3=m +CONFIG_CRYPTO_SHA256_SSSE3=m +CONFIG_CRYPTO_SHA512_SSSE3=m +CONFIG_CRYPTO_SHA1_MB=m +CONFIG_CRYPTO_SHA256_MB=m +CONFIG_CRYPTO_SHA512_MB=m +CONFIG_CRYPTO_SHA256=m +CONFIG_CRYPTO_SHA512=y +CONFIG_CRYPTO_SHA3=m +CONFIG_CRYPTO_SM3=m +CONFIG_CRYPTO_TGR192=m +CONFIG_CRYPTO_WP512=m +CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=m + +# +# Ciphers +# +CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=m +CONFIG_CRYPTO_AES_X86_64=m +CONFIG_CRYPTO_AES_NI_INTEL=m +CONFIG_CRYPTO_ANUBIS=m +CONFIG_CRYPTO_ARC4=m +CONFIG_CRYPTO_BLOWFISH=m +CONFIG_CRYPTO_BLOWFISH_COMMON=m +CONFIG_CRYPTO_BLOWFISH_X86_64=m +CONFIG_CRYPTO_CAMELLIA=m +CONFIG_CRYPTO_CAMELLIA_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX_X86_64=m +CONFIG_CRYPTO_CAMELLIA_AESNI_AVX2_X86_64=m +CONFIG_CRYPTO_CAST_COMMON=m +CONFIG_CRYPTO_CAST5=m +CONFIG_CRYPTO_CAST5_AVX_X86_64=m +CONFIG_CRYPTO_CAST6=m +CONFIG_CRYPTO_CAST6_AVX_X86_64=m +CONFIG_CRYPTO_DES=m +CONFIG_CRYPTO_DES3_EDE_X86_64=m +CONFIG_CRYPTO_FCRYPT=m +CONFIG_CRYPTO_KHAZAD=m +CONFIG_CRYPTO_SALSA20=m +CONFIG_CRYPTO_CHACHA20=m +CONFIG_CRYPTO_CHACHA20_X86_64=m +CONFIG_CRYPTO_SEED=m +CONFIG_CRYPTO_SERPENT=m +CONFIG_CRYPTO_SERPENT_SSE2_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX_X86_64=m +CONFIG_CRYPTO_SERPENT_AVX2_X86_64=m +CONFIG_CRYPTO_SM4=m +CONFIG_CRYPTO_TEA=m +CONFIG_CRYPTO_TWOFISH=m +CONFIG_CRYPTO_TWOFISH_COMMON=m +CONFIG_CRYPTO_TWOFISH_X86_64=m +CONFIG_CRYPTO_TWOFISH_X86_64_3WAY=m +CONFIG_CRYPTO_TWOFISH_AVX_X86_64=m + +# +# Compression +# +CONFIG_CRYPTO_DEFLATE=y +CONFIG_CRYPTO_LZO=y +CONFIG_CRYPTO_842=y +CONFIG_CRYPTO_LZ4=y +CONFIG_CRYPTO_LZ4HC=y +CONFIG_CRYPTO_ZSTD=y + +# +# Random Number Generation +# +CONFIG_CRYPTO_ANSI_CPRNG=m +CONFIG_CRYPTO_DRBG_MENU=m +CONFIG_CRYPTO_DRBG_HMAC=y +CONFIG_CRYPTO_DRBG_HASH=y +CONFIG_CRYPTO_DRBG_CTR=y +CONFIG_CRYPTO_DRBG=m +CONFIG_CRYPTO_JITTERENTROPY=m +CONFIG_CRYPTO_USER_API=m +CONFIG_CRYPTO_USER_API_HASH=m +CONFIG_CRYPTO_USER_API_SKCIPHER=m +CONFIG_CRYPTO_USER_API_RNG=m +CONFIG_CRYPTO_USER_API_AEAD=m +CONFIG_CRYPTO_HASH_INFO=y +CONFIG_CRYPTO_HW=y +CONFIG_CRYPTO_DEV_PADLOCK=m +CONFIG_CRYPTO_DEV_PADLOCK_AES=m +CONFIG_CRYPTO_DEV_PADLOCK_SHA=m +CONFIG_CRYPTO_DEV_CCP=y +CONFIG_CRYPTO_DEV_CCP_DD=m +CONFIG_CRYPTO_DEV_SP_CCP=y +CONFIG_CRYPTO_DEV_CCP_CRYPTO=m +CONFIG_CRYPTO_DEV_SP_PSP=y +CONFIG_CRYPTO_DEV_QAT=m +CONFIG_CRYPTO_DEV_QAT_DH895xCC=m +CONFIG_CRYPTO_DEV_QAT_C3XXX=m +CONFIG_CRYPTO_DEV_QAT_C62X=m +CONFIG_CRYPTO_DEV_QAT_DH895xCCVF=m +CONFIG_CRYPTO_DEV_QAT_C3XXXVF=m +CONFIG_CRYPTO_DEV_QAT_C62XVF=m +CONFIG_CRYPTO_DEV_NITROX=m +CONFIG_CRYPTO_DEV_NITROX_CNN55XX=m +CONFIG_CRYPTO_DEV_CHELSIO=m +CONFIG_CHELSIO_IPSEC_INLINE=y +CONFIG_CRYPTO_DEV_CHELSIO_TLS=m +CONFIG_CRYPTO_DEV_VIRTIO=m +CONFIG_ASYMMETRIC_KEY_TYPE=y +CONFIG_ASYMMETRIC_PUBLIC_KEY_SUBTYPE=y +CONFIG_X509_CERTIFICATE_PARSER=y +CONFIG_PKCS7_MESSAGE_PARSER=y +CONFIG_PKCS7_TEST_KEY=m +CONFIG_SIGNED_PE_FILE_VERIFICATION=y + +# +# Certificates for signature checking +# +CONFIG_MODULE_SIG_KEY="certs/signing_key.pem" +CONFIG_SYSTEM_TRUSTED_KEYRING=y +CONFIG_SYSTEM_TRUSTED_KEYS="" +# CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set +CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" +CONFIG_BINARY_PRINTF=y + +# +# Library routines +# +CONFIG_RAID6_PQ=m +CONFIG_BITREVERSE=y +CONFIG_RATIONAL=y +CONFIG_GENERIC_STRNCPY_FROM_USER=y +CONFIG_GENERIC_STRNLEN_USER=y +CONFIG_GENERIC_NET_UTILS=y +CONFIG_GENERIC_FIND_FIRST_BIT=y +CONFIG_GENERIC_PCI_IOMAP=y +CONFIG_GENERIC_IOMAP=y +CONFIG_ARCH_USE_CMPXCHG_LOCKREF=y +CONFIG_ARCH_HAS_FAST_MULTIPLIER=y +CONFIG_CRC_CCITT=m +CONFIG_CRC16=m +CONFIG_CRC_T10DIF=y +CONFIG_CRC_ITU_T=m +CONFIG_CRC32=y +# CONFIG_CRC32_SELFTEST is not set +CONFIG_CRC32_SLICEBY8=y +# CONFIG_CRC32_SLICEBY4 is not set +# CONFIG_CRC32_SARWATE is not set +# CONFIG_CRC32_BIT is not set +CONFIG_CRC64=m +CONFIG_CRC4=m +CONFIG_CRC7=m +CONFIG_LIBCRC32C=m +CONFIG_CRC8=m +CONFIG_XXHASH=y +# CONFIG_RANDOM32_SELFTEST is not set +CONFIG_842_COMPRESS=y +CONFIG_842_DECOMPRESS=y +CONFIG_ZLIB_INFLATE=y +CONFIG_ZLIB_DEFLATE=y +CONFIG_LZO_COMPRESS=y +CONFIG_LZO_DECOMPRESS=y +CONFIG_LZ4_COMPRESS=y +CONFIG_LZ4HC_COMPRESS=y +CONFIG_LZ4_DECOMPRESS=y +CONFIG_ZSTD_COMPRESS=y +CONFIG_ZSTD_DECOMPRESS=y +CONFIG_XZ_DEC=y +CONFIG_XZ_DEC_X86=y +CONFIG_XZ_DEC_POWERPC=y +CONFIG_XZ_DEC_IA64=y +CONFIG_XZ_DEC_ARM=y +CONFIG_XZ_DEC_ARMTHUMB=y +CONFIG_XZ_DEC_SPARC=y +CONFIG_XZ_DEC_BCJ=y +# CONFIG_XZ_DEC_TEST is not set +CONFIG_DECOMPRESS_GZIP=y +CONFIG_DECOMPRESS_BZIP2=y +CONFIG_DECOMPRESS_LZMA=y +CONFIG_DECOMPRESS_XZ=y +CONFIG_DECOMPRESS_LZO=y +CONFIG_DECOMPRESS_LZ4=y +CONFIG_GENERIC_ALLOCATOR=y +CONFIG_REED_SOLOMON=m +CONFIG_REED_SOLOMON_ENC8=y +CONFIG_REED_SOLOMON_DEC8=y +CONFIG_REED_SOLOMON_DEC16=y +CONFIG_BCH=m +CONFIG_BCH_CONST_PARAMS=y +CONFIG_TEXTSEARCH=y +CONFIG_TEXTSEARCH_KMP=m +CONFIG_TEXTSEARCH_BM=m +CONFIG_TEXTSEARCH_FSM=m +CONFIG_BTREE=y +CONFIG_INTERVAL_TREE=y +CONFIG_RADIX_TREE_MULTIORDER=y +CONFIG_ASSOCIATIVE_ARRAY=y +CONFIG_HAS_IOMEM=y +CONFIG_HAS_IOPORT_MAP=y +CONFIG_HAS_DMA=y +CONFIG_NEED_SG_DMA_LENGTH=y +CONFIG_NEED_DMA_MAP_STATE=y +CONFIG_ARCH_DMA_ADDR_T_64BIT=y +CONFIG_DMA_DIRECT_OPS=y +CONFIG_DMA_VIRT_OPS=y +CONFIG_SWIOTLB=y +CONFIG_SGL_ALLOC=y +CONFIG_IOMMU_HELPER=y +CONFIG_CHECK_SIGNATURE=y +CONFIG_CPUMASK_OFFSTACK=y +CONFIG_CPU_RMAP=y +CONFIG_DQL=y +CONFIG_GLOB=y +# CONFIG_GLOB_SELFTEST is not set +CONFIG_NLATTR=y +CONFIG_LRU_CACHE=m +CONFIG_CLZ_TAB=y +CONFIG_CORDIC=m +CONFIG_DDR=y +CONFIG_IRQ_POLL=y +CONFIG_MPILIB=y +CONFIG_OID_REGISTRY=y +CONFIG_UCS2_STRING=y +CONFIG_FONT_SUPPORT=y +CONFIG_FONTS=y +CONFIG_FONT_8x8=y +CONFIG_FONT_8x16=y +# CONFIG_FONT_6x11 is not set +# CONFIG_FONT_7x14 is not set +# CONFIG_FONT_PEARL_8x8 is not set +# CONFIG_FONT_ACORN_8x8 is not set +# CONFIG_FONT_MINI_4x6 is not set +# CONFIG_FONT_6x10 is not set +# CONFIG_FONT_10x18 is not set +# CONFIG_FONT_SUN8x16 is not set +# CONFIG_FONT_SUN12x22 is not set +CONFIG_SG_POOL=y +CONFIG_ARCH_HAS_SG_CHAIN=y +CONFIG_ARCH_HAS_PMEM_API=y +CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE=y +CONFIG_ARCH_HAS_UACCESS_MCSAFE=y +CONFIG_SBITMAP=y +CONFIG_PARMAN=m +CONFIG_PRIME_NUMBERS=m +# CONFIG_STRING_SELFTEST is not set + +# +# Kernel hacking +# + +# +# printk and dmesg options +# +# CONFIG_PRINTK_TIME is not set +CONFIG_CONSOLE_LOGLEVEL_DEFAULT=1 +CONFIG_CONSOLE_LOGLEVEL_QUIET=4 +CONFIG_MESSAGE_LOGLEVEL_DEFAULT=1 +# CONFIG_BOOT_PRINTK_DELAY is not set +# CONFIG_DYNAMIC_DEBUG is not set + +# +# Compile-time checks and compiler options +# +# CONFIG_DEBUG_INFO is not set +# CONFIG_ENABLE_MUST_CHECK is not set +CONFIG_FRAME_WARN=0 +CONFIG_STRIP_ASM_SYMS=y +# CONFIG_READABLE_ASM is not set +# CONFIG_UNUSED_SYMBOLS is not set +# CONFIG_PAGE_OWNER is not set +CONFIG_DEBUG_FS=y +# CONFIG_HEADERS_CHECK is not set +# CONFIG_DEBUG_SECTION_MISMATCH is not set +CONFIG_SECTION_MISMATCH_WARN_ONLY=y +CONFIG_STACK_VALIDATION=y +# CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set +CONFIG_MAGIC_SYSRQ=y +CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 +CONFIG_MAGIC_SYSRQ_SERIAL=y +CONFIG_DEBUG_KERNEL=y + +# +# Memory Debugging +# +# CONFIG_PAGE_EXTENSION is not set +# CONFIG_DEBUG_PAGEALLOC is not set +# CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_PAGE_REF is not set +# CONFIG_DEBUG_RODATA_TEST is not set +# CONFIG_DEBUG_OBJECTS is not set +CONFIG_SLUB_DEBUG_ON=y +# CONFIG_SLUB_STATS is not set +CONFIG_HAVE_DEBUG_KMEMLEAK=y +# CONFIG_DEBUG_KMEMLEAK is not set +# CONFIG_DEBUG_STACK_USAGE is not set +# CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y +# CONFIG_DEBUG_VIRTUAL is not set +CONFIG_DEBUG_MEMORY_INIT=y +# CONFIG_DEBUG_PER_CPU_MAPS is not set +CONFIG_HAVE_DEBUG_STACKOVERFLOW=y +# CONFIG_DEBUG_STACKOVERFLOW is not set +CONFIG_HAVE_ARCH_KASAN=y +# CONFIG_KASAN is not set +CONFIG_ARCH_HAS_KCOV=y +CONFIG_CC_HAS_SANCOV_TRACE_PC=y +# CONFIG_KCOV is not set +# CONFIG_DEBUG_SHIRQ is not set + +# +# Debug Lockups and Hangs +# +# CONFIG_SOFTLOCKUP_DETECTOR is not set +CONFIG_HARDLOCKUP_CHECK_TIMESTAMP=y +# CONFIG_HARDLOCKUP_DETECTOR is not set +# CONFIG_DETECT_HUNG_TASK is not set +# CONFIG_WQ_WATCHDOG is not set +# CONFIG_PANIC_ON_OOPS is not set +CONFIG_PANIC_ON_OOPS_VALUE=0 +CONFIG_PANIC_TIMEOUT=0 +CONFIG_SCHED_DEBUG=y +CONFIG_SCHED_INFO=y +CONFIG_SCHEDSTATS=y +CONFIG_SCHED_STACK_END_CHECK=y +# CONFIG_DEBUG_TIMEKEEPING is not set +# CONFIG_DEBUG_PREEMPT is not set + +# +# Lock Debugging (spinlocks, mutexes, etc...) +# +CONFIG_LOCK_DEBUGGING_SUPPORT=y +# CONFIG_PROVE_LOCKING is not set +# CONFIG_LOCK_STAT is not set +# CONFIG_DEBUG_RT_MUTEXES is not set +# CONFIG_DEBUG_SPINLOCK is not set +# CONFIG_DEBUG_MUTEXES is not set +# CONFIG_DEBUG_WW_MUTEX_SLOWPATH is not set +# CONFIG_DEBUG_RWSEMS is not set +# CONFIG_DEBUG_LOCK_ALLOC is not set +# CONFIG_DEBUG_ATOMIC_SLEEP is not set +# CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set +# CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set +CONFIG_STACKTRACE=y +# CONFIG_WARN_ALL_UNSEEDED_RANDOM is not set +# CONFIG_DEBUG_KOBJECT is not set +CONFIG_DEBUG_BUGVERBOSE=y +CONFIG_DEBUG_LIST=y +CONFIG_DEBUG_PI_LIST=y +CONFIG_DEBUG_SG=y +CONFIG_DEBUG_NOTIFIERS=y +CONFIG_DEBUG_CREDENTIALS=y + +# +# RCU Debugging +# +CONFIG_TORTURE_TEST=m +CONFIG_RCU_PERF_TEST=m +# CONFIG_RCU_TORTURE_TEST is not set +CONFIG_RCU_CPU_STALL_TIMEOUT=60 +# CONFIG_RCU_TRACE is not set +# CONFIG_RCU_EQS_DEBUG is not set +# CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set +# CONFIG_DEBUG_BLOCK_EXT_DEVT is not set +# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set +# CONFIG_NOTIFIER_ERROR_INJECTION is not set +CONFIG_FUNCTION_ERROR_INJECTION=y +# CONFIG_FAULT_INJECTION is not set +CONFIG_LATENCYTOP=y +CONFIG_USER_STACKTRACE_SUPPORT=y +CONFIG_NOP_TRACER=y +CONFIG_HAVE_FUNCTION_TRACER=y +CONFIG_HAVE_FUNCTION_GRAPH_TRACER=y +CONFIG_HAVE_DYNAMIC_FTRACE=y +CONFIG_HAVE_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_HAVE_FTRACE_MCOUNT_RECORD=y +CONFIG_HAVE_SYSCALL_TRACEPOINTS=y +CONFIG_HAVE_FENTRY=y +CONFIG_HAVE_C_RECORDMCOUNT=y +CONFIG_TRACE_CLOCK=y +CONFIG_RING_BUFFER=y +CONFIG_EVENT_TRACING=y +CONFIG_CONTEXT_SWITCH_TRACER=y +CONFIG_TRACING=y +CONFIG_GENERIC_TRACER=y +CONFIG_TRACING_SUPPORT=y +CONFIG_FTRACE=y +CONFIG_FUNCTION_TRACER=y +# CONFIG_FUNCTION_GRAPH_TRACER is not set +# CONFIG_PREEMPTIRQ_EVENTS is not set +# CONFIG_IRQSOFF_TRACER is not set +# CONFIG_PREEMPT_TRACER is not set +# CONFIG_SCHED_TRACER is not set +# CONFIG_HWLAT_TRACER is not set +CONFIG_FTRACE_SYSCALLS=y +# CONFIG_TRACER_SNAPSHOT is not set +CONFIG_BRANCH_PROFILE_NONE=y +# CONFIG_PROFILE_ANNOTATED_BRANCHES is not set +CONFIG_STACK_TRACER=y +CONFIG_BLK_DEV_IO_TRACE=y +CONFIG_KPROBE_EVENTS=y +# CONFIG_KPROBE_EVENTS_ON_NOTRACE is not set +# CONFIG_UPROBE_EVENTS is not set +CONFIG_BPF_EVENTS=y +CONFIG_PROBE_EVENTS=y +CONFIG_DYNAMIC_FTRACE=y +CONFIG_DYNAMIC_FTRACE_WITH_REGS=y +CONFIG_FUNCTION_PROFILER=y +# CONFIG_BPF_KPROBE_OVERRIDE is not set +CONFIG_FTRACE_MCOUNT_RECORD=y +# CONFIG_FTRACE_STARTUP_TEST is not set +# CONFIG_MMIOTRACE is not set +# CONFIG_HIST_TRIGGERS is not set +# CONFIG_TRACEPOINT_BENCHMARK is not set +# CONFIG_RING_BUFFER_BENCHMARK is not set +# CONFIG_RING_BUFFER_STARTUP_TEST is not set +# CONFIG_PREEMPTIRQ_DELAY_TEST is not set +# CONFIG_TRACE_EVAL_MAP_FILE is not set +CONFIG_TRACING_EVENTS_GPIO=y +# CONFIG_PROVIDE_OHCI1394_DMA_INIT is not set +# CONFIG_DMA_API_DEBUG is not set +CONFIG_RUNTIME_TESTING_MENU=y +CONFIG_LKDTM=m +# CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set +# CONFIG_KPROBES_SANITY_TEST is not set +# CONFIG_BACKTRACE_SELF_TEST is not set +# CONFIG_RBTREE_TEST is not set +# CONFIG_INTERVAL_TREE_TEST is not set +# CONFIG_PERCPU_TEST is not set +# CONFIG_ATOMIC64_SELFTEST is not set +# CONFIG_ASYNC_RAID6_TEST is not set +# CONFIG_TEST_HEXDUMP is not set +# CONFIG_TEST_STRING_HELPERS is not set +# CONFIG_TEST_KSTRTOX is not set +# CONFIG_TEST_PRINTF is not set +# CONFIG_TEST_BITMAP is not set +# CONFIG_TEST_BITFIELD is not set +# CONFIG_TEST_UUID is not set +# CONFIG_TEST_OVERFLOW is not set +# CONFIG_TEST_RHASHTABLE is not set +# CONFIG_TEST_HASH is not set +# CONFIG_TEST_IDA is not set +# CONFIG_TEST_PARMAN is not set +# CONFIG_TEST_LKM is not set +# CONFIG_TEST_USER_COPY is not set +# CONFIG_TEST_BPF is not set +# CONFIG_FIND_BIT_BENCHMARK is not set +# CONFIG_TEST_FIRMWARE is not set +# CONFIG_TEST_SYSCTL is not set +# CONFIG_TEST_UDELAY is not set +# CONFIG_TEST_STATIC_KEYS is not set +# CONFIG_TEST_KMOD is not set +CONFIG_MEMTEST=y +# CONFIG_BUG_ON_DATA_CORRUPTION is not set +# CONFIG_SAMPLES is not set +CONFIG_HAVE_ARCH_KGDB=y +# CONFIG_KGDB is not set +CONFIG_ARCH_HAS_UBSAN_SANITIZE_ALL=y +# CONFIG_UBSAN is not set +CONFIG_ARCH_HAS_DEVMEM_IS_ALLOWED=y +CONFIG_STRICT_DEVMEM=y +CONFIG_IO_STRICT_DEVMEM=y +CONFIG_TRACE_IRQFLAGS_SUPPORT=y +CONFIG_EARLY_PRINTK_USB=y +CONFIG_X86_VERBOSE_BOOTUP=y +CONFIG_EARLY_PRINTK=y +CONFIG_EARLY_PRINTK_DBGP=y +CONFIG_EARLY_PRINTK_EFI=y +CONFIG_EARLY_PRINTK_USB_XDBC=y +CONFIG_X86_PTDUMP_CORE=y +# CONFIG_X86_PTDUMP is not set +# CONFIG_EFI_PGT_DUMP is not set +CONFIG_DEBUG_WX=y +CONFIG_DOUBLEFAULT=y +# CONFIG_DEBUG_TLBFLUSH is not set +# CONFIG_IOMMU_DEBUG is not set +CONFIG_HAVE_MMIOTRACE_SUPPORT=y +# CONFIG_X86_DECODER_SELFTEST is not set +CONFIG_IO_DELAY_TYPE_0X80=0 +CONFIG_IO_DELAY_TYPE_0XED=1 +CONFIG_IO_DELAY_TYPE_UDELAY=2 +CONFIG_IO_DELAY_TYPE_NONE=3 +CONFIG_IO_DELAY_0X80=y +# CONFIG_IO_DELAY_0XED is not set +# CONFIG_IO_DELAY_UDELAY is not set +# CONFIG_IO_DELAY_NONE is not set +CONFIG_DEFAULT_IO_DELAY_TYPE=0 +# CONFIG_DEBUG_BOOT_PARAMS is not set +# CONFIG_CPA_DEBUG is not set +# CONFIG_OPTIMIZE_INLINING is not set +# CONFIG_DEBUG_ENTRY is not set +# CONFIG_DEBUG_NMI_SELFTEST is not set +CONFIG_X86_DEBUG_FPU=y +# CONFIG_PUNIT_ATOM_DEBUG is not set +CONFIG_UNWINDER_ORC=y +# CONFIG_UNWINDER_FRAME_POINTER is not set diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch new file mode 100644 index 00000000..a2127cff --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-revert-patches-causing-instant-reboot.patch @@ -0,0 +1,314 @@ +diff --git a/arch/x86/boot/compressed/head_64.S b/arch/x86/boot/compressed/head_64.S +index 8169e8b7a4dc..12915511be61 100644 +--- a/arch/x86/boot/compressed/head_64.S ++++ b/arch/x86/boot/compressed/head_64.S +@@ -305,48 +305,13 @@ ENTRY(startup_64) + /* Set up the stack */ + leaq boot_stack_end(%rbx), %rsp + +- /* +- * paging_prepare() and cleanup_trampoline() below can have GOT +- * references. Adjust the table with address we are running at. +- * +- * Zero RAX for adjust_got: the GOT was not adjusted before; +- * there's no adjustment to undo. +- */ +- xorq %rax, %rax +- +- /* +- * Calculate the address the binary is loaded at and use it as +- * a GOT adjustment. +- */ +- call 1f +-1: popq %rdi +- subq $1b, %rdi +- +- call adjust_got +- + /* + * At this point we are in long mode with 4-level paging enabled, +- * but we might want to enable 5-level paging or vice versa. +- * +- * The problem is that we cannot do it directly. Setting or clearing +- * CR4.LA57 in long mode would trigger #GP. So we need to switch off +- * long mode and paging first. +- * +- * We also need a trampoline in lower memory to switch over from +- * 4- to 5-level paging for cases when the bootloader puts the kernel +- * above 4G, but didn't enable 5-level paging for us. +- * +- * The same trampoline can be used to switch from 5- to 4-level paging +- * mode, like when starting 4-level paging kernel via kexec() when +- * original kernel worked in 5-level paging mode. +- * +- * For the trampoline, we need the top page table to reside in lower +- * memory as we don't have a way to load 64-bit values into CR3 in +- * 32-bit mode. ++ * but we want to enable 5-level paging. + * +- * We go though the trampoline even if we don't have to: if we're +- * already in a desired paging mode. This way the trampoline code gets +- * tested on every boot. ++ * The problem is that we cannot do it directly. Setting LA57 in ++ * long mode would trigger #GP. So we need to switch off long mode ++ * first. + */ + + /* Make sure we have GDT with 32-bit code segment */ +@@ -371,32 +336,40 @@ ENTRY(startup_64) + /* Save the trampoline address in RCX */ + movq %rax, %rcx + ++ /* Check if we need to enable 5-level paging */ ++ cmpq $0, %rdx ++ jz lvl5 ++ ++ /* Clear additional page table */ ++ leaq lvl5_pgtable(%rbx), %rdi ++ xorq %rax, %rax ++ movq $(PAGE_SIZE/8), %rcx ++ rep stosq ++ + /* +- * Load the address of trampoline_return() into RDI. +- * It will be used by the trampoline to return to the main code. ++ * Setup current CR3 as the first and only entry in a new top level ++ * page table. + */ +- leaq trampoline_return(%rip), %rdi ++ movq %cr3, %rdi ++ leaq 0x7 (%rdi), %rax ++ movq %rax, lvl5_pgtable(%rbx) + + /* Switch to compatibility mode (CS.L = 0 CS.D = 1) via far return */ + pushq $__KERNEL32_CS +- leaq TRAMPOLINE_32BIT_CODE_OFFSET(%rax), %rax ++ leaq compatible_mode(%rip), %rax + pushq %rax + lretq +-trampoline_return: ++lvl5: + /* Restore the stack, the 32-bit trampoline uses its own stack */ + leaq boot_stack_end(%rbx), %rsp + + /* + * cleanup_trampoline() would restore trampoline memory. + * +- * RDI is address of the page table to use instead of page table +- * in trampoline memory (if required). +- * + * RSI holds real mode data and needs to be preserved across + * this function call. + */ + pushq %rsi +- leaq top_pgtable(%rbx), %rdi + call cleanup_trampoline + popq %rsi + +@@ -404,21 +377,6 @@ trampoline_return: + pushq $0 + popfq + +- /* +- * Previously we've adjusted the GOT with address the binary was +- * loaded at. Now we need to re-adjust for relocation address. +- * +- * Calculate the address the binary is loaded at, so that we can +- * undo the previous GOT adjustment. +- */ +- call 1f +-1: popq %rax +- subq $1b, %rax +- +- /* The new adjustment is the relocation address */ +- movq %rbx, %rdi +- call adjust_got +- + /* + * Copy the compressed kernel to the end of our buffer + * where decompression in place becomes safe. +@@ -519,6 +477,19 @@ relocated: + shrq $3, %rcx + rep stosq + ++/* ++ * Adjust our own GOT ++ */ ++ leaq _got(%rip), %rdx ++ leaq _egot(%rip), %rcx ++1: ++ cmpq %rcx, %rdx ++ jae 2f ++ addq %rbx, (%rdx) ++ addq $8, %rdx ++ jmp 1b ++2: ++ + /* + * Do the extraction, and jump to the new kernel.. + */ +@@ -537,36 +508,9 @@ relocated: + */ + jmp *%rax + +-/* +- * Adjust the global offset table +- * +- * RAX is the previous adjustment of the table to undo (use 0 if it's the +- * first time we touch GOT). +- * RDI is the new adjustment to apply. +- */ +-adjust_got: +- /* Walk through the GOT adding the address to the entries */ +- leaq _got(%rip), %rdx +- leaq _egot(%rip), %rcx +-1: +- cmpq %rcx, %rdx +- jae 2f +- subq %rax, (%rdx) /* Undo previous adjustment */ +- addq %rdi, (%rdx) /* Apply the new adjustment */ +- addq $8, %rdx +- jmp 1b +-2: +- ret +- + .code32 +-/* +- * This is the 32-bit trampoline that will be copied over to low memory. +- * +- * RDI contains the return address (might be above 4G). +- * ECX contains the base address of the trampoline memory. +- * Non zero RDX on return means we need to enable 5-level paging. +- */ + ENTRY(trampoline_32bit_src) ++compatible_mode: + /* Set up data and stack segments */ + movl $__KERNEL_DS, %eax + movl %eax, %ds +@@ -580,61 +524,33 @@ ENTRY(trampoline_32bit_src) + btrl $X86_CR0_PG_BIT, %eax + movl %eax, %cr0 + +- /* Check what paging mode we want to be in after the trampoline */ +- cmpl $0, %edx +- jz 1f ++ /* Point CR3 to 5-level paging */ ++ leal lvl5_pgtable(%ebx), %eax ++ movl %eax, %cr3 + +- /* We want 5-level paging: don't touch CR3 if it already points to 5-level page tables */ ++ /* Enable PAE and LA57 mode */ + movl %cr4, %eax +- testl $X86_CR4_LA57, %eax +- jnz 3f +- jmp 2f +-1: +- /* We want 4-level paging: don't touch CR3 if it already points to 4-level page tables */ +- movl %cr4, %eax +- testl $X86_CR4_LA57, %eax +- jz 3f +-2: +- /* Point CR3 to the trampoline's new top level page table */ +- leal TRAMPOLINE_32BIT_PGTABLE_OFFSET(%ecx), %eax +- movl %eax, %cr3 +-3: +- /* Enable PAE and LA57 (if required) paging modes */ +- movl $X86_CR4_PAE, %eax +- cmpl $0, %edx +- jz 1f +- orl $X86_CR4_LA57, %eax +-1: ++ orl $(X86_CR4_PAE | X86_CR4_LA57), %eax + movl %eax, %cr4 + +- /* Calculate address of paging_enabled() once we are executing in the trampoline */ +- leal paging_enabled - trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_OFFSET(%ecx), %eax ++ /* Calculate address we are running at */ ++ call 1f ++1: popl %edi ++ subl $1b, %edi + +- /* Prepare the stack for far return to Long Mode */ ++ /* Prepare stack for far return to Long Mode */ + pushl $__KERNEL_CS +- pushl %eax ++ leal lvl5(%edi), %eax ++ push %eax + +- /* Enable paging again */ ++ /* Enable paging back */ + movl $(X86_CR0_PG | X86_CR0_PE), %eax + movl %eax, %cr0 + + lret + +- .code64 +-paging_enabled: +- /* Return from the trampoline */ +- jmp *%rdi +- +- /* +- * The trampoline code has a size limit. +- * Make sure we fail to compile if the trampoline code grows +- * beyond TRAMPOLINE_32BIT_CODE_SIZE bytes. +- */ +- .org trampoline_32bit_src + TRAMPOLINE_32BIT_CODE_SIZE +- +- .code32 + no_longmode: +- /* This isn't an x86-64 CPU, so hang intentionally, we cannot continue */ ++ /* This isn't an x86-64 CPU so hang */ + 1: + hlt + jmp 1b +@@ -695,10 +611,5 @@ boot_stack_end: + .balign 4096 + pgtable: + .fill BOOT_PGT_SIZE, 1, 0 +- +-/* +- * The page table is going to be used instead of page table in the trampoline +- * memory. +- */ +-top_pgtable: ++lvl5_pgtable: + .fill PAGE_SIZE, 1, 0 +diff --git a/arch/x86/boot/compressed/pgtable_64.c b/arch/x86/boot/compressed/pgtable_64.c +index a362fa0b849c..32af1cbcd903 100644 +--- a/arch/x86/boot/compressed/pgtable_64.c ++++ b/arch/x86/boot/compressed/pgtable_64.c +@@ -22,6 +22,14 @@ struct paging_config { + /* Buffer to preserve trampoline memory */ + static char trampoline_save[TRAMPOLINE_32BIT_SIZE]; + ++/* ++ * The page table is going to be used instead of page table in the trampoline ++ * memory. ++ * ++ * It must not be in BSS as BSS is cleared after cleanup_trampoline(). ++ */ ++static char top_pgtable[PAGE_SIZE] __aligned(PAGE_SIZE) __section(.data); ++ + /* + * Trampoline address will be printed by extract_kernel() for debugging + * purposes. +@@ -126,7 +134,7 @@ struct paging_config paging_prepare(void) + return paging_config; + } + +-void cleanup_trampoline(void *pgtable) ++void cleanup_trampoline(void) + { + void *trampoline_pgtable; + +@@ -137,8 +145,8 @@ void cleanup_trampoline(void *pgtable) + * if it's there. + */ + if ((void *)__native_read_cr3() == trampoline_pgtable) { +- memcpy(pgtable, trampoline_pgtable, PAGE_SIZE); +- native_write_cr3((unsigned long)pgtable); ++ memcpy(top_pgtable, trampoline_pgtable, PAGE_SIZE); ++ native_write_cr3((unsigned long)top_pgtable); + } + + /* Restore trampoline memory */ diff --git a/sys-kernel/linux-image-redcore-lts/files/4.19-uksm-linux-hardened.patch b/sys-kernel/linux-image-redcore-lts/files/4.19-uksm-linux-hardened.patch new file mode 100644 index 00000000..afb30db2 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/files/4.19-uksm-linux-hardened.patch @@ -0,0 +1,6941 @@ +diff -Nur a/Documentation/vm/00-INDEX b/Documentation/vm/00-INDEX +--- a/Documentation/vm/00-INDEX 2019-02-06 16:30:16.000000000 +0000 ++++ b/Documentation/vm/00-INDEX 2019-02-09 17:23:06.726863699 +0000 +@@ -18,6 +18,8 @@ + - explains what hwpoison is + ksm.rst + - how to use the Kernel Samepage Merging feature. ++uksm.txt ++ - Introduction to Ultra KSM + mmu_notifier.rst + - a note about clearing pte/pmd and mmu notifications + numa.rst +diff -Nur a/Documentation/vm/uksm.txt b/Documentation/vm/uksm.txt +--- a/Documentation/vm/uksm.txt 1970-01-01 01:00:00.000000000 +0100 ++++ b/Documentation/vm/uksm.txt 2019-02-09 17:23:06.726863699 +0000 +@@ -0,0 +1,61 @@ ++The Ultra Kernel Samepage Merging feature ++---------------------------------------------- ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. ++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++ChangeLog: ++ ++2012-05-05 The creation of this Doc ++2012-05-08 UKSM 0.1.1.1 libc crash bug fix, api clean up, doc clean up. ++2012-05-28 UKSM 0.1.1.2 bug fix release ++2012-06-26 UKSM 0.1.2-beta1 first beta release for 0.1.2 ++2012-07-2 UKSM 0.1.2-beta2 ++2012-07-10 UKSM 0.1.2-beta3 ++2012-07-26 UKSM 0.1.2 Fine grained speed control, more scan optimization. ++2012-10-13 UKSM 0.1.2.1 Bug fixes. ++2012-12-31 UKSM 0.1.2.2 Minor bug fixes. ++2014-07-02 UKSM 0.1.2.3 Fix a " __this_cpu_read() in preemptible bug". ++2015-04-22 UKSM 0.1.2.4 Fix a race condition that can sometimes trigger anonying warnings. ++2016-09-10 UKSM 0.1.2.5 Fix a bug in dedup ratio calculation. ++2017-02-26 UKSM 0.1.2.6 Fix a bug in hugetlbpage handling and a race bug with page migration. +diff -Nur a/fs/exec.c b/fs/exec.c +--- a/fs/exec.c 2019-02-09 17:20:30.471820869 +0000 ++++ b/fs/exec.c 2019-02-09 17:26:11.522863979 +0000 +@@ -63,6 +63,7 @@ + #include <linux/compat.h> + #include <linux/vmalloc.h> + #include <linux/random.h> ++#include <linux/ksm.h> + + #include <linux/uaccess.h> + #include <asm/mmu_context.h> +@@ -1382,6 +1383,7 @@ + /* An exec changes our domain. We are no longer part of the thread + group */ + current->self_exec_id++; ++ + flush_signal_handlers(current, 0); + } + EXPORT_SYMBOL(setup_new_exec); +diff -Nur a/fs/proc/meminfo.c b/fs/proc/meminfo.c +--- a/fs/proc/meminfo.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/fs/proc/meminfo.c 2019-02-09 17:23:06.726863699 +0000 +@@ -106,6 +106,10 @@ + global_zone_page_state(NR_KERNEL_STACK_KB)); + show_val_kb(m, "PageTables: ", + global_zone_page_state(NR_PAGETABLE)); ++#ifdef CONFIG_UKSM ++ show_val_kb(m, "KsmZeroPages: ", ++ global_zone_page_state(NR_UKSM_ZERO_PAGES)); ++#endif + #ifdef CONFIG_QUICKLIST + show_val_kb(m, "Quicklists: ", quicklist_total_size()); + #endif +diff -Nur a/include/asm-generic/pgtable.h b/include/asm-generic/pgtable.h +--- a/include/asm-generic/pgtable.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/asm-generic/pgtable.h 2019-02-09 17:23:06.726863699 +0000 +@@ -817,12 +817,25 @@ + extern void untrack_pfn_moved(struct vm_area_struct *vma); + #endif + ++#ifdef CONFIG_UKSM ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ extern unsigned long uksm_zero_pfn; ++ return pfn == uksm_zero_pfn; ++} ++#else ++static inline int is_uksm_zero_pfn(unsigned long pfn) ++{ ++ return 0; ++} ++#endif ++ + #ifdef __HAVE_COLOR_ZERO_PAGE + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; + unsigned long offset_from_zero_pfn = pfn - zero_pfn; +- return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT); ++ return offset_from_zero_pfn <= (zero_page_mask >> PAGE_SHIFT) || is_uksm_zero_pfn(pfn); + } + + #define my_zero_pfn(addr) page_to_pfn(ZERO_PAGE(addr)) +@@ -831,7 +844,7 @@ + static inline int is_zero_pfn(unsigned long pfn) + { + extern unsigned long zero_pfn; +- return pfn == zero_pfn; ++ return (pfn == zero_pfn) || (is_uksm_zero_pfn(pfn)); + } + + static inline unsigned long my_zero_pfn(unsigned long addr) +diff -Nur a/include/linux/ksm.h b/include/linux/ksm.h +--- a/include/linux/ksm.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/ksm.h 2019-02-09 17:23:06.726863699 +0000 +@@ -1,4 +1,4 @@ +-/* SPDX-License-Identifier: GPL-2.0 */ ++/* SPDX-License-Identifier: GPL-3.0 */ + #ifndef __LINUX_KSM_H + #define __LINUX_KSM_H + /* +@@ -21,20 +21,16 @@ + #ifdef CONFIG_KSM + int ksm_madvise(struct vm_area_struct *vma, unsigned long start, + unsigned long end, int advice, unsigned long *vm_flags); +-int __ksm_enter(struct mm_struct *mm); +-void __ksm_exit(struct mm_struct *mm); + +-static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++static inline struct stable_node *page_stable_node(struct page *page) + { +- if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) +- return __ksm_enter(mm); +- return 0; ++ return PageKsm(page) ? page_rmapping(page) : NULL; + } + +-static inline void ksm_exit(struct mm_struct *mm) ++static inline void set_page_stable_node(struct page *page, ++ struct stable_node *stable_node) + { +- if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) +- __ksm_exit(mm); ++ page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); + } + + /* +@@ -54,6 +50,33 @@ + void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc); + void ksm_migrate_page(struct page *newpage, struct page *oldpage); + ++#ifdef CONFIG_KSM_LEGACY ++int __ksm_enter(struct mm_struct *mm); ++void __ksm_exit(struct mm_struct *mm); ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &oldmm->flags)) ++ return __ksm_enter(mm); ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++ if (test_bit(MMF_VM_MERGEABLE, &mm->flags)) ++ __ksm_exit(mm); ++} ++ ++#elif defined(CONFIG_UKSM) ++static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) ++{ ++ return 0; ++} ++ ++static inline void ksm_exit(struct mm_struct *mm) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++ + #else /* !CONFIG_KSM */ + + static inline int ksm_fork(struct mm_struct *mm, struct mm_struct *oldmm) +@@ -89,4 +112,6 @@ + #endif /* CONFIG_MMU */ + #endif /* !CONFIG_KSM */ + ++#include <linux/uksm.h> ++ + #endif /* __LINUX_KSM_H */ +diff -Nur a/include/linux/mm_types.h b/include/linux/mm_types.h +--- a/include/linux/mm_types.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/mm_types.h 2019-02-09 17:23:06.726863699 +0000 +@@ -323,6 +323,9 @@ + struct mempolicy *vm_policy; /* NUMA policy for the VMA */ + #endif + struct vm_userfaultfd_ctx vm_userfaultfd_ctx; ++#ifdef CONFIG_UKSM ++ struct vma_slot *uksm_vma_slot; ++#endif + } __randomize_layout; + + struct core_thread { +diff -Nur a/include/linux/mmzone.h b/include/linux/mmzone.h +--- a/include/linux/mmzone.h 2019-02-06 16:30:16.000000000 +0000 ++++ b/include/linux/mmzone.h 2019-02-09 17:23:06.726863699 +0000 +@@ -148,6 +148,9 @@ + NR_ZSPAGES, /* allocated in zsmalloc */ + #endif + NR_FREE_CMA_PAGES, ++#ifdef CONFIG_UKSM ++ NR_UKSM_ZERO_PAGES, ++#endif + NR_VM_ZONE_STAT_ITEMS }; + + enum node_stat_item { +@@ -867,7 +870,7 @@ + } + + /** +- * is_highmem - helper function to quickly check if a struct zone is a ++ * is_highmem - helper function to quickly check if a struct zone is a + * highmem zone or not. This is an attempt to keep references + * to ZONE_{DMA/NORMAL/HIGHMEM/etc} in general code to a minimum. + * @zone - pointer to struct zone variable +diff -Nur a/include/linux/sradix-tree.h b/include/linux/sradix-tree.h +--- a/include/linux/sradix-tree.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/sradix-tree.h 2019-02-09 17:23:06.726863699 +0000 +@@ -0,0 +1,77 @@ ++#ifndef _LINUX_SRADIX_TREE_H ++#define _LINUX_SRADIX_TREE_H ++ ++ ++#define INIT_SRADIX_TREE(root, mask) \ ++do { \ ++ (root)->height = 0; \ ++ (root)->gfp_mask = (mask); \ ++ (root)->rnode = NULL; \ ++} while (0) ++ ++#define ULONG_BITS (sizeof(unsigned long) * 8) ++#define SRADIX_TREE_INDEX_BITS (8 /* CHAR_BIT */ * sizeof(unsigned long)) ++//#define SRADIX_TREE_MAP_SHIFT 6 ++//#define SRADIX_TREE_MAP_SIZE (1UL << SRADIX_TREE_MAP_SHIFT) ++//#define SRADIX_TREE_MAP_MASK (SRADIX_TREE_MAP_SIZE-1) ++ ++struct sradix_tree_node { ++ unsigned int height; /* Height from the bottom */ ++ unsigned int count; ++ unsigned int fulls; /* Number of full sublevel trees */ ++ struct sradix_tree_node *parent; ++ void *stores[0]; ++}; ++ ++/* A simple radix tree implementation */ ++struct sradix_tree_root { ++ unsigned int height; ++ struct sradix_tree_node *rnode; ++ ++ /* Where found to have available empty stores in its sublevels */ ++ struct sradix_tree_node *enter_node; ++ unsigned int shift; ++ unsigned int stores_size; ++ unsigned int mask; ++ unsigned long min; /* The first hole index */ ++ unsigned long num; ++ //unsigned long *height_to_maxindex; ++ ++ /* How the node is allocated and freed. */ ++ struct sradix_tree_node *(*alloc)(void); ++ void (*free)(struct sradix_tree_node *node); ++ ++ /* When a new node is added and removed */ ++ void (*extend)(struct sradix_tree_node *parent, struct sradix_tree_node *child); ++ void (*assign)(struct sradix_tree_node *node, unsigned int index, void *item); ++ void (*rm)(struct sradix_tree_node *node, unsigned int offset); ++}; ++ ++struct sradix_tree_path { ++ struct sradix_tree_node *node; ++ int offset; ++}; ++ ++static inline ++void init_sradix_tree_root(struct sradix_tree_root *root, unsigned long shift) ++{ ++ root->height = 0; ++ root->rnode = NULL; ++ root->shift = shift; ++ root->stores_size = 1UL << shift; ++ root->mask = root->stores_size - 1; ++} ++ ++ ++extern void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *, unsigned long)); ++ ++extern int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num); ++ ++extern void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index); ++ ++extern void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index); ++ ++#endif /* _LINUX_SRADIX_TREE_H */ +diff -Nur a/include/linux/uksm.h b/include/linux/uksm.h +--- a/include/linux/uksm.h 1970-01-01 01:00:00.000000000 +0100 ++++ b/include/linux/uksm.h 2019-02-09 17:23:06.726863699 +0000 +@@ -0,0 +1,149 @@ ++#ifndef __LINUX_UKSM_H ++#define __LINUX_UKSM_H ++/* ++ * Memory merging support. ++ * ++ * This code enables dynamic sharing of identical pages found in different ++ * memory areas, even if they are not shared by fork(). ++ */ ++ ++/* if !CONFIG_UKSM this file should not be compiled at all. */ ++#ifdef CONFIG_UKSM ++ ++#include <linux/bitops.h> ++#include <linux/mm.h> ++#include <linux/pagemap.h> ++#include <linux/rmap.h> ++#include <linux/sched.h> ++ ++extern unsigned long zero_pfn __read_mostly; ++extern unsigned long uksm_zero_pfn __read_mostly; ++extern struct page *empty_uksm_zero_page; ++ ++/* must be done before linked to mm */ ++extern void uksm_vma_add_new(struct vm_area_struct *vma); ++extern void uksm_remove_vma(struct vm_area_struct *vma); ++ ++#define UKSM_SLOT_NEED_SORT (1 << 0) ++#define UKSM_SLOT_NEED_RERAND (1 << 1) ++#define UKSM_SLOT_SCANNED (1 << 2) /* It's scanned in this round */ ++#define UKSM_SLOT_FUL_SCANNED (1 << 3) ++#define UKSM_SLOT_IN_UKSM (1 << 4) ++ ++struct vma_slot { ++ struct sradix_tree_node *snode; ++ unsigned long sindex; ++ ++ struct list_head slot_list; ++ unsigned long fully_scanned_round; ++ unsigned long dedup_num; ++ unsigned long pages_scanned; ++ unsigned long this_sampled; ++ unsigned long last_scanned; ++ unsigned long pages_to_scan; ++ struct scan_rung *rung; ++ struct page **rmap_list_pool; ++ unsigned int *pool_counts; ++ unsigned long pool_size; ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ unsigned long ctime_j; ++ unsigned long pages; ++ unsigned long flags; ++ unsigned long pages_cowed; /* pages cowed this round */ ++ unsigned long pages_merged; /* pages merged this round */ ++ unsigned long pages_bemerged; ++ ++ /* when it has page merged in this eval round */ ++ struct list_head dedup_list; ++}; ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __dec_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++ if (pte_pfn(pte) == uksm_zero_pfn) ++ __inc_zone_page_state(empty_uksm_zero_page, NR_UKSM_ZERO_PAGES); ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++ if (vma->uksm_vma_slot && PageKsm(page)) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++ if (vma->uksm_vma_slot && pte_pfn(pte) == uksm_zero_pfn) ++ vma->uksm_vma_slot->pages_cowed++; ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++#ifdef VM_SAO ++ if (vm_flags & VM_SAO) ++ return 0; ++#endif ++ ++ return !(vm_flags & (VM_PFNMAP | VM_IO | VM_DONTEXPAND | ++ VM_HUGETLB | VM_MIXEDMAP | VM_SHARED ++ | VM_MAYSHARE | VM_GROWSUP | VM_GROWSDOWN)); ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++ if (uksm_flags_can_scan(*vm_flags_p)) ++ *vm_flags_p |= VM_MERGEABLE; ++} ++ ++/* ++ * Just a wrapper for BUG_ON for where ksm_zeropage must not be. TODO: it will ++ * be removed when uksm zero page patch is stable enough. ++ */ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++ BUG_ON(pte_pfn(pte) == uksm_zero_pfn); ++} ++#else ++static inline void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++} ++ ++static inline void uksm_unmap_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_map_zero_page(pte_t pte) ++{ ++} ++ ++static inline void uksm_cow_page(struct vm_area_struct *vma, struct page *page) ++{ ++} ++ ++static inline void uksm_cow_pte(struct vm_area_struct *vma, pte_t pte) ++{ ++} ++ ++static inline int uksm_flags_can_scan(unsigned long vm_flags) ++{ ++ return 0; ++} ++ ++static inline void uksm_vm_flags_mod(unsigned long *vm_flags_p) ++{ ++} ++ ++static inline void uksm_bugon_zeropage(pte_t pte) ++{ ++} ++#endif /* !CONFIG_UKSM */ ++#endif /* __LINUX_UKSM_H */ +diff -Nur a/kernel/fork.c b/kernel/fork.c +--- a/kernel/fork.c 2019-02-09 17:20:30.481821193 +0000 ++++ b/kernel/fork.c 2019-02-09 17:23:06.736864024 +0000 +@@ -543,7 +543,7 @@ + __vma_link_rb(mm, tmp, rb_link, rb_parent); + rb_link = &tmp->vm_rb.rb_right; + rb_parent = &tmp->vm_rb; +- ++ uksm_vma_add_new(tmp); + mm->map_count++; + if (!(tmp->vm_flags & VM_WIPEONFORK)) + retval = copy_page_range(mm, oldmm, mpnt); +diff -Nur a/lib/Makefile b/lib/Makefile +--- a/lib/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/lib/Makefile 2019-02-09 17:23:06.736864024 +0000 +@@ -18,7 +18,7 @@ + KCOV_INSTRUMENT_dynamic_debug.o := n + + lib-y := ctype.o string.o vsprintf.o cmdline.o \ +- rbtree.o radix-tree.o timerqueue.o\ ++ rbtree.o radix-tree.o sradix-tree.o timerqueue.o\ + idr.o int_sqrt.o extable.o \ + sha1.o chacha20.o irq_regs.o argv_split.o \ + flex_proportions.o ratelimit.o show_mem.o \ +diff -Nur a/lib/sradix-tree.c b/lib/sradix-tree.c +--- a/lib/sradix-tree.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/lib/sradix-tree.c 2019-02-09 17:23:06.736864024 +0000 +@@ -0,0 +1,476 @@ ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/mman.h> ++#include <linux/spinlock.h> ++#include <linux/slab.h> ++#include <linux/gcd.h> ++#include <linux/sradix-tree.h> ++ ++static inline int sradix_node_full(struct sradix_tree_root *root, struct sradix_tree_node *node) ++{ ++ return node->fulls == root->stores_size || ++ (node->height == 1 && node->count == root->stores_size); ++} ++ ++/* ++ * Extend a sradix tree so it can store key @index. ++ */ ++static int sradix_tree_extend(struct sradix_tree_root *root, unsigned long index) ++{ ++ struct sradix_tree_node *node; ++ unsigned int height; ++ ++ if (unlikely(root->rnode == NULL)) { ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ node->height = 1; ++ root->rnode = node; ++ root->height = 1; ++ } ++ ++ /* Figure out what the height should be. */ ++ height = root->height; ++ index >>= root->shift * height; ++ ++ while (index) { ++ index >>= root->shift; ++ height++; ++ } ++ ++ while (height > root->height) { ++ unsigned int newheight; ++ ++ if (!(node = root->alloc())) ++ return -ENOMEM; ++ ++ /* Increase the height. */ ++ node->stores[0] = root->rnode; ++ root->rnode->parent = node; ++ if (root->extend) ++ root->extend(node, root->rnode); ++ ++ newheight = root->height + 1; ++ node->height = newheight; ++ node->count = 1; ++ if (sradix_node_full(root, root->rnode)) ++ node->fulls = 1; ++ ++ root->rnode = node; ++ root->height = newheight; ++ } ++ ++ return 0; ++} ++ ++/* ++ * Search the next item from the current node, that is not NULL ++ * and can satify root->iter(). ++ */ ++void *sradix_tree_next(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index, ++ int (*iter)(void *item, unsigned long height)) ++{ ++ unsigned long offset; ++ void *item; ++ ++ if (unlikely(node == NULL)) { ++ node = root->rnode; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ ++ if (node->height == 1) ++ return item; ++ else ++ goto go_down; ++ } ++ ++ while (node) { ++ offset = (index & root->mask) + 1; ++ for (; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (offset < root->stores_size) ++ break; ++ ++ node = node->parent; ++ index >>= root->shift; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ while (node->height > 1) { ++go_down: ++ node = item; ++ for (offset = 0; offset < root->stores_size; offset++) { ++ item = node->stores[offset]; ++ if (item && (!iter || iter(item, node->height))) ++ break; ++ } ++ ++ if (unlikely(offset >= root->stores_size)) ++ return NULL; ++ } ++ ++ BUG_ON(offset > root->stores_size); ++ ++ return item; ++} ++ ++/* ++ * Blindly insert the item to the tree. Typically, we reuse the ++ * first empty store item. ++ */ ++int sradix_tree_enter(struct sradix_tree_root *root, void **item, int num) ++{ ++ unsigned long index; ++ unsigned int height; ++ struct sradix_tree_node *node, *tmp = NULL; ++ int offset, offset_saved; ++ void **store = NULL; ++ int error, i, j, shift; ++ ++go_on: ++ index = root->min; ++ ++ if (root->enter_node && !sradix_node_full(root, root->enter_node)) { ++ node = root->enter_node; ++ BUG_ON((index >> (root->shift * root->height))); ++ } else { ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height)) ++ || sradix_node_full(root, node)) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return error; ++ ++ node = root->rnode; ++ } ++ } ++ ++ ++ height = node->height; ++ shift = (height - 1) * root->shift; ++ offset = (index >> shift) & root->mask; ++ while (shift > 0) { ++ offset_saved = offset; ++ for (; offset < root->stores_size; offset++) { ++ store = &node->stores[offset]; ++ tmp = *store; ++ ++ if (!tmp || !sradix_node_full(root, tmp)) ++ break; ++ } ++ BUG_ON(offset >= root->stores_size); ++ ++ if (offset != offset_saved) { ++ index += (offset - offset_saved) << shift; ++ index &= ~((1UL << shift) - 1); ++ } ++ ++ if (!tmp) { ++ if (!(tmp = root->alloc())) ++ return -ENOMEM; ++ ++ tmp->height = shift / root->shift; ++ *store = tmp; ++ tmp->parent = node; ++ node->count++; ++// if (root->extend) ++// root->extend(node, tmp); ++ } ++ ++ node = tmp; ++ shift -= root->shift; ++ offset = (index >> shift) & root->mask; ++ } ++ ++ BUG_ON(node->height != 1); ++ ++ ++ store = &node->stores[offset]; ++ for (i = 0, j = 0; ++ j < root->stores_size - node->count && ++ i < root->stores_size - offset && j < num; i++) { ++ if (!store[i]) { ++ store[i] = item[j]; ++ if (root->assign) ++ root->assign(node, index + i, item[j]); ++ j++; ++ } ++ } ++ ++ node->count += j; ++ root->num += j; ++ num -= j; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ root->enter_node = NULL; ++ } else { ++ root->min = index + i - 1; ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ ++ if (num) { ++ item += j; ++ goto go_on; ++ } ++ ++ return 0; ++} ++ ++ ++/** ++ * sradix_tree_shrink - shrink height of a sradix tree to minimal ++ * @root sradix tree root ++ * ++ */ ++static inline void sradix_tree_shrink(struct sradix_tree_root *root) ++{ ++ /* try to shrink tree height */ ++ while (root->height > 1) { ++ struct sradix_tree_node *to_free = root->rnode; ++ ++ /* ++ * The candidate node has more than one child, or its child ++ * is not at the leftmost store, we cannot shrink. ++ */ ++ if (to_free->count != 1 || !to_free->stores[0]) ++ break; ++ ++ root->rnode = to_free->stores[0]; ++ root->rnode->parent = NULL; ++ root->height--; ++ if (unlikely(root->enter_node == to_free)) ++ root->enter_node = NULL; ++ root->free(to_free); ++ } ++} ++ ++/* ++ * Del the item on the known leaf node and index ++ */ ++void sradix_tree_delete_from_leaf(struct sradix_tree_root *root, ++ struct sradix_tree_node *node, unsigned long index) ++{ ++ unsigned int offset; ++ struct sradix_tree_node *start, *end; ++ ++ BUG_ON(node->height != 1); ++ ++ start = node; ++ while (node && !(--node->count)) ++ node = node->parent; ++ ++ end = node; ++ if (!node) { ++ root->rnode = NULL; ++ root->height = 0; ++ root->min = 0; ++ root->num = 0; ++ root->enter_node = NULL; ++ } else { ++ offset = (index >> (root->shift * (node->height - 1))) & root->mask; ++ if (root->rm) ++ root->rm(node, offset); ++ node->stores[offset] = NULL; ++ root->num--; ++ if (root->min > index) { ++ root->min = index; ++ root->enter_node = node; ++ } ++ } ++ ++ if (start != end) { ++ do { ++ node = start; ++ start = start->parent; ++ if (unlikely(root->enter_node == node)) ++ root->enter_node = end; ++ root->free(node); ++ } while (start != end); ++ ++ /* ++ * Note that shrink may free "end", so enter_node still need to ++ * be checked inside. ++ */ ++ sradix_tree_shrink(root); ++ } else if (node->count == root->stores_size - 1) { ++ /* It WAS a full leaf node. Update the ancestors */ ++ node = node->parent; ++ while (node) { ++ node->fulls--; ++ if (node->fulls != root->stores_size - 1) ++ break; ++ ++ node = node->parent; ++ } ++ } ++} ++ ++void *sradix_tree_lookup(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return NULL; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return NULL; ++ ++ shift -= root->shift; ++ } while (shift >= 0); ++ ++ return node; ++} ++ ++/* ++ * Return the item if it exists, otherwise create it in place ++ * and return the created item. ++ */ ++void *sradix_tree_lookup_create(struct sradix_tree_root *root, ++ unsigned long index, void *(*item_alloc)(void)) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node, *tmp; ++ void *item; ++ int shift, error; ++ ++ if (root->rnode == NULL || (index >> (root->shift * root->height))) { ++ if (item_alloc) { ++ error = sradix_tree_extend(root, index); ++ if (error) ++ return NULL; ++ } else { ++ return NULL; ++ } ++ } ++ ++ node = root->rnode; ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ if (!node->stores[offset]) { ++ if (!(tmp = root->alloc())) ++ return NULL; ++ ++ tmp->height = shift / root->shift; ++ node->stores[offset] = tmp; ++ tmp->parent = node; ++ node->count++; ++ node = tmp; ++ } else { ++ node = node->stores[offset]; ++ } ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ BUG_ON(node->height != 1); ++ offset = index & root->mask; ++ if (node->stores[offset]) { ++ return node->stores[offset]; ++ } else if (item_alloc) { ++ if (!(item = item_alloc())) ++ return NULL; ++ ++ node->stores[offset] = item; ++ ++ /* ++ * NOTE: we do NOT call root->assign here, since this item is ++ * newly created by us having no meaning. Caller can call this ++ * if it's necessary to do so. ++ */ ++ ++ node->count++; ++ root->num++; ++ ++ while (sradix_node_full(root, node)) { ++ node = node->parent; ++ if (!node) ++ break; ++ ++ node->fulls++; ++ } ++ ++ if (unlikely(!node)) { ++ /* All nodes are full */ ++ root->min = 1 << (root->height * root->shift); ++ } else { ++ if (root->min == index) { ++ root->min |= (1UL << (node->height - 1)) - 1; ++ root->min++; ++ root->enter_node = node; ++ } ++ } ++ ++ return item; ++ } else { ++ return NULL; ++ } ++ ++} ++ ++int sradix_tree_delete(struct sradix_tree_root *root, unsigned long index) ++{ ++ unsigned int height, offset; ++ struct sradix_tree_node *node; ++ int shift; ++ ++ node = root->rnode; ++ if (node == NULL || (index >> (root->shift * root->height))) ++ return -ENOENT; ++ ++ height = root->height; ++ shift = (height - 1) * root->shift; ++ ++ do { ++ offset = (index >> shift) & root->mask; ++ node = node->stores[offset]; ++ if (!node) ++ return -ENOENT; ++ ++ shift -= root->shift; ++ } while (shift > 0); ++ ++ offset = index & root->mask; ++ if (!node->stores[offset]) ++ return -ENOENT; ++ ++ sradix_tree_delete_from_leaf(root, node, index); ++ ++ return 0; ++} +diff -Nur a/mm/Kconfig b/mm/Kconfig +--- a/mm/Kconfig 2019-02-09 17:20:30.491821512 +0000 ++++ b/mm/Kconfig 2019-02-09 17:23:06.736864024 +0000 +@@ -307,6 +307,32 @@ + See Documentation/vm/ksm.rst for more information: KSM is inactive + until a program has madvised that an area is MADV_MERGEABLE, and + root has set /sys/kernel/mm/ksm/run to 1 (if CONFIG_SYSFS is set). ++choice ++ prompt "Choose UKSM/KSM strategy" ++ default UKSM ++ depends on KSM ++ help ++ This option allows to select a UKSM/KSM stragety. ++ ++config UKSM ++ bool "Ultra-KSM for page merging" ++ depends on KSM ++ help ++ UKSM is inspired by the Linux kernel project \u2014 KSM(Kernel Same ++ page Merging), but with a fundamentally rewritten core algorithm. With ++ an advanced algorithm, UKSM now can transparently scans all anonymously ++ mapped user space applications with an significantly improved scan speed ++ and CPU efficiency. Since KVM is friendly to KSM, KVM can also benefit from ++ UKSM. Now UKSM has its first stable release and first real world enterprise user. ++ For more information, please goto its project page. ++ (www.kerneldedup.org) ++ ++config KSM_LEGACY ++ bool "Legacy KSM implementation" ++ depends on KSM ++ help ++ The legacy KSM implementation from Red Hat. ++endchoice + + config DEFAULT_MMAP_MIN_ADDR + int "Low address space to protect from user allocation" +diff -Nur a/mm/ksm.c b/mm/ksm.c +--- a/mm/ksm.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/ksm.c 2019-02-09 17:23:06.736864024 +0000 +@@ -842,17 +842,6 @@ + return err; + } + +-static inline struct stable_node *page_stable_node(struct page *page) +-{ +- return PageKsm(page) ? page_rmapping(page) : NULL; +-} +- +-static inline void set_page_stable_node(struct page *page, +- struct stable_node *stable_node) +-{ +- page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +-} +- + #ifdef CONFIG_SYSFS + /* + * Only called through the sysfs control interface: +diff -Nur a/mm/Makefile b/mm/Makefile +--- a/mm/Makefile 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/Makefile 2019-02-09 17:23:06.736864024 +0000 +@@ -64,7 +64,8 @@ + obj-$(CONFIG_SPARSEMEM_VMEMMAP) += sparse-vmemmap.o + obj-$(CONFIG_SLOB) += slob.o + obj-$(CONFIG_MMU_NOTIFIER) += mmu_notifier.o +-obj-$(CONFIG_KSM) += ksm.o ++obj-$(CONFIG_KSM_LEGACY) += ksm.o ++obj-$(CONFIG_UKSM) += uksm.o + obj-$(CONFIG_PAGE_POISONING) += page_poison.o + obj-$(CONFIG_SLAB) += slab.o + obj-$(CONFIG_SLUB) += slub.o +diff -Nur a/mm/memory.c b/mm/memory.c +--- a/mm/memory.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/memory.c 2019-02-09 17:23:06.736864024 +0000 +@@ -128,6 +128,25 @@ + + unsigned long highest_memmap_pfn __read_mostly; + ++#ifdef CONFIG_UKSM ++unsigned long uksm_zero_pfn __read_mostly; ++EXPORT_SYMBOL_GPL(uksm_zero_pfn); ++struct page *empty_uksm_zero_page; ++ ++static int __init setup_uksm_zero_page(void) ++{ ++ empty_uksm_zero_page = alloc_pages(__GFP_ZERO & ~__GFP_MOVABLE, 0); ++ if (!empty_uksm_zero_page) ++ panic("Oh boy, that early out of memory?"); ++ ++ SetPageReserved(empty_uksm_zero_page); ++ uksm_zero_pfn = page_to_pfn(empty_uksm_zero_page); ++ ++ return 0; ++} ++core_initcall(setup_uksm_zero_page); ++#endif ++ + /* + * CONFIG_MMU architectures set up ZERO_PAGE in their paging_init() + */ +@@ -139,6 +158,7 @@ + core_initcall(init_zero_pfn); + + ++ + #if defined(SPLIT_RSS_COUNTING) + + void sync_mm_rss(struct mm_struct *mm) +@@ -1040,6 +1060,9 @@ + get_page(page); + page_dup_rmap(page, false); + rss[mm_counter(page)]++; ++ ++ /* Should return NULL in vm_normal_page() */ ++ uksm_bugon_zeropage(pte); + } else if (pte_devmap(pte)) { + page = pte_page(pte); + +@@ -1053,6 +1076,8 @@ + page_dup_rmap(page, false); + rss[mm_counter(page)]++; + } ++ } else { ++ uksm_map_zero_page(pte); + } + + out_set_pte: +@@ -1322,8 +1347,10 @@ + ptent = ptep_get_and_clear_full(mm, addr, pte, + tlb->fullmm); + tlb_remove_tlb_entry(tlb, pte, addr); +- if (unlikely(!page)) ++ if (unlikely(!page)) { ++ uksm_unmap_zero_page(ptent); + continue; ++ } + + if (!PageAnon(page)) { + if (pte_dirty(ptent)) { +@@ -2353,8 +2380,10 @@ + clear_page(kaddr); + kunmap_atomic(kaddr); + flush_dcache_page(dst); +- } else ++ } else { + copy_user_highpage(dst, src, va, vma); ++ uksm_cow_page(vma, src); ++ } + } + + static gfp_t __get_fault_gfp_mask(struct vm_area_struct *vma) +@@ -2503,6 +2532,7 @@ + vmf->address); + if (!new_page) + goto oom; ++ uksm_cow_pte(vma, vmf->orig_pte); + } else { + new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, + vmf->address); +@@ -2529,7 +2559,9 @@ + mm_counter_file(old_page)); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } ++ uksm_bugon_zeropage(vmf->orig_pte); + } else { ++ uksm_unmap_zero_page(vmf->orig_pte); + inc_mm_counter_fast(mm, MM_ANONPAGES); + } + flush_cache_page(vma, vmf->address, pte_pfn(vmf->orig_pte)); +diff -Nur a/mm/mmap.c b/mm/mmap.c +--- a/mm/mmap.c 2019-02-09 17:20:30.491821512 +0000 ++++ b/mm/mmap.c 2019-02-09 17:23:06.736864024 +0000 +@@ -45,6 +45,7 @@ + #include <linux/moduleparam.h> + #include <linux/pkeys.h> + #include <linux/oom.h> ++#include <linux/ksm.h> + + #include <linux/uaccess.h> + #include <asm/cacheflush.h> +@@ -182,6 +183,7 @@ + if (vma->vm_file) + fput(vma->vm_file); + mpol_put(vma_policy(vma)); ++ uksm_remove_vma(vma); + vm_area_free(vma); + return next; + } +@@ -708,9 +710,16 @@ + long adjust_next = 0; + int remove_next = 0; + ++/* ++ * to avoid deadlock, ksm_remove_vma must be done before any spin_lock is ++ * acquired ++ */ ++ uksm_remove_vma(vma); ++ + if (next && !insert) { + struct vm_area_struct *exporter = NULL, *importer = NULL; + ++ uksm_remove_vma(next); + if (end >= next->vm_end) { + /* + * vma expands, overlapping all the next, and +@@ -843,6 +852,7 @@ + end_changed = true; + } + vma->vm_pgoff = pgoff; ++ + if (adjust_next) { + next->vm_start += adjust_next << PAGE_SHIFT; + next->vm_pgoff += adjust_next; +@@ -948,6 +958,7 @@ + if (remove_next == 2) { + remove_next = 1; + end = next->vm_end; ++ uksm_remove_vma(next); + goto again; + } + else if (next) +@@ -974,10 +985,14 @@ + */ + VM_WARN_ON(mm->highest_vm_end != vm_end_gap(vma)); + } ++ } else { ++ if (next && !insert) ++ uksm_vma_add_new(next); + } + if (insert && file) + uprobe_mmap(insert); + ++ uksm_vma_add_new(vma); + validate_mm(mm); + + return 0; +@@ -1434,6 +1449,9 @@ + vm_flags |= calc_vm_prot_bits(prot, pkey) | calc_vm_flag_bits(flags) | + mm->def_flags | VM_MAYREAD | VM_MAYWRITE | VM_MAYEXEC; + ++ /* If uksm is enabled, we add VM_MERGEABLE to new VMAs. */ ++ uksm_vm_flags_mod(&vm_flags); ++ + if (flags & MAP_LOCKED) + if (!can_do_mlock()) + return -EPERM; +@@ -1798,6 +1816,7 @@ + allow_write_access(file); + } + file = vma->vm_file; ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + +@@ -1840,6 +1859,7 @@ + if (vm_flags & VM_DENYWRITE) + allow_write_access(file); + free_vma: ++ uksm_remove_vma(vma); + vm_area_free(vma); + unacct_error: + if (charged) +@@ -2659,6 +2679,8 @@ + else + err = vma_adjust(vma, vma->vm_start, addr, vma->vm_pgoff, new); + ++ uksm_vma_add_new(new); ++ + /* Success. */ + if (!err) + return 0; +@@ -2944,6 +2966,7 @@ + if ((flags & (~VM_EXEC)) != 0) + return -EINVAL; + flags |= VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT | mm->def_flags; ++ uksm_vm_flags_mod(&flags); + + error = get_unmapped_area(NULL, addr, len, 0, MAP_FIXED); + if (offset_in_page(error)) +@@ -3000,6 +3023,7 @@ + vma->vm_flags = flags; + vma->vm_page_prot = vm_get_page_prot(flags); + vma_link(mm, vma, prev, rb_link, rb_parent); ++ uksm_vma_add_new(vma); + out: + perf_event_mmap(vma); + mm->total_vm += len >> PAGE_SHIFT; +@@ -3077,6 +3101,12 @@ + up_write(&mm->mmap_sem); + } + ++ /* ++ * Taking write lock on mmap_sem does not harm others, ++ * but it's crucial for uksm to avoid races. ++ */ ++ down_write(&mm->mmap_sem); ++ + if (mm->locked_vm) { + vma = mm->mmap; + while (vma) { +@@ -3111,6 +3141,11 @@ + vma = remove_vma(vma); + } + vm_unacct_memory(nr_accounted); ++ ++ mm->mmap = NULL; ++ mm->mm_rb = RB_ROOT; ++ vmacache_invalidate(mm); ++ up_write(&mm->mmap_sem); + } + + /* Insert vm structure into process list sorted by address +@@ -3218,6 +3253,7 @@ + new_vma->vm_ops->open(new_vma); + vma_link(mm, new_vma, prev, rb_link, rb_parent); + *need_rmap_locks = false; ++ uksm_vma_add_new(new_vma); + } + return new_vma; + +@@ -3368,6 +3404,7 @@ + vm_stat_account(mm, vma->vm_flags, len >> PAGE_SHIFT); + + perf_event_mmap(vma); ++ uksm_vma_add_new(vma); + + return vma; + +diff -Nur a/mm/rmap.c b/mm/rmap.c +--- a/mm/rmap.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/rmap.c 2019-02-09 17:23:06.736864024 +0000 +@@ -1017,9 +1017,9 @@ + + /** + * __page_set_anon_rmap - set up new anonymous rmap +- * @page: Page to add to rmap ++ * @page: Page to add to rmap + * @vma: VM area to add page to. +- * @address: User virtual address of the mapping ++ * @address: User virtual address of the mapping + * @exclusive: the page is exclusively owned by the current process + */ + static void __page_set_anon_rmap(struct page *page, +diff -Nur a/mm/uksm.c b/mm/uksm.c +--- a/mm/uksm.c 1970-01-01 01:00:00.000000000 +0100 ++++ b/mm/uksm.c 2019-02-09 17:23:06.736864024 +0000 +@@ -0,0 +1,5584 @@ ++/* ++ * Ultra KSM. Copyright (C) 2011-2012 Nai Xia ++ * ++ * This is an improvement upon KSM. Some basic data structures and routines ++ * are borrowed from ksm.c . ++ * ++ * Its new features: ++ * 1. Full system scan: ++ * It automatically scans all user processes' anonymous VMAs. Kernel-user ++ * interaction to submit a memory area to KSM is no longer needed. ++ * ++ * 2. Rich area detection: ++ * It automatically detects rich areas containing abundant duplicated ++ * pages based. Rich areas are given a full scan speed. Poor areas are ++ * sampled at a reasonable speed with very low CPU consumption. ++ * ++ * 3. Ultra Per-page scan speed improvement: ++ * A new hash algorithm is proposed. As a result, on a machine with ++ * Core(TM)2 Quad Q9300 CPU in 32-bit mode and 800MHZ DDR2 main memory, it ++ * can scan memory areas that does not contain duplicated pages at speed of ++ * 627MB/sec ~ 2445MB/sec and can merge duplicated areas at speed of ++ * 477MB/sec ~ 923MB/sec. ++ * ++ * 4. Thrashing area avoidance: ++ * Thrashing area(an VMA that has frequent Ksm page break-out) can be ++ * filtered out. My benchmark shows it's more efficient than KSM's per-page ++ * hash value based volatile page detection. ++ * ++ * ++ * 5. Misc changes upon KSM: ++ * * It has a fully x86-opitmized memcmp dedicated for 4-byte-aligned page ++ * comparison. It's much faster than default C version on x86. ++ * * rmap_item now has an struct *page member to loosely cache a ++ * address-->page mapping, which reduces too much time-costly ++ * follow_page(). ++ * * The VMA creation/exit procedures are hooked to let the Ultra KSM know. ++ * * try_to_merge_two_pages() now can revert a pte if it fails. No break_ ++ * ksm is needed for this case. ++ * ++ * 6. Full Zero Page consideration(contributed by Figo Zhang) ++ * Now uksmd consider full zero pages as special pages and merge them to an ++ * special unswappable uksm zero page. ++ */ ++ ++#include <linux/errno.h> ++#include <linux/mm.h> ++#include <linux/fs.h> ++#include <linux/mman.h> ++#include <linux/sched.h> ++#include <linux/sched/mm.h> ++#include <linux/sched/coredump.h> ++#include <linux/sched/cputime.h> ++#include <linux/rwsem.h> ++#include <linux/pagemap.h> ++#include <linux/rmap.h> ++#include <linux/spinlock.h> ++#include <linux/jhash.h> ++#include <linux/delay.h> ++#include <linux/kthread.h> ++#include <linux/wait.h> ++#include <linux/slab.h> ++#include <linux/rbtree.h> ++#include <linux/memory.h> ++#include <linux/mmu_notifier.h> ++#include <linux/swap.h> ++#include <linux/ksm.h> ++#include <linux/crypto.h> ++#include <linux/scatterlist.h> ++#include <crypto/hash.h> ++#include <linux/random.h> ++#include <linux/math64.h> ++#include <linux/gcd.h> ++#include <linux/freezer.h> ++#include <linux/oom.h> ++#include <linux/numa.h> ++#include <linux/sradix-tree.h> ++ ++#include <asm/tlbflush.h> ++#include "internal.h" ++ ++#ifdef CONFIG_X86 ++#undef memcmp ++ ++#ifdef CONFIG_X86_32 ++#define memcmp memcmpx86_32 ++/* ++ * Compare 4-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_32(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 4; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testl %3,%3\n\t" ++ "repe; cmpsd\n\t" ++ "je 1f\n\t" ++ "sbbl %0,%0\n\t" ++ "orl $1,%0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++/* ++ * Check the page is all zero ? ++ */ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 4; ++ ++ __asm__ __volatile__ ++ ("repe; scasl;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++ ++#elif defined(CONFIG_X86_64) ++#define memcmp memcmpx86_64 ++/* ++ * Compare 8-byte-aligned address s1 and s2, with length n ++ */ ++int memcmpx86_64(void *s1, void *s2, size_t n) ++{ ++ size_t num = n / 8; ++ register int res; ++ ++ __asm__ __volatile__ ++ ( ++ "testq %q3,%q3\n\t" ++ "repe; cmpsq\n\t" ++ "je 1f\n\t" ++ "sbbq %q0,%q0\n\t" ++ "orq $1,%q0\n" ++ "1:" ++ : "=&a" (res), "+&S" (s1), "+&D" (s2), "+&c" (num) ++ : "0" (0) ++ : "cc"); ++ ++ return res; ++} ++ ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned char same; ++ ++ len /= 8; ++ ++ __asm__ __volatile__ ++ ("repe; scasq;" ++ "sete %0" ++ : "=qm" (same), "+D" (s1), "+c" (len) ++ : "a" (0) ++ : "cc"); ++ ++ return same; ++} ++ ++#endif ++#else ++static int is_full_zero(const void *s1, size_t len) ++{ ++ unsigned long *src = s1; ++ int i; ++ ++ len /= sizeof(*src); ++ ++ for (i = 0; i < len; i++) { ++ if (src[i]) ++ return 0; ++ } ++ ++ return 1; ++} ++#endif ++ ++#define UKSM_RUNG_ROUND_FINISHED (1 << 0) ++#define TIME_RATIO_SCALE 10000 ++ ++#define SLOT_TREE_NODE_SHIFT 8 ++#define SLOT_TREE_NODE_STORE_SIZE (1UL << SLOT_TREE_NODE_SHIFT) ++struct slot_tree_node { ++ unsigned long size; ++ struct sradix_tree_node snode; ++ void *stores[SLOT_TREE_NODE_STORE_SIZE]; ++}; ++ ++static struct kmem_cache *slot_tree_node_cachep; ++ ++static struct sradix_tree_node *slot_tree_node_alloc(void) ++{ ++ struct slot_tree_node *p; ++ ++ p = kmem_cache_zalloc(slot_tree_node_cachep, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!p) ++ return NULL; ++ ++ return &p->snode; ++} ++ ++static void slot_tree_node_free(struct sradix_tree_node *node) ++{ ++ struct slot_tree_node *p; ++ ++ p = container_of(node, struct slot_tree_node, snode); ++ kmem_cache_free(slot_tree_node_cachep, p); ++} ++ ++static void slot_tree_node_extend(struct sradix_tree_node *parent, ++ struct sradix_tree_node *child) ++{ ++ struct slot_tree_node *p, *c; ++ ++ p = container_of(parent, struct slot_tree_node, snode); ++ c = container_of(child, struct slot_tree_node, snode); ++ ++ p->size += c->size; ++} ++ ++void slot_tree_node_assign(struct sradix_tree_node *node, ++ unsigned int index, void *item) ++{ ++ struct vma_slot *slot = item; ++ struct slot_tree_node *cur; ++ ++ slot->snode = node; ++ slot->sindex = index; ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size += slot->pages; ++ node = node->parent; ++ } ++} ++ ++void slot_tree_node_rm(struct sradix_tree_node *node, unsigned int offset) ++{ ++ struct vma_slot *slot; ++ struct slot_tree_node *cur; ++ unsigned long pages; ++ ++ if (node->height == 1) { ++ slot = node->stores[offset]; ++ pages = slot->pages; ++ } else { ++ cur = container_of(node->stores[offset], ++ struct slot_tree_node, snode); ++ pages = cur->size; ++ } ++ ++ while (node) { ++ cur = container_of(node, struct slot_tree_node, snode); ++ cur->size -= pages; ++ node = node->parent; ++ } ++} ++ ++unsigned long slot_iter_index; ++int slot_iter(void *item, unsigned long height) ++{ ++ struct slot_tree_node *node; ++ struct vma_slot *slot; ++ ++ if (height == 1) { ++ slot = item; ++ if (slot_iter_index < slot->pages) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= slot->pages; ++ return 0; ++ } ++ ++ } else { ++ node = container_of(item, struct slot_tree_node, snode); ++ if (slot_iter_index < node->size) { ++ /*in this one*/ ++ return 1; ++ } else { ++ slot_iter_index -= node->size; ++ return 0; ++ } ++ } ++} ++ ++ ++static inline void slot_tree_init_root(struct sradix_tree_root *root) ++{ ++ init_sradix_tree_root(root, SLOT_TREE_NODE_SHIFT); ++ root->alloc = slot_tree_node_alloc; ++ root->free = slot_tree_node_free; ++ root->extend = slot_tree_node_extend; ++ root->assign = slot_tree_node_assign; ++ root->rm = slot_tree_node_rm; ++} ++ ++void slot_tree_init(void) ++{ ++ slot_tree_node_cachep = kmem_cache_create("slot_tree_node", ++ sizeof(struct slot_tree_node), 0, ++ SLAB_PANIC | SLAB_RECLAIM_ACCOUNT, ++ NULL); ++} ++ ++ ++/* Each rung of this ladder is a list of VMAs having a same scan ratio */ ++struct scan_rung { ++ //struct list_head scanned_list; ++ struct sradix_tree_root vma_root; ++ struct sradix_tree_root vma_root2; ++ ++ struct vma_slot *current_scan; ++ unsigned long current_offset; ++ ++ /* ++ * The initial value for current_offset, it should loop over ++ * [0~ step - 1] to let all slot have its chance to be scanned. ++ */ ++ unsigned long offset_init; ++ unsigned long step; /* dynamic step for current_offset */ ++ unsigned int flags; ++ unsigned long pages_to_scan; ++ //unsigned long fully_scanned_slots; ++ /* ++ * a little bit tricky - if cpu_time_ratio > 0, then the value is the ++ * the cpu time ratio it can spend in rung_i for every scan ++ * period. if < 0, then it is the cpu time ratio relative to the ++ * max cpu percentage user specified. Both in unit of ++ * 1/TIME_RATIO_SCALE ++ */ ++ int cpu_ratio; ++ ++ /* ++ * How long it will take for all slots in this rung to be fully ++ * scanned? If it's zero, we don't care about the cover time: ++ * it's fully scanned. ++ */ ++ unsigned int cover_msecs; ++ //unsigned long vma_num; ++ //unsigned long pages; /* Sum of all slot's pages in rung */ ++}; ++ ++/** ++ * node of either the stable or unstale rbtree ++ * ++ */ ++struct tree_node { ++ struct rb_node node; /* link in the main (un)stable rbtree */ ++ struct rb_root sub_root; /* rb_root for sublevel collision rbtree */ ++ u32 hash; ++ unsigned long count; /* TODO: merged with sub_root */ ++ struct list_head all_list; /* all tree nodes in stable/unstable tree */ ++}; ++ ++/** ++ * struct stable_node - node of the stable rbtree ++ * @node: rb node of this ksm page in the stable tree ++ * @hlist: hlist head of rmap_items using this ksm page ++ * @kpfn: page frame number of this ksm page ++ */ ++struct stable_node { ++ struct rb_node node; /* link in sub-rbtree */ ++ struct tree_node *tree_node; /* it's tree node root in stable tree, NULL if it's in hell list */ ++ struct hlist_head hlist; ++ unsigned long kpfn; ++ u32 hash_max; /* if ==0 then it's not been calculated yet */ ++ struct list_head all_list; /* in a list for all stable nodes */ ++}; ++ ++/** ++ * struct node_vma - group rmap_items linked in a same stable ++ * node together. ++ */ ++struct node_vma { ++ union { ++ struct vma_slot *slot; ++ unsigned long key; /* slot is used as key sorted on hlist */ ++ }; ++ struct hlist_node hlist; ++ struct hlist_head rmap_hlist; ++ struct stable_node *head; ++}; ++ ++/** ++ * struct rmap_item - reverse mapping item for virtual addresses ++ * @rmap_list: next rmap_item in mm_slot's singly-linked rmap_list ++ * @anon_vma: pointer to anon_vma for this mm,address, when in stable tree ++ * @mm: the memory structure this rmap_item is pointing into ++ * @address: the virtual address this rmap_item tracks (+ flags in low bits) ++ * @node: rb node of this rmap_item in the unstable tree ++ * @head: pointer to stable_node heading this list in the stable tree ++ * @hlist: link into hlist of rmap_items hanging off that stable_node ++ */ ++struct rmap_item { ++ struct vma_slot *slot; ++ struct page *page; ++ unsigned long address; /* + low bits used for flags below */ ++ unsigned long hash_round; ++ unsigned long entry_index; ++ union { ++ struct {/* when in unstable tree */ ++ struct rb_node node; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ }; ++ struct { /* when in stable tree */ ++ struct node_vma *head; ++ struct hlist_node hlist; ++ struct anon_vma *anon_vma; ++ }; ++ }; ++} __aligned(4); ++ ++struct rmap_list_entry { ++ union { ++ struct rmap_item *item; ++ unsigned long addr; ++ }; ++ /* lowest bit is used for is_addr tag */ ++} __aligned(4); /* 4 aligned to fit in to pages*/ ++ ++ ++/* Basic data structure definition ends */ ++ ++ ++/* ++ * Flags for rmap_item to judge if it's listed in the stable/unstable tree. ++ * The flags use the low bits of rmap_item.address ++ */ ++#define UNSTABLE_FLAG 0x1 ++#define STABLE_FLAG 0x2 ++#define get_rmap_addr(x) ((x)->address & PAGE_MASK) ++ ++/* ++ * rmap_list_entry helpers ++ */ ++#define IS_ADDR_FLAG 1 ++#define is_addr(ptr) ((unsigned long)(ptr) & IS_ADDR_FLAG) ++#define set_is_addr(ptr) ((ptr) |= IS_ADDR_FLAG) ++#define get_clean_addr(ptr) (((ptr) & ~(__typeof__(ptr))IS_ADDR_FLAG)) ++ ++ ++/* ++ * High speed caches for frequently allocated and freed structs ++ */ ++static struct kmem_cache *rmap_item_cache; ++static struct kmem_cache *stable_node_cache; ++static struct kmem_cache *node_vma_cache; ++static struct kmem_cache *vma_slot_cache; ++static struct kmem_cache *tree_node_cache; ++#define UKSM_KMEM_CACHE(__struct, __flags) kmem_cache_create("uksm_"#__struct,\ ++ sizeof(struct __struct), __alignof__(struct __struct),\ ++ (__flags), NULL) ++ ++/* Array of all scan_rung, uksm_scan_ladder[0] having the minimum scan ratio */ ++#define SCAN_LADDER_SIZE 4 ++static struct scan_rung uksm_scan_ladder[SCAN_LADDER_SIZE]; ++ ++/* The evaluation rounds uksmd has finished */ ++static unsigned long long uksm_eval_round = 1; ++ ++/* ++ * we add 1 to this var when we consider we should rebuild the whole ++ * unstable tree. ++ */ ++static unsigned long uksm_hash_round = 1; ++ ++/* ++ * How many times the whole memory is scanned. ++ */ ++static unsigned long long fully_scanned_round = 1; ++ ++/* The total number of virtual pages of all vma slots */ ++static u64 uksm_pages_total; ++ ++/* The number of pages has been scanned since the start up */ ++static u64 uksm_pages_scanned; ++ ++static u64 scanned_virtual_pages; ++ ++/* The number of pages has been scanned since last encode_benefit call */ ++static u64 uksm_pages_scanned_last; ++ ++/* If the scanned number is tooo large, we encode it here */ ++static u64 pages_scanned_stored; ++ ++static unsigned long pages_scanned_base; ++ ++/* The number of nodes in the stable tree */ ++static unsigned long uksm_pages_shared; ++ ++/* The number of page slots additionally sharing those nodes */ ++static unsigned long uksm_pages_sharing; ++ ++/* The number of nodes in the unstable tree */ ++static unsigned long uksm_pages_unshared; ++ ++/* ++ * Milliseconds ksmd should sleep between scans, ++ * >= 100ms to be consistent with ++ * scan_time_to_sleep_msec() ++ */ ++static unsigned int uksm_sleep_jiffies; ++ ++/* The real value for the uksmd next sleep */ ++static unsigned int uksm_sleep_real; ++ ++/* Saved value for user input uksm_sleep_jiffies when it's enlarged */ ++static unsigned int uksm_sleep_saved; ++ ++/* Max percentage of cpu utilization ksmd can take to scan in one batch */ ++static unsigned int uksm_max_cpu_percentage; ++ ++static int uksm_cpu_governor; ++ ++static char *uksm_cpu_governor_str[4] = { "full", "medium", "low", "quiet" }; ++ ++struct uksm_cpu_preset_s { ++ int cpu_ratio[SCAN_LADDER_SIZE]; ++ unsigned int cover_msecs[SCAN_LADDER_SIZE]; ++ unsigned int max_cpu; /* percentage */ ++}; ++ ++struct uksm_cpu_preset_s uksm_cpu_preset[4] = { ++ { {20, 40, -2500, -10000}, {1000, 500, 200, 50}, 95}, ++ { {20, 30, -2500, -10000}, {1000, 500, 400, 100}, 50}, ++ { {10, 20, -5000, -10000}, {1500, 1000, 1000, 250}, 20}, ++ { {10, 20, 40, 75}, {2000, 1000, 1000, 1000}, 1}, ++}; ++ ++/* The default value for uksm_ema_page_time if it's not initialized */ ++#define UKSM_PAGE_TIME_DEFAULT 500 ++ ++/*cost to scan one page by expotional moving average in nsecs */ ++static unsigned long uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++/* The expotional moving average alpha weight, in percentage. */ ++#define EMA_ALPHA 20 ++ ++/* ++ * The threshold used to filter out thrashing areas, ++ * If it == 0, filtering is disabled, otherwise it's the percentage up-bound ++ * of the thrashing ratio of all areas. Any area with a bigger thrashing ratio ++ * will be considered as having a zero duplication ratio. ++ */ ++static unsigned int uksm_thrash_threshold = 50; ++ ++/* How much dedup ratio is considered to be abundant*/ ++static unsigned int uksm_abundant_threshold = 10; ++ ++/* All slots having merged pages in this eval round. */ ++struct list_head vma_slot_dedup = LIST_HEAD_INIT(vma_slot_dedup); ++ ++/* How many times the ksmd has slept since startup */ ++static unsigned long long uksm_sleep_times; ++ ++#define UKSM_RUN_STOP 0 ++#define UKSM_RUN_MERGE 1 ++static unsigned int uksm_run = 1; ++ ++static DECLARE_WAIT_QUEUE_HEAD(uksm_thread_wait); ++static DEFINE_MUTEX(uksm_thread_mutex); ++ ++/* ++ * List vma_slot_new is for newly created vma_slot waiting to be added by ++ * ksmd. If one cannot be added(e.g. due to it's too small), it's moved to ++ * vma_slot_noadd. vma_slot_del is the list for vma_slot whose corresponding ++ * VMA has been removed/freed. ++ */ ++struct list_head vma_slot_new = LIST_HEAD_INIT(vma_slot_new); ++struct list_head vma_slot_noadd = LIST_HEAD_INIT(vma_slot_noadd); ++struct list_head vma_slot_del = LIST_HEAD_INIT(vma_slot_del); ++static DEFINE_SPINLOCK(vma_slot_list_lock); ++ ++/* The unstable tree heads */ ++static struct rb_root root_unstable_tree = RB_ROOT; ++ ++/* ++ * All tree_nodes are in a list to be freed at once when unstable tree is ++ * freed after each scan round. ++ */ ++static struct list_head unstable_tree_node_list = ++ LIST_HEAD_INIT(unstable_tree_node_list); ++ ++/* List contains all stable nodes */ ++static struct list_head stable_node_list = LIST_HEAD_INIT(stable_node_list); ++ ++/* ++ * When the hash strength is changed, the stable tree must be delta_hashed and ++ * re-structured. We use two set of below structs to speed up the ++ * re-structuring of stable tree. ++ */ ++static struct list_head ++stable_tree_node_list[2] = {LIST_HEAD_INIT(stable_tree_node_list[0]), ++ LIST_HEAD_INIT(stable_tree_node_list[1])}; ++ ++static struct list_head *stable_tree_node_listp = &stable_tree_node_list[0]; ++static struct rb_root root_stable_tree[2] = {RB_ROOT, RB_ROOT}; ++static struct rb_root *root_stable_treep = &root_stable_tree[0]; ++static unsigned long stable_tree_index; ++ ++/* The hash strength needed to hash a full page */ ++#define HASH_STRENGTH_FULL (PAGE_SIZE / sizeof(u32)) ++ ++/* The hash strength needed for loop-back hashing */ ++#define HASH_STRENGTH_MAX (HASH_STRENGTH_FULL + 10) ++ ++/* The random offsets in a page */ ++static u32 *random_nums; ++ ++/* The hash strength */ ++static unsigned long hash_strength = HASH_STRENGTH_FULL >> 4; ++ ++/* The delta value each time the hash strength increases or decreases */ ++static unsigned long hash_strength_delta; ++#define HASH_STRENGTH_DELTA_MAX 5 ++ ++/* The time we have saved due to random_sample_hash */ ++static u64 rshash_pos; ++ ++/* The time we have wasted due to hash collision */ ++static u64 rshash_neg; ++ ++struct uksm_benefit { ++ u64 pos; ++ u64 neg; ++ u64 scanned; ++ unsigned long base; ++} benefit; ++ ++/* ++ * The relative cost of memcmp, compared to 1 time unit of random sample ++ * hash, this value is tested when ksm module is initialized ++ */ ++static unsigned long memcmp_cost; ++ ++static unsigned long rshash_neg_cont_zero; ++static unsigned long rshash_cont_obscure; ++ ++/* The possible states of hash strength adjustment heuristic */ ++enum rshash_states { ++ RSHASH_STILL, ++ RSHASH_TRYUP, ++ RSHASH_TRYDOWN, ++ RSHASH_NEW, ++ RSHASH_PRE_STILL, ++}; ++ ++/* The possible direction we are about to adjust hash strength */ ++enum rshash_direct { ++ GO_UP, ++ GO_DOWN, ++ OBSCURE, ++ STILL, ++}; ++ ++/* random sampling hash state machine */ ++static struct { ++ enum rshash_states state; ++ enum rshash_direct pre_direct; ++ u8 below_count; ++ /* Keep a lookup window of size 5, iff above_count/below_count > 3 ++ * in this window we stop trying. ++ */ ++ u8 lookup_window_index; ++ u64 stable_benefit; ++ unsigned long turn_point_down; ++ unsigned long turn_benefit_down; ++ unsigned long turn_point_up; ++ unsigned long turn_benefit_up; ++ unsigned long stable_point; ++} rshash_state; ++ ++/*zero page hash table, hash_strength [0 ~ HASH_STRENGTH_MAX]*/ ++static u32 *zero_hash_table; ++ ++static inline struct node_vma *alloc_node_vma(void) ++{ ++ struct node_vma *node_vma; ++ ++ node_vma = kmem_cache_zalloc(node_vma_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (node_vma) { ++ INIT_HLIST_HEAD(&node_vma->rmap_hlist); ++ INIT_HLIST_NODE(&node_vma->hlist); ++ } ++ return node_vma; ++} ++ ++static inline void free_node_vma(struct node_vma *node_vma) ++{ ++ kmem_cache_free(node_vma_cache, node_vma); ++} ++ ++ ++static inline struct vma_slot *alloc_vma_slot(void) ++{ ++ struct vma_slot *slot; ++ ++ /* ++ * In case ksm is not initialized by now. ++ * Oops, we need to consider the call site of uksm_init() in the future. ++ */ ++ if (!vma_slot_cache) ++ return NULL; ++ ++ slot = kmem_cache_zalloc(vma_slot_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (slot) { ++ INIT_LIST_HEAD(&slot->slot_list); ++ INIT_LIST_HEAD(&slot->dedup_list); ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ } ++ return slot; ++} ++ ++static inline void free_vma_slot(struct vma_slot *vma_slot) ++{ ++ kmem_cache_free(vma_slot_cache, vma_slot); ++} ++ ++ ++ ++static inline struct rmap_item *alloc_rmap_item(void) ++{ ++ struct rmap_item *rmap_item; ++ ++ rmap_item = kmem_cache_zalloc(rmap_item_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (rmap_item) { ++ /* bug on lowest bit is not clear for flag use */ ++ BUG_ON(is_addr(rmap_item)); ++ } ++ return rmap_item; ++} ++ ++static inline void free_rmap_item(struct rmap_item *rmap_item) ++{ ++ rmap_item->slot = NULL; /* debug safety */ ++ kmem_cache_free(rmap_item_cache, rmap_item); ++} ++ ++static inline struct stable_node *alloc_stable_node(void) ++{ ++ struct stable_node *node; ++ ++ node = kmem_cache_alloc(stable_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ INIT_HLIST_HEAD(&node->hlist); ++ list_add(&node->all_list, &stable_node_list); ++ return node; ++} ++ ++static inline void free_stable_node(struct stable_node *stable_node) ++{ ++ list_del(&stable_node->all_list); ++ kmem_cache_free(stable_node_cache, stable_node); ++} ++ ++static inline struct tree_node *alloc_tree_node(struct list_head *list) ++{ ++ struct tree_node *node; ++ ++ node = kmem_cache_zalloc(tree_node_cache, GFP_KERNEL | ++ __GFP_NORETRY | __GFP_NOWARN); ++ if (!node) ++ return NULL; ++ ++ list_add(&node->all_list, list); ++ return node; ++} ++ ++static inline void free_tree_node(struct tree_node *node) ++{ ++ list_del(&node->all_list); ++ kmem_cache_free(tree_node_cache, node); ++} ++ ++static void uksm_drop_anon_vma(struct rmap_item *rmap_item) ++{ ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ ++ put_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * Remove a stable node from stable_tree, may unlink from its tree_node and ++ * may remove its parent tree_node if no other stable node is pending. ++ * ++ * @stable_node The node need to be removed ++ * @unlink_rb Will this node be unlinked from the rbtree? ++ * @remove_tree_ node Will its tree_node be removed if empty? ++ */ ++static void remove_node_from_stable_tree(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ struct hlist_node *n; ++ ++ if (!hlist_empty(&stable_node->hlist)) { ++ hlist_for_each_entry_safe(node_vma, n, ++ &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ uksm_pages_sharing--; ++ ++ uksm_drop_anon_vma(rmap_item); ++ rmap_item->address &= PAGE_MASK; ++ } ++ free_node_vma(node_vma); ++ cond_resched(); ++ } ++ ++ /* the last one is counted as shared */ ++ uksm_pages_shared--; ++ uksm_pages_sharing++; ++ } ++ ++ if (stable_node->tree_node && unlink_rb) { ++ rb_erase(&stable_node->node, ++ &stable_node->tree_node->sub_root); ++ ++ if (RB_EMPTY_ROOT(&stable_node->tree_node->sub_root) && ++ remove_tree_node) { ++ rb_erase(&stable_node->tree_node->node, ++ root_stable_treep); ++ free_tree_node(stable_node->tree_node); ++ } else { ++ stable_node->tree_node->count--; ++ } ++ } ++ ++ free_stable_node(stable_node); ++} ++ ++ ++/* ++ * get_uksm_page: checks if the page indicated by the stable node ++ * is still its ksm page, despite having held no reference to it. ++ * In which case we can trust the content of the page, and it ++ * returns the gotten page; but if the page has now been zapped, ++ * remove the stale node from the stable tree and return NULL. ++ * ++ * You would expect the stable_node to hold a reference to the ksm page. ++ * But if it increments the page's count, swapping out has to wait for ++ * ksmd to come around again before it can free the page, which may take ++ * seconds or even minutes: much too unresponsive. So instead we use a ++ * "keyhole reference": access to the ksm page from the stable node peeps ++ * out through its keyhole to see if that page still holds the right key, ++ * pointing back to this stable node. This relies on freeing a PageAnon ++ * page to reset its page->mapping to NULL, and relies on no other use of ++ * a page to put something that might look like our key in page->mapping. ++ * ++ * include/linux/pagemap.h page_cache_get_speculative() is a good reference, ++ * but this is different - made simpler by uksm_thread_mutex being held, but ++ * interesting for assuming that no other use of the struct page could ever ++ * put our expected_mapping into page->mapping (or a field of the union which ++ * coincides with page->mapping). The RCU calls are not for KSM at all, but ++ * to keep the page_count protocol described with page_cache_get_speculative. ++ * ++ * Note: it is possible that get_uksm_page() will return NULL one moment, ++ * then page the next, if the page is in between page_freeze_refs() and ++ * page_unfreeze_refs(): this shouldn't be a problem anywhere, the page ++ * is on its way to being freed; but it is an anomaly to bear in mind. ++ * ++ * @unlink_rb: if the removal of this node will firstly unlink from ++ * its rbtree. stable_node_reinsert will prevent this when restructuring the ++ * node from its old tree. ++ * ++ * @remove_tree_node: if this is the last one of its tree_node, will the ++ * tree_node be freed ? If we are inserting stable node, this tree_node may ++ * be reused, so don't free it. ++ */ ++static struct page *get_uksm_page(struct stable_node *stable_node, ++ int unlink_rb, int remove_tree_node) ++{ ++ struct page *page; ++ void *expected_mapping; ++ unsigned long kpfn; ++ ++ expected_mapping = (void *)((unsigned long)stable_node | ++ PAGE_MAPPING_KSM); ++again: ++ kpfn = READ_ONCE(stable_node->kpfn); ++ page = pfn_to_page(kpfn); ++ ++ /* ++ * page is computed from kpfn, so on most architectures reading ++ * page->mapping is naturally ordered after reading node->kpfn, ++ * but on Alpha we need to be more careful. ++ */ ++ smp_read_barrier_depends(); ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) ++ goto stale; ++ ++ /* ++ * We cannot do anything with the page while its refcount is 0. ++ * Usually 0 means free, or tail of a higher-order page: in which ++ * case this node is no longer referenced, and should be freed; ++ * however, it might mean that the page is under page_freeze_refs(). ++ * The __remove_mapping() case is easy, again the node is now stale; ++ * but if page is swapcache in migrate_page_move_mapping(), it might ++ * still be our page, in which case it's essential to keep the node. ++ */ ++ while (!get_page_unless_zero(page)) { ++ /* ++ * Another check for page->mapping != expected_mapping would ++ * work here too. We have chosen the !PageSwapCache test to ++ * optimize the common case, when the page is or is about to ++ * be freed: PageSwapCache is cleared (under spin_lock_irq) ++ * in the freeze_refs section of __remove_mapping(); but Anon ++ * page->mapping reset to NULL later, in free_pages_prepare(). ++ */ ++ if (!PageSwapCache(page)) ++ goto stale; ++ cpu_relax(); ++ } ++ ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ put_page(page); ++ goto stale; ++ } ++ ++ lock_page(page); ++ if (READ_ONCE(page->mapping) != expected_mapping) { ++ unlock_page(page); ++ put_page(page); ++ goto stale; ++ } ++ unlock_page(page); ++ return page; ++stale: ++ /* ++ * We come here from above when page->mapping or !PageSwapCache ++ * suggests that the node is stale; but it might be under migration. ++ * We need smp_rmb(), matching the smp_wmb() in ksm_migrate_page(), ++ * before checking whether node->kpfn has been changed. ++ */ ++ smp_rmb(); ++ if (stable_node->kpfn != kpfn) ++ goto again; ++ ++ remove_node_from_stable_tree(stable_node, unlink_rb, remove_tree_node); ++ ++ return NULL; ++} ++ ++/* ++ * Removing rmap_item from stable or unstable tree. ++ * This function will clean the information from the stable/unstable tree. ++ */ ++static inline void remove_rmap_item_from_tree(struct rmap_item *rmap_item) ++{ ++ if (rmap_item->address & STABLE_FLAG) { ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct page *page; ++ ++ node_vma = rmap_item->head; ++ stable_node = node_vma->head; ++ page = get_uksm_page(stable_node, 1, 1); ++ if (!page) ++ goto out; ++ ++ /* ++ * page lock is needed because it's racing with ++ * try_to_unmap_ksm(), etc. ++ */ ++ lock_page(page); ++ hlist_del(&rmap_item->hlist); ++ ++ if (hlist_empty(&node_vma->rmap_hlist)) { ++ hlist_del(&node_vma->hlist); ++ free_node_vma(node_vma); ++ } ++ unlock_page(page); ++ ++ put_page(page); ++ if (hlist_empty(&stable_node->hlist)) { ++ /* do NOT call remove_node_from_stable_tree() here, ++ * it's possible for a forked rmap_item not in ++ * stable tree while the in-tree rmap_items were ++ * deleted. ++ */ ++ uksm_pages_shared--; ++ } else ++ uksm_pages_sharing--; ++ ++ ++ uksm_drop_anon_vma(rmap_item); ++ } else if (rmap_item->address & UNSTABLE_FLAG) { ++ if (rmap_item->hash_round == uksm_hash_round) { ++ ++ rb_erase(&rmap_item->node, ++ &rmap_item->tree_node->sub_root); ++ if (RB_EMPTY_ROOT(&rmap_item->tree_node->sub_root)) { ++ rb_erase(&rmap_item->tree_node->node, ++ &root_unstable_tree); ++ ++ free_tree_node(rmap_item->tree_node); ++ } else ++ rmap_item->tree_node->count--; ++ } ++ uksm_pages_unshared--; ++ } ++ ++ rmap_item->address &= PAGE_MASK; ++ rmap_item->hash_max = 0; ++ ++out: ++ cond_resched(); /* we're called from many long loops */ ++} ++ ++static inline int slot_in_uksm(struct vma_slot *slot) ++{ ++ return list_empty(&slot->slot_list); ++} ++ ++/* ++ * Test if the mm is exiting ++ */ ++static inline bool uksm_test_exit(struct mm_struct *mm) ++{ ++ return atomic_read(&mm->mm_users) == 0; ++} ++ ++static inline unsigned long vma_pool_size(struct vma_slot *slot) ++{ ++ return round_up(sizeof(struct rmap_list_entry) * slot->pages, ++ PAGE_SIZE) >> PAGE_SHIFT; ++} ++ ++#define CAN_OVERFLOW_U64(x, delta) (U64_MAX - (x) < (delta)) ++ ++/* must be done with sem locked */ ++static int slot_pool_alloc(struct vma_slot *slot) ++{ ++ unsigned long pool_size; ++ ++ if (slot->rmap_list_pool) ++ return 0; ++ ++ pool_size = vma_pool_size(slot); ++ slot->rmap_list_pool = kcalloc(pool_size, sizeof(struct page *), ++ GFP_KERNEL); ++ if (!slot->rmap_list_pool) ++ return -ENOMEM; ++ ++ slot->pool_counts = kcalloc(pool_size, sizeof(unsigned int), ++ GFP_KERNEL); ++ if (!slot->pool_counts) { ++ kfree(slot->rmap_list_pool); ++ return -ENOMEM; ++ } ++ ++ slot->pool_size = pool_size; ++ BUG_ON(CAN_OVERFLOW_U64(uksm_pages_total, slot->pages)); ++ slot->flags |= UKSM_SLOT_IN_UKSM; ++ uksm_pages_total += slot->pages; ++ ++ return 0; ++} ++ ++/* ++ * Called after vma is unlinked from its mm ++ */ ++void uksm_remove_vma(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma->uksm_vma_slot) ++ return; ++ ++ spin_lock(&vma_slot_list_lock); ++ slot = vma->uksm_vma_slot; ++ if (!slot) ++ goto out; ++ ++ if (slot_in_uksm(slot)) { ++ /** ++ * This slot has been added by ksmd, so move to the del list ++ * waiting ksmd to free it. ++ */ ++ list_add_tail(&slot->slot_list, &vma_slot_del); ++ } else { ++ /** ++ * It's still on new list. It's ok to free slot directly. ++ */ ++ list_del(&slot->slot_list); ++ free_vma_slot(slot); ++ } ++out: ++ vma->uksm_vma_slot = NULL; ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/** ++ * Need to do two things: ++ * 1. check if slot was moved to del list ++ * 2. make sure the mmap_sem is manipulated under valid vma. ++ * ++ * My concern here is that in some cases, this may make ++ * vma_slot_list_lock() waiters to serialized further by some ++ * sem->wait_lock, can this really be expensive? ++ * ++ * ++ * @return ++ * 0: if successfully locked mmap_sem ++ * -ENOENT: this slot was moved to del list ++ * -EBUSY: vma lock failed ++ */ ++static int try_down_read_slot_mmap_sem(struct vma_slot *slot) ++{ ++ struct vm_area_struct *vma; ++ struct mm_struct *mm; ++ struct rw_semaphore *sem; ++ ++ spin_lock(&vma_slot_list_lock); ++ ++ /* the slot_list was removed and inited from new list, when it enters ++ * uksm_list. If now it's not empty, then it must be moved to del list ++ */ ++ if (!slot_in_uksm(slot)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ BUG_ON(slot->pages != vma_pages(slot->vma)); ++ /* Ok, vma still valid */ ++ vma = slot->vma; ++ mm = vma->vm_mm; ++ sem = &mm->mmap_sem; ++ ++ if (uksm_test_exit(mm)) { ++ spin_unlock(&vma_slot_list_lock); ++ return -ENOENT; ++ } ++ ++ if (down_read_trylock(sem)) { ++ spin_unlock(&vma_slot_list_lock); ++ if (slot_pool_alloc(slot)) { ++ uksm_remove_vma(vma); ++ up_read(sem); ++ return -ENOENT; ++ } ++ return 0; ++ } ++ ++ spin_unlock(&vma_slot_list_lock); ++ return -EBUSY; ++} ++ ++static inline unsigned long ++vma_page_address(struct page *page, struct vm_area_struct *vma) ++{ ++ pgoff_t pgoff = page->index; ++ unsigned long address; ++ ++ address = vma->vm_start + ((pgoff - vma->vm_pgoff) << PAGE_SHIFT); ++ if (unlikely(address < vma->vm_start || address >= vma->vm_end)) { ++ /* page should be within @vma mapping range */ ++ return -EFAULT; ++ } ++ return address; ++} ++ ++ ++/* return 0 on success with the item's mmap_sem locked */ ++static inline int get_mergeable_page_lock_mmap(struct rmap_item *item) ++{ ++ struct mm_struct *mm; ++ struct vma_slot *slot = item->slot; ++ int err = -EINVAL; ++ ++ struct page *page; ++ ++ /* ++ * try_down_read_slot_mmap_sem() returns non-zero if the slot ++ * has been removed by uksm_remove_vma(). ++ */ ++ if (try_down_read_slot_mmap_sem(slot)) ++ return -EBUSY; ++ ++ mm = slot->vma->vm_mm; ++ ++ if (uksm_test_exit(mm)) ++ goto failout_up; ++ ++ page = item->page; ++ rcu_read_lock(); ++ if (!get_page_unless_zero(page)) { ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ ++ /* No need to consider huge page here. */ ++ if (item->slot->vma->anon_vma != page_anon_vma(page) || ++ vma_page_address(page, item->slot->vma) != get_rmap_addr(item)) { ++ /* ++ * TODO: ++ * should we release this item becase of its stale page ++ * mapping? ++ */ ++ put_page(page); ++ rcu_read_unlock(); ++ goto failout_up; ++ } ++ rcu_read_unlock(); ++ return 0; ++ ++failout_up: ++ up_read(&mm->mmap_sem); ++ return err; ++} ++ ++/* ++ * What kind of VMA is considered ? ++ */ ++static inline int vma_can_enter(struct vm_area_struct *vma) ++{ ++ return uksm_flags_can_scan(vma->vm_flags); ++} ++ ++/* ++ * Called whenever a fresh new vma is created A new vma_slot. ++ * is created and inserted into a global list Must be called. ++ * after vma is inserted to its mm. ++ */ ++void uksm_vma_add_new(struct vm_area_struct *vma) ++{ ++ struct vma_slot *slot; ++ ++ if (!vma_can_enter(vma)) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ slot = alloc_vma_slot(); ++ if (!slot) { ++ vma->uksm_vma_slot = NULL; ++ return; ++ } ++ ++ vma->uksm_vma_slot = slot; ++ vma->vm_flags |= VM_MERGEABLE; ++ slot->vma = vma; ++ slot->mm = vma->vm_mm; ++ slot->ctime_j = jiffies; ++ slot->pages = vma_pages(vma); ++ spin_lock(&vma_slot_list_lock); ++ list_add_tail(&slot->slot_list, &vma_slot_new); ++ spin_unlock(&vma_slot_list_lock); ++} ++ ++/* 32/3 < they < 32/2 */ ++#define shiftl 8 ++#define shiftr 12 ++ ++#define HASH_FROM_TO(from, to) \ ++for (index = from; index < to; index++) { \ ++ pos = random_nums[index]; \ ++ hash += key[pos]; \ ++ hash += (hash << shiftl); \ ++ hash ^= (hash >> shiftr); \ ++} ++ ++ ++#define HASH_FROM_DOWN_TO(from, to) \ ++for (index = from - 1; index >= to; index--) { \ ++ hash ^= (hash >> shiftr); \ ++ hash ^= (hash >> (shiftr*2)); \ ++ hash -= (hash << shiftl); \ ++ hash += (hash << (shiftl*2)); \ ++ pos = random_nums[index]; \ ++ hash -= key[pos]; \ ++} ++ ++/* ++ * The main random sample hash function. ++ */ ++static u32 random_sample_hash(void *addr, u32 hash_strength) ++{ ++ u32 hash = 0xdeadbeef; ++ int index, pos, loop = hash_strength; ++ u32 *key = (u32 *)addr; ++ ++ if (loop > HASH_STRENGTH_FULL) ++ loop = HASH_STRENGTH_FULL; ++ ++ HASH_FROM_TO(0, loop); ++ ++ if (hash_strength > HASH_STRENGTH_FULL) { ++ loop = hash_strength - HASH_STRENGTH_FULL; ++ HASH_FROM_TO(0, loop); ++ } ++ ++ return hash; ++} ++ ++ ++/** ++ * It's used when hash strength is adjusted ++ * ++ * @addr The page's virtual address ++ * @from The original hash strength ++ * @to The hash strength changed to ++ * @hash The hash value generated with "from" hash value ++ * ++ * return the hash value ++ */ ++static u32 delta_hash(void *addr, int from, int to, u32 hash) ++{ ++ u32 *key = (u32 *)addr; ++ int index, pos; /* make sure they are int type */ ++ ++ if (to > from) { ++ if (from >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_TO(from, to); ++ } else if (to <= HASH_STRENGTH_FULL) { ++ HASH_FROM_TO(from, to); ++ } else { ++ HASH_FROM_TO(from, HASH_STRENGTH_FULL); ++ HASH_FROM_TO(0, to - HASH_STRENGTH_FULL); ++ } ++ } else { ++ if (from <= HASH_STRENGTH_FULL) { ++ HASH_FROM_DOWN_TO(from, to); ++ } else if (to >= HASH_STRENGTH_FULL) { ++ from -= HASH_STRENGTH_FULL; ++ to -= HASH_STRENGTH_FULL; ++ HASH_FROM_DOWN_TO(from, to); ++ } else { ++ HASH_FROM_DOWN_TO(from - HASH_STRENGTH_FULL, 0); ++ HASH_FROM_DOWN_TO(HASH_STRENGTH_FULL, to); ++ } ++ } ++ ++ return hash; ++} ++ ++/** ++ * ++ * Called when: rshash_pos or rshash_neg is about to overflow or a scan round ++ * has finished. ++ * ++ * return 0 if no page has been scanned since last call, 1 otherwise. ++ */ ++static inline int encode_benefit(void) ++{ ++ u64 scanned_delta, pos_delta, neg_delta; ++ unsigned long base = benefit.base; ++ ++ scanned_delta = uksm_pages_scanned - uksm_pages_scanned_last; ++ ++ if (!scanned_delta) ++ return 0; ++ ++ scanned_delta >>= base; ++ pos_delta = rshash_pos >> base; ++ neg_delta = rshash_neg >> base; ++ ++ if (CAN_OVERFLOW_U64(benefit.pos, pos_delta) || ++ CAN_OVERFLOW_U64(benefit.neg, neg_delta) || ++ CAN_OVERFLOW_U64(benefit.scanned, scanned_delta)) { ++ benefit.scanned >>= 1; ++ benefit.neg >>= 1; ++ benefit.pos >>= 1; ++ benefit.base++; ++ scanned_delta >>= 1; ++ pos_delta >>= 1; ++ neg_delta >>= 1; ++ } ++ ++ benefit.pos += pos_delta; ++ benefit.neg += neg_delta; ++ benefit.scanned += scanned_delta; ++ ++ BUG_ON(!benefit.scanned); ++ ++ rshash_pos = rshash_neg = 0; ++ uksm_pages_scanned_last = uksm_pages_scanned; ++ ++ return 1; ++} ++ ++static inline void reset_benefit(void) ++{ ++ benefit.pos = 0; ++ benefit.neg = 0; ++ benefit.base = 0; ++ benefit.scanned = 0; ++} ++ ++static inline void inc_rshash_pos(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_pos, delta)) ++ encode_benefit(); ++ ++ rshash_pos += delta; ++} ++ ++static inline void inc_rshash_neg(unsigned long delta) ++{ ++ if (CAN_OVERFLOW_U64(rshash_neg, delta)) ++ encode_benefit(); ++ ++ rshash_neg += delta; ++} ++ ++ ++static inline u32 page_hash(struct page *page, unsigned long hash_strength, ++ int cost_accounting) ++{ ++ u32 val; ++ unsigned long delta; ++ ++ void *addr = kmap_atomic(page); ++ ++ val = random_sample_hash(addr, hash_strength); ++ kunmap_atomic(addr); ++ ++ if (cost_accounting) { ++ if (hash_strength < HASH_STRENGTH_FULL) ++ delta = HASH_STRENGTH_FULL - hash_strength; ++ else ++ delta = 0; ++ ++ inc_rshash_pos(delta); ++ } ++ ++ return val; ++} ++ ++static int memcmp_pages(struct page *page1, struct page *page2, ++ int cost_accounting) ++{ ++ char *addr1, *addr2; ++ int ret; ++ ++ addr1 = kmap_atomic(page1); ++ addr2 = kmap_atomic(page2); ++ ret = memcmp(addr1, addr2, PAGE_SIZE); ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ if (cost_accounting) ++ inc_rshash_neg(memcmp_cost); ++ ++ return ret; ++} ++ ++static inline int pages_identical(struct page *page1, struct page *page2) ++{ ++ return !memcmp_pages(page1, page2, 0); ++} ++ ++static inline int is_page_full_zero(struct page *page) ++{ ++ char *addr; ++ int ret; ++ ++ addr = kmap_atomic(page); ++ ret = is_full_zero(addr, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ return ret; ++} ++ ++static int write_protect_page(struct vm_area_struct *vma, struct page *page, ++ pte_t *orig_pte, pte_t *old_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ struct page_vma_mapped_walk pvmw = { ++ .page = page, ++ .vma = vma, ++ }; ++ int swapped; ++ int err = -EFAULT; ++ unsigned long mmun_start; /* For mmu_notifiers */ ++ unsigned long mmun_end; /* For mmu_notifiers */ ++ ++ pvmw.address = page_address_in_vma(page, vma); ++ if (pvmw.address == -EFAULT) ++ goto out; ++ ++ BUG_ON(PageTransCompound(page)); ++ ++ mmun_start = pvmw.address; ++ mmun_end = pvmw.address + PAGE_SIZE; ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ++ ++ if (!page_vma_mapped_walk(&pvmw)) ++ goto out_mn; ++ if (WARN_ONCE(!pvmw.pte, "Unexpected PMD mapping?")) ++ goto out_unlock; ++ ++ if (old_pte) ++ *old_pte = *pvmw.pte; ++ ++ if (pte_write(*pvmw.pte) || pte_dirty(*pvmw.pte) || ++ (pte_protnone(*pvmw.pte) && pte_savedwrite(*pvmw.pte)) || mm_tlb_flush_pending(mm)) { ++ pte_t entry; ++ ++ swapped = PageSwapCache(page); ++ flush_cache_page(vma, pvmw.address, page_to_pfn(page)); ++ /* ++ * Ok this is tricky, when get_user_pages_fast() run it doesn't ++ * take any lock, therefore the check that we are going to make ++ * with the pagecount against the mapcount is racey and ++ * O_DIRECT can happen right after the check. ++ * So we clear the pte and flush the tlb before the check ++ * this assure us that no O_DIRECT can happen after the check ++ * or in the middle of the check. ++ */ ++ entry = ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); ++ /* ++ * Check that no O_DIRECT or similar I/O is in progress on the ++ * page ++ */ ++ if (page_mapcount(page) + 1 + swapped != page_count(page)) { ++ set_pte_at(mm, pvmw.address, pvmw.pte, entry); ++ goto out_unlock; ++ } ++ if (pte_dirty(entry)) ++ set_page_dirty(page); ++ ++ if (pte_protnone(entry)) ++ entry = pte_mkclean(pte_clear_savedwrite(entry)); ++ else ++ entry = pte_mkclean(pte_wrprotect(entry)); ++ ++ set_pte_at_notify(mm, pvmw.address, pvmw.pte, entry); ++ } ++ *orig_pte = *pvmw.pte; ++ err = 0; ++ ++out_unlock: ++ page_vma_mapped_walk_done(&pvmw); ++out_mn: ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); ++out: ++ return err; ++} ++ ++#define MERGE_ERR_PGERR 1 /* the page is invalid cannot continue */ ++#define MERGE_ERR_COLLI 2 /* there is a collision */ ++#define MERGE_ERR_COLLI_MAX 3 /* collision at the max hash strength */ ++#define MERGE_ERR_CHANGED 4 /* the page has changed since last hash */ ++ ++ ++/** ++ * replace_page - replace page in vma by new ksm page ++ * @vma: vma that holds the pte pointing to page ++ * @page: the page we are replacing by kpage ++ * @kpage: the ksm page we replace page by ++ * @orig_pte: the original value of the pte ++ * ++ * Returns 0 on success, MERGE_ERR_PGERR on failure. ++ */ ++static int replace_page(struct vm_area_struct *vma, struct page *page, ++ struct page *kpage, pte_t orig_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ pte_t entry; ++ ++ unsigned long addr; ++ int err = MERGE_ERR_PGERR; ++ unsigned long mmun_start; /* For mmu_notifiers */ ++ unsigned long mmun_end; /* For mmu_notifiers */ ++ ++ addr = page_address_in_vma(page, vma); ++ if (addr == -EFAULT) ++ goto out; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ BUG_ON(pmd_trans_huge(*pmd)); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ mmun_start = addr; ++ mmun_end = addr + PAGE_SIZE; ++ mmu_notifier_invalidate_range_start(mm, mmun_start, mmun_end); ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, orig_pte)) { ++ pte_unmap_unlock(ptep, ptl); ++ goto out_mn; ++ } ++ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ entry = mk_pte(kpage, vma->vm_page_prot); ++ ++ /* special treatment is needed for zero_page */ ++ if ((page_to_pfn(kpage) == uksm_zero_pfn) || ++ (page_to_pfn(kpage) == zero_pfn)) { ++ entry = pte_mkspecial(entry); ++ dec_mm_counter(mm, MM_ANONPAGES); ++ inc_zone_page_state(page, NR_UKSM_ZERO_PAGES); ++ } else { ++ get_page(kpage); ++ page_add_anon_rmap(kpage, vma, addr, false); ++ } ++ ++ set_pte_at_notify(mm, addr, ptep, entry); ++ ++ page_remove_rmap(page, false); ++ if (!page_mapped(page)) ++ try_to_free_swap(page); ++ put_page(page); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out_mn: ++ mmu_notifier_invalidate_range_end(mm, mmun_start, mmun_end); ++out: ++ return err; ++} ++ ++ ++/** ++ * Fully hash a page with HASH_STRENGTH_MAX return a non-zero hash value. The ++ * zero hash value at HASH_STRENGTH_MAX is used to indicated that its ++ * hash_max member has not been calculated. ++ * ++ * @page The page needs to be hashed ++ * @hash_old The hash value calculated with current hash strength ++ * ++ * return the new hash value calculated at HASH_STRENGTH_MAX ++ */ ++static inline u32 page_hash_max(struct page *page, u32 hash_old) ++{ ++ u32 hash_max = 0; ++ void *addr; ++ ++ addr = kmap_atomic(page); ++ hash_max = delta_hash(addr, hash_strength, ++ HASH_STRENGTH_MAX, hash_old); ++ ++ kunmap_atomic(addr); ++ ++ if (!hash_max) ++ hash_max = 1; ++ ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ return hash_max; ++} ++ ++/* ++ * We compare the hash again, to ensure that it is really a hash collision ++ * instead of being caused by page write. ++ */ ++static inline int check_collision(struct rmap_item *rmap_item, ++ u32 hash) ++{ ++ int err; ++ struct page *page = rmap_item->page; ++ ++ /* if this rmap_item has already been hash_maxed, then the collision ++ * must appears in the second-level rbtree search. In this case we check ++ * if its hash_max value has been changed. Otherwise, the collision ++ * happens in the first-level rbtree search, so we check against it's ++ * current hash value. ++ */ ++ if (rmap_item->hash_max) { ++ inc_rshash_neg(memcmp_cost); ++ inc_rshash_neg(HASH_STRENGTH_MAX - hash_strength); ++ ++ if (rmap_item->hash_max == page_hash_max(page, hash)) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } else { ++ inc_rshash_neg(memcmp_cost + hash_strength); ++ ++ if (page_hash(page, hash_strength, 0) == hash) ++ err = MERGE_ERR_COLLI; ++ else ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ return err; ++} ++ ++/** ++ * Try to merge a rmap_item.page with a kpage in stable node. kpage must ++ * already be a ksm page. ++ * ++ * @return 0 if the pages were merged, -EFAULT otherwise. ++ */ ++static int try_to_merge_with_uksm_page(struct rmap_item *rmap_item, ++ struct page *kpage, u32 hash) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = MERGE_ERR_PGERR; ++ struct page *page; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ page = rmap_item->page; ++ ++ if (page == kpage) { /* ksm page forked */ ++ err = 0; ++ goto out; ++ } ++ ++ /* ++ * We need the page lock to read a stable PageSwapCache in ++ * write_protect_page(). We use trylock_page() instead of ++ * lock_page() because we don't want to wait here - we ++ * prefer to continue scanning and merging different pages, ++ * then come back to this page when it is unlocked. ++ */ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page) || !PageKsm(kpage)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ /* ++ * If this anonymous page is mapped only here, its pte may need ++ * to be write-protected. If it's mapped elsewhere, all of its ++ * ptes are necessarily already write-protected. But in either ++ * case, we need to lock and check page_count is not raised. ++ */ ++ if (write_protect_page(vma, page, &orig_pte, NULL) == 0) { ++ if (pages_identical(page, kpage)) ++ err = replace_page(vma, page, kpage, orig_pte); ++ else ++ err = check_collision(rmap_item, hash); ++ } ++ ++ if ((vma->vm_flags & VM_LOCKED) && kpage && !err) { ++ munlock_vma_page(page); ++ if (!PageMlocked(kpage)) { ++ unlock_page(page); ++ lock_page(kpage); ++ mlock_vma_page(kpage); ++ page = kpage; /* for final unlock */ ++ } ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++ ++ ++/** ++ * If two pages fail to merge in try_to_merge_two_pages, then we have a chance ++ * to restore a page mapping that has been changed in try_to_merge_two_pages. ++ * ++ * @return 0 on success. ++ */ ++static int restore_uksm_page_pte(struct vm_area_struct *vma, unsigned long addr, ++ pte_t orig_pte, pte_t wprt_pte) ++{ ++ struct mm_struct *mm = vma->vm_mm; ++ pgd_t *pgd; ++ p4d_t *p4d; ++ pud_t *pud; ++ pmd_t *pmd; ++ pte_t *ptep; ++ spinlock_t *ptl; ++ ++ int err = -EFAULT; ++ ++ pgd = pgd_offset(mm, addr); ++ if (!pgd_present(*pgd)) ++ goto out; ++ ++ p4d = p4d_offset(pgd, addr); ++ pud = pud_offset(p4d, addr); ++ if (!pud_present(*pud)) ++ goto out; ++ ++ pmd = pmd_offset(pud, addr); ++ if (!pmd_present(*pmd)) ++ goto out; ++ ++ ptep = pte_offset_map_lock(mm, pmd, addr, &ptl); ++ if (!pte_same(*ptep, wprt_pte)) { ++ /* already copied, let it be */ ++ pte_unmap_unlock(ptep, ptl); ++ goto out; ++ } ++ ++ /* ++ * Good boy, still here. When we still get the ksm page, it does not ++ * return to the free page pool, there is no way that a pte was changed ++ * to other page and gets back to this page. And remind that ksm page ++ * do not reuse in do_wp_page(). So it's safe to restore the original ++ * pte. ++ */ ++ flush_cache_page(vma, addr, pte_pfn(*ptep)); ++ ptep_clear_flush_notify(vma, addr, ptep); ++ set_pte_at_notify(mm, addr, ptep, orig_pte); ++ ++ pte_unmap_unlock(ptep, ptl); ++ err = 0; ++out: ++ return err; ++} ++ ++/** ++ * try_to_merge_two_pages() - take two identical pages and prepare ++ * them to be merged into one page(rmap_item->page) ++ * ++ * @return 0 if we successfully merged two identical pages into ++ * one ksm page. MERGE_ERR_COLLI if it's only a hash collision ++ * search in rbtree. MERGE_ERR_CHANGED if rmap_item has been ++ * changed since it's hashed. MERGE_ERR_PGERR otherwise. ++ * ++ */ ++static int try_to_merge_two_pages(struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ u32 hash) ++{ ++ pte_t orig_pte1 = __pte(0), orig_pte2 = __pte(0); ++ pte_t wprt_pte1 = __pte(0), wprt_pte2 = __pte(0); ++ struct vm_area_struct *vma1 = rmap_item->slot->vma; ++ struct vm_area_struct *vma2 = tree_rmap_item->slot->vma; ++ struct page *page = rmap_item->page; ++ struct page *tree_page = tree_rmap_item->page; ++ int err = MERGE_ERR_PGERR; ++ struct address_space *saved_mapping; ++ ++ ++ if (rmap_item->page == tree_rmap_item->page) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma1, page, &wprt_pte1, &orig_pte1) != 0) { ++ unlock_page(page); ++ goto out; ++ } ++ ++ /* ++ * While we hold page lock, upgrade page from ++ * PageAnon+anon_vma to PageKsm+NULL stable_node: ++ * stable_tree_insert() will update stable_node. ++ */ ++ saved_mapping = page->mapping; ++ set_page_stable_node(page, NULL); ++ mark_page_accessed(page); ++ if (!PageDirty(page)) ++ SetPageDirty(page); ++ ++ unlock_page(page); ++ ++ if (!trylock_page(tree_page)) ++ goto restore_out; ++ ++ if (!PageAnon(tree_page)) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (PageTransCompound(tree_page)) { ++ err = split_huge_page(tree_page); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ } ++ ++ if (write_protect_page(vma2, tree_page, &wprt_pte2, &orig_pte2) != 0) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if (pages_identical(page, tree_page)) { ++ err = replace_page(vma2, tree_page, page, wprt_pte2); ++ if (err) { ++ unlock_page(tree_page); ++ goto restore_out; ++ } ++ ++ if ((vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(tree_page); ++ if (!PageMlocked(page)) { ++ unlock_page(tree_page); ++ lock_page(page); ++ mlock_vma_page(page); ++ tree_page = page; /* for final unlock */ ++ } ++ } ++ ++ unlock_page(tree_page); ++ ++ goto out; /* success */ ++ ++ } else { ++ if (tree_rmap_item->hash_max && ++ tree_rmap_item->hash_max == rmap_item->hash_max) { ++ err = MERGE_ERR_COLLI_MAX; ++ } else if (page_hash(page, hash_strength, 0) == ++ page_hash(tree_page, hash_strength, 0)) { ++ inc_rshash_neg(memcmp_cost + hash_strength * 2); ++ err = MERGE_ERR_COLLI; ++ } else { ++ err = MERGE_ERR_CHANGED; ++ } ++ ++ unlock_page(tree_page); ++ } ++ ++restore_out: ++ lock_page(page); ++ if (!restore_uksm_page_pte(vma1, get_rmap_addr(rmap_item), ++ orig_pte1, wprt_pte1)) ++ page->mapping = saved_mapping; ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++static inline int hash_cmp(u32 new_val, u32 node_val) ++{ ++ if (new_val > node_val) ++ return 1; ++ else if (new_val < node_val) ++ return -1; ++ else ++ return 0; ++} ++ ++static inline u32 rmap_item_hash_max(struct rmap_item *item, u32 hash) ++{ ++ u32 hash_max = item->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(item->page, hash); ++ ++ item->hash_max = hash_max; ++ } ++ ++ return hash_max; ++} ++ ++ ++ ++/** ++ * stable_tree_search() - search the stable tree for a page ++ * ++ * @item: the rmap_item we are comparing with ++ * @hash: the hash value of this item->page already calculated ++ * ++ * @return the page we have found, NULL otherwise. The page returned has ++ * been gotten. ++ */ ++static struct page *stable_tree_search(struct rmap_item *item, u32 hash) ++{ ++ struct rb_node *node = root_stable_treep->rb_node; ++ struct tree_node *tree_node; ++ unsigned long hash_max; ++ struct page *page = item->page; ++ struct stable_node *stable_node; ++ ++ stable_node = page_stable_node(page); ++ if (stable_node) { ++ /* ksm page forked, that is ++ * if (PageKsm(page) && !in_stable_tree(rmap_item)) ++ * it's actually gotten once outside. ++ */ ++ get_page(page); ++ return page; ++ } ++ ++ while (node) { ++ int cmp; ++ ++ tree_node = rb_entry(node, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ break; ++ } ++ ++ if (!node) ++ return NULL; ++ ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ BUG_ON(!stable_node); ++ ++ goto get_page_out; ++ } ++ ++ /* ++ * ok, we have to search the second ++ * level subtree, hash the page to a ++ * full strength. ++ */ ++ node = tree_node->sub_root.rb_node; ++ BUG_ON(!node); ++ hash_max = rmap_item_hash_max(item, hash); ++ ++ while (node) { ++ int cmp; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) ++ node = node->rb_left; ++ else if (cmp > 0) ++ node = node->rb_right; ++ else ++ goto get_page_out; ++ } ++ ++ return NULL; ++ ++get_page_out: ++ page = get_uksm_page(stable_node, 1, 1); ++ return page; ++} ++ ++static int try_merge_rmap_item(struct rmap_item *item, ++ struct page *kpage, ++ struct page *tree_page) ++{ ++ struct vm_area_struct *vma = item->slot->vma; ++ struct page_vma_mapped_walk pvmw = { ++ .page = kpage, ++ .vma = vma, ++ }; ++ ++ pvmw.address = get_rmap_addr(item); ++ if (!page_vma_mapped_walk(&pvmw)) ++ return 0; ++ ++ if (pte_write(*pvmw.pte)) { ++ /* has changed, abort! */ ++ page_vma_mapped_walk_done(&pvmw); ++ return 0; ++ } ++ ++ get_page(tree_page); ++ page_add_anon_rmap(tree_page, vma, pvmw.address, false); ++ ++ flush_cache_page(vma, pvmw.address, page_to_pfn(kpage)); ++ ptep_clear_flush_notify(vma, pvmw.address, pvmw.pte); ++ set_pte_at_notify(vma->vm_mm, pvmw.address, pvmw.pte, ++ mk_pte(tree_page, vma->vm_page_prot)); ++ ++ page_remove_rmap(kpage, false); ++ put_page(kpage); ++ ++ page_vma_mapped_walk_done(&pvmw); ++ ++ return 1; ++} ++ ++/** ++ * try_to_merge_with_stable_page() - when two rmap_items need to be inserted ++ * into stable tree, the page was found to be identical to a stable ksm page, ++ * this is the last chance we can merge them into one. ++ * ++ * @item1: the rmap_item holding the page which we wanted to insert ++ * into stable tree. ++ * @item2: the other rmap_item we found when unstable tree search ++ * @oldpage: the page currently mapped by the two rmap_items ++ * @tree_page: the page we found identical in stable tree node ++ * @success1: return if item1 is successfully merged ++ * @success2: return if item2 is successfully merged ++ */ ++static void try_merge_with_stable(struct rmap_item *item1, ++ struct rmap_item *item2, ++ struct page **kpage, ++ struct page *tree_page, ++ int *success1, int *success2) ++{ ++ struct vm_area_struct *vma1 = item1->slot->vma; ++ struct vm_area_struct *vma2 = item2->slot->vma; ++ *success1 = 0; ++ *success2 = 0; ++ ++ if (unlikely(*kpage == tree_page)) { ++ /* I don't think this can really happen */ ++ pr_warn("UKSM: unexpected condition detected in " ++ "%s -- *kpage == tree_page !\n", __func__); ++ *success1 = 1; ++ *success2 = 1; ++ return; ++ } ++ ++ if (!PageAnon(*kpage) || !PageKsm(*kpage)) ++ goto failed; ++ ++ if (!trylock_page(tree_page)) ++ goto failed; ++ ++ /* If the oldpage is still ksm and still pointed ++ * to in the right place, and still write protected, ++ * we are confident it's not changed, no need to ++ * memcmp anymore. ++ * be ware, we cannot take nested pte locks, ++ * deadlock risk. ++ */ ++ if (!try_merge_rmap_item(item1, *kpage, tree_page)) ++ goto unlock_failed; ++ ++ /* ok, then vma2, remind that pte1 already set */ ++ if (!try_merge_rmap_item(item2, *kpage, tree_page)) ++ goto success_1; ++ ++ *success2 = 1; ++success_1: ++ *success1 = 1; ++ ++ ++ if ((*success1 && vma1->vm_flags & VM_LOCKED) || ++ (*success2 && vma2->vm_flags & VM_LOCKED)) { ++ munlock_vma_page(*kpage); ++ if (!PageMlocked(tree_page)) ++ mlock_vma_page(tree_page); ++ } ++ ++ /* ++ * We do not need oldpage any more in the caller, so can break the lock ++ * now. ++ */ ++ unlock_page(*kpage); ++ *kpage = tree_page; /* Get unlocked outside. */ ++ return; ++ ++unlock_failed: ++ unlock_page(tree_page); ++failed: ++ return; ++} ++ ++static inline void stable_node_hash_max(struct stable_node *node, ++ struct page *page, u32 hash) ++{ ++ u32 hash_max = node->hash_max; ++ ++ if (!hash_max) { ++ hash_max = page_hash_max(page, hash); ++ node->hash_max = hash_max; ++ } ++} ++ ++static inline ++struct stable_node *new_stable_node(struct tree_node *tree_node, ++ struct page *kpage, u32 hash_max) ++{ ++ struct stable_node *new_stable_node; ++ ++ new_stable_node = alloc_stable_node(); ++ if (!new_stable_node) ++ return NULL; ++ ++ new_stable_node->kpfn = page_to_pfn(kpage); ++ new_stable_node->hash_max = hash_max; ++ new_stable_node->tree_node = tree_node; ++ set_page_stable_node(kpage, new_stable_node); ++ ++ return new_stable_node; ++} ++ ++static inline ++struct stable_node *first_level_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ int cmp; ++ struct page *tree_page; ++ u32 hash_max = 0; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent = NULL, **new; ++ ++ /* this tree node contains no sub-tree yet */ ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ ++ return stable_node; ++ ++ } else { ++ /* ++ * collision in first level try to create a subtree. ++ * A new node need to be created. ++ */ ++ put_page(tree_page); ++ ++ stable_node_hash_max(stable_node, tree_page, ++ tree_node->hash); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ } ++ ++ } else { ++ /* the only stable_node deleted, we reuse its tree_node. ++ */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++static inline ++struct stable_node *stable_subtree_insert(struct tree_node *tree_node, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ struct page **kpage, u32 hash, ++ int *success1, int *success2) ++{ ++ struct page *tree_page; ++ u32 hash_max; ++ struct stable_node *stable_node, *new_snode; ++ struct rb_node *parent, **new; ++ ++research: ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(hash_max, stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ cmp = memcmp_pages(*kpage, tree_page, 1); ++ if (!cmp) { ++ try_merge_with_stable(rmap_item, ++ tree_rmap_item, kpage, ++ tree_page, success1, success2); ++ ++ put_page(tree_page); ++ if (!*success1 && !*success2) ++ goto failed; ++ /* ++ * successfully merged with a stable ++ * node ++ */ ++ return stable_node; ++ } else { ++ put_page(tree_page); ++ goto failed; ++ } ++ } else { ++ /* ++ * stable node may be deleted, ++ * and subtree maybe ++ * restructed, cannot ++ * continue, research it. ++ */ ++ if (tree_node->count) { ++ goto research; ++ } else { ++ /* reuse the tree node*/ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ } ++ } ++ } ++ ++ new_snode = new_stable_node(tree_node, *kpage, hash_max); ++ if (!new_snode) ++ goto failed; ++ ++ rb_link_node(&new_snode->node, parent, new); ++ rb_insert_color(&new_snode->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ ++ return new_snode; ++ ++failed: ++ return NULL; ++} ++ ++ ++/** ++ * stable_tree_insert() - try to insert a merged page in unstable tree to ++ * the stable tree ++ * ++ * @kpage: the page need to be inserted ++ * @hash: the current hash of this page ++ * @rmap_item: the rmap_item being scanned ++ * @tree_rmap_item: the rmap_item found on unstable tree ++ * @success1: return if rmap_item is merged ++ * @success2: return if tree_rmap_item is merged ++ * ++ * @return the stable_node on stable tree if at least one ++ * rmap_item is inserted into stable tree, NULL ++ * otherwise. ++ */ ++static struct stable_node * ++stable_tree_insert(struct page **kpage, u32 hash, ++ struct rmap_item *rmap_item, ++ struct rmap_item *tree_rmap_item, ++ int *success1, int *success2) ++{ ++ struct rb_node **new = &root_stable_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ u32 hash_max = 0; ++ ++ *success1 = *success2 = 0; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ if (tree_node->count == 1) { ++ stable_node = first_level_insert(tree_node, rmap_item, ++ tree_rmap_item, kpage, ++ hash, success1, success2); ++ } else { ++ stable_node = stable_subtree_insert(tree_node, ++ rmap_item, tree_rmap_item, kpage, ++ hash, success1, success2); ++ } ++ } else { ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(stable_tree_node_listp); ++ if (!tree_node) { ++ stable_node = NULL; ++ goto out; ++ } ++ ++ stable_node = new_stable_node(tree_node, *kpage, hash_max); ++ if (!stable_node) { ++ free_tree_node(tree_node); ++ goto out; ++ } ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_stable_treep); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ ++ rb_link_node(&stable_node->node, parent, new); ++ rb_insert_color(&stable_node->node, &tree_node->sub_root); ++ tree_node->count++; ++ *success1 = *success2 = 1; ++ } ++ ++out: ++ return stable_node; ++} ++ ++ ++/** ++ * get_tree_rmap_item_page() - try to get the page and lock the mmap_sem ++ * ++ * @return 0 on success, -EBUSY if unable to lock the mmap_sem, ++ * -EINVAL if the page mapping has been changed. ++ */ ++static inline int get_tree_rmap_item_page(struct rmap_item *tree_rmap_item) ++{ ++ int err; ++ ++ err = get_mergeable_page_lock_mmap(tree_rmap_item); ++ ++ if (err == -EINVAL) { ++ /* its page map has been changed, remove it */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++ ++ /* The page is gotten and mmap_sem is locked now. */ ++ return err; ++} ++ ++ ++/** ++ * unstable_tree_search_insert() - search an unstable tree rmap_item with the ++ * same hash value. Get its page and trylock the mmap_sem ++ */ ++static inline ++struct rmap_item *unstable_tree_search_insert(struct rmap_item *rmap_item, ++ u32 hash) ++ ++{ ++ struct rb_node **new = &root_unstable_tree.rb_node; ++ struct rb_node *parent = NULL; ++ struct tree_node *tree_node; ++ u32 hash_max; ++ struct rmap_item *tree_rmap_item; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* got the tree_node */ ++ if (tree_node->count == 1) { ++ tree_rmap_item = rb_entry(tree_node->sub_root.rb_node, ++ struct rmap_item, node); ++ BUG_ON(!tree_rmap_item); ++ ++ goto get_page_out; ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ BUG_ON(!*new); ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ ++ while (*new) { ++ int cmp; ++ ++ tree_rmap_item = rb_entry(*new, struct rmap_item, ++ node); ++ ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = *new; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto get_page_out; ++ } ++ } else { ++ /* alloc a new tree_node */ ++ tree_node = alloc_tree_node(&unstable_tree_node_list); ++ if (!tree_node) ++ return NULL; ++ ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, &root_unstable_tree); ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++ /* did not found even in sub-tree */ ++ rmap_item->tree_node = tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, &tree_node->sub_root); ++ ++ uksm_pages_unshared++; ++ return NULL; ++ ++get_page_out: ++ if (tree_rmap_item->page == rmap_item->page) ++ return NULL; ++ ++ if (get_tree_rmap_item_page(tree_rmap_item)) ++ return NULL; ++ ++ return tree_rmap_item; ++} ++ ++static void hold_anon_vma(struct rmap_item *rmap_item, ++ struct anon_vma *anon_vma) ++{ ++ rmap_item->anon_vma = anon_vma; ++ get_anon_vma(anon_vma); ++} ++ ++ ++/** ++ * stable_tree_append() - append a rmap_item to a stable node. Deduplication ++ * ratio statistics is done in this function. ++ * ++ */ ++static void stable_tree_append(struct rmap_item *rmap_item, ++ struct stable_node *stable_node, int logdedup) ++{ ++ struct node_vma *node_vma = NULL, *new_node_vma, *node_vma_cont = NULL; ++ unsigned long key = (unsigned long)rmap_item->slot; ++ unsigned long factor = rmap_item->slot->rung->step; ++ ++ BUG_ON(!stable_node); ++ rmap_item->address |= STABLE_FLAG; ++ ++ if (hlist_empty(&stable_node->hlist)) { ++ uksm_pages_shared++; ++ goto node_vma_new; ++ } else { ++ uksm_pages_sharing++; ++ } ++ ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ if (node_vma->key >= key) ++ break; ++ ++ if (logdedup) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ ++ if (node_vma) { ++ if (node_vma->key == key) { ++ node_vma_cont = hlist_entry_safe(node_vma->hlist.next, struct node_vma, hlist); ++ goto node_vma_ok; ++ } else if (node_vma->key > key) { ++ node_vma_cont = node_vma; ++ } ++ } ++ ++node_vma_new: ++ /* no same vma already in node, alloc a new node_vma */ ++ new_node_vma = alloc_node_vma(); ++ BUG_ON(!new_node_vma); ++ new_node_vma->head = stable_node; ++ new_node_vma->slot = rmap_item->slot; ++ ++ if (!node_vma) { ++ hlist_add_head(&new_node_vma->hlist, &stable_node->hlist); ++ } else if (node_vma->key != key) { ++ if (node_vma->key < key) ++ hlist_add_behind(&new_node_vma->hlist, &node_vma->hlist); ++ else { ++ hlist_add_before(&new_node_vma->hlist, ++ &node_vma->hlist); ++ } ++ ++ } ++ node_vma = new_node_vma; ++ ++node_vma_ok: /* ok, ready to add to the list */ ++ rmap_item->head = node_vma; ++ hlist_add_head(&rmap_item->hlist, &node_vma->rmap_hlist); ++ hold_anon_vma(rmap_item, rmap_item->slot->vma->anon_vma); ++ if (logdedup) { ++ rmap_item->slot->pages_merged++; ++ if (node_vma_cont) { ++ node_vma = node_vma_cont; ++ hlist_for_each_entry_continue(node_vma, hlist) { ++ node_vma->slot->pages_bemerged += factor; ++ if (list_empty(&node_vma->slot->dedup_list)) ++ list_add(&node_vma->slot->dedup_list, ++ &vma_slot_dedup); ++ } ++ } ++ } ++} ++ ++/* ++ * We use break_ksm to break COW on a ksm page: it's a stripped down ++ * ++ * if (get_user_pages(addr, 1, 1, 1, &page, NULL) == 1) ++ * put_page(page); ++ * ++ * but taking great care only to touch a ksm page, in a VM_MERGEABLE vma, ++ * in case the application has unmapped and remapped mm,addr meanwhile. ++ * Could a ksm page appear anywhere else? Actually yes, in a VM_PFNMAP ++ * mmap of /dev/mem or /dev/kmem, where we would not want to touch it. ++ */ ++static int break_ksm(struct vm_area_struct *vma, unsigned long addr) ++{ ++ struct page *page; ++ int ret = 0; ++ ++ do { ++ cond_resched(); ++ page = follow_page(vma, addr, FOLL_GET | FOLL_MIGRATION | FOLL_REMOTE); ++ if (IS_ERR_OR_NULL(page)) ++ break; ++ if (PageKsm(page)) { ++ ret = handle_mm_fault(vma, addr, ++ FAULT_FLAG_WRITE | FAULT_FLAG_REMOTE); ++ } else ++ ret = VM_FAULT_WRITE; ++ put_page(page); ++ } while (!(ret & (VM_FAULT_WRITE | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | VM_FAULT_OOM))); ++ /* ++ * We must loop because handle_mm_fault() may back out if there's ++ * any difficulty e.g. if pte accessed bit gets updated concurrently. ++ * ++ * VM_FAULT_WRITE is what we have been hoping for: it indicates that ++ * COW has been broken, even if the vma does not permit VM_WRITE; ++ * but note that a concurrent fault might break PageKsm for us. ++ * ++ * VM_FAULT_SIGBUS could occur if we race with truncation of the ++ * backing file, which also invalidates anonymous pages: that's ++ * okay, that truncation will have unmapped the PageKsm for us. ++ * ++ * VM_FAULT_OOM: at the time of writing (late July 2009), setting ++ * aside mem_cgroup limits, VM_FAULT_OOM would only be set if the ++ * current task has TIF_MEMDIE set, and will be OOM killed on return ++ * to user; and ksmd, having no mm, would never be chosen for that. ++ * ++ * But if the mm is in a limited mem_cgroup, then the fault may fail ++ * with VM_FAULT_OOM even if the current task is not TIF_MEMDIE; and ++ * even ksmd can fail in this way - though it's usually breaking ksm ++ * just to undo a merge it made a moment before, so unlikely to oom. ++ * ++ * That's a pity: we might therefore have more kernel pages allocated ++ * than we're counting as nodes in the stable tree; but uksm_do_scan ++ * will retry to break_cow on each pass, so should recover the page ++ * in due course. The important thing is to not let VM_MERGEABLE ++ * be cleared while any such pages might remain in the area. ++ */ ++ return (ret & VM_FAULT_OOM) ? -ENOMEM : 0; ++} ++ ++static void break_cow(struct rmap_item *rmap_item) ++{ ++ struct vm_area_struct *vma = rmap_item->slot->vma; ++ struct mm_struct *mm = vma->vm_mm; ++ unsigned long addr = get_rmap_addr(rmap_item); ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ break_ksm(vma, addr); ++out: ++ return; ++} ++ ++/* ++ * Though it's very tempting to unmerge in_stable_tree(rmap_item)s rather ++ * than check every pte of a given vma, the locking doesn't quite work for ++ * that - an rmap_item is assigned to the stable tree after inserting ksm ++ * page and upping mmap_sem. Nor does it fit with the way we skip dup'ing ++ * rmap_items from parent to child at fork time (so as not to waste time ++ * if exit comes before the next scan reaches it). ++ * ++ * Similarly, although we'd like to remove rmap_items (so updating counts ++ * and freeing memory) when unmerging an area, it's easier to leave that ++ * to the next pass of ksmd - consider, for example, how ksmd might be ++ * in cmp_and_merge_page on one of the rmap_items we would be removing. ++ */ ++inline int unmerge_uksm_pages(struct vm_area_struct *vma, ++ unsigned long start, unsigned long end) ++{ ++ unsigned long addr; ++ int err = 0; ++ ++ for (addr = start; addr < end && !err; addr += PAGE_SIZE) { ++ if (uksm_test_exit(vma->vm_mm)) ++ break; ++ if (signal_pending(current)) ++ err = -ERESTARTSYS; ++ else ++ err = break_ksm(vma, addr); ++ } ++ return err; ++} ++ ++static inline void inc_uksm_pages_scanned(void) ++{ ++ u64 delta; ++ ++ ++ if (uksm_pages_scanned == U64_MAX) { ++ encode_benefit(); ++ ++ delta = uksm_pages_scanned >> pages_scanned_base; ++ ++ if (CAN_OVERFLOW_U64(pages_scanned_stored, delta)) { ++ pages_scanned_stored >>= 1; ++ delta >>= 1; ++ pages_scanned_base++; ++ } ++ ++ pages_scanned_stored += delta; ++ ++ uksm_pages_scanned = uksm_pages_scanned_last = 0; ++ } ++ ++ uksm_pages_scanned++; ++} ++ ++static inline int find_zero_page_hash(int strength, u32 hash) ++{ ++ return (zero_hash_table[strength] == hash); ++} ++ ++static ++int cmp_and_merge_zero_page(struct vm_area_struct *vma, struct page *page) ++{ ++ struct page *zero_page = empty_uksm_zero_page; ++ struct mm_struct *mm = vma->vm_mm; ++ pte_t orig_pte = __pte(0); ++ int err = -EFAULT; ++ ++ if (uksm_test_exit(mm)) ++ goto out; ++ ++ if (!trylock_page(page)) ++ goto out; ++ ++ if (!PageAnon(page)) ++ goto out_unlock; ++ ++ if (PageTransCompound(page)) { ++ err = split_huge_page(page); ++ if (err) ++ goto out_unlock; ++ } ++ ++ if (write_protect_page(vma, page, &orig_pte, 0) == 0) { ++ if (is_page_full_zero(page)) ++ err = replace_page(vma, page, zero_page, orig_pte); ++ } ++ ++out_unlock: ++ unlock_page(page); ++out: ++ return err; ++} ++ ++/* ++ * cmp_and_merge_page() - first see if page can be merged into the stable ++ * tree; if not, compare hash to previous and if it's the same, see if page ++ * can be inserted into the unstable tree, or merged with a page already there ++ * and both transferred to the stable tree. ++ * ++ * @page: the page that we are searching identical page to. ++ * @rmap_item: the reverse mapping into the virtual address of this page ++ */ ++static void cmp_and_merge_page(struct rmap_item *rmap_item, u32 hash) ++{ ++ struct rmap_item *tree_rmap_item; ++ struct page *page; ++ struct page *kpage = NULL; ++ u32 hash_max; ++ int err; ++ unsigned int success1, success2; ++ struct stable_node *snode; ++ int cmp; ++ struct rb_node *parent = NULL, **new; ++ ++ remove_rmap_item_from_tree(rmap_item); ++ page = rmap_item->page; ++ ++ /* We first start with searching the page inside the stable tree */ ++ kpage = stable_tree_search(rmap_item, hash); ++ if (kpage) { ++ err = try_to_merge_with_uksm_page(rmap_item, kpage, ++ hash); ++ if (!err) { ++ /* ++ * The page was successfully merged, add ++ * its rmap_item to the stable tree. ++ * page lock is needed because it's ++ * racing with try_to_unmap_ksm(), etc. ++ */ ++ lock_page(kpage); ++ snode = page_stable_node(kpage); ++ stable_tree_append(rmap_item, snode, 1); ++ unlock_page(kpage); ++ put_page(kpage); ++ return; /* success */ ++ } ++ put_page(kpage); ++ ++ /* ++ * if it's a collision and it has been search in sub-rbtree ++ * (hash_max != 0), we want to abort, because if it is ++ * successfully merged in unstable tree, the collision trends to ++ * happen again. ++ */ ++ if (err == MERGE_ERR_COLLI && rmap_item->hash_max) ++ return; ++ } ++ ++ tree_rmap_item = ++ unstable_tree_search_insert(rmap_item, hash); ++ if (tree_rmap_item) { ++ err = try_to_merge_two_pages(rmap_item, tree_rmap_item, hash); ++ /* ++ * As soon as we merge this page, we want to remove the ++ * rmap_item of the page we have merged with from the unstable ++ * tree, and insert it instead as new node in the stable tree. ++ */ ++ if (!err) { ++ kpage = page; ++ remove_rmap_item_from_tree(tree_rmap_item); ++ lock_page(kpage); ++ snode = stable_tree_insert(&kpage, hash, ++ rmap_item, tree_rmap_item, ++ &success1, &success2); ++ ++ /* ++ * Do not log dedup for tree item, it's not counted as ++ * scanned in this round. ++ */ ++ if (success2) ++ stable_tree_append(tree_rmap_item, snode, 0); ++ ++ /* ++ * The order of these two stable append is important: ++ * we are scanning rmap_item. ++ */ ++ if (success1) ++ stable_tree_append(rmap_item, snode, 1); ++ ++ /* ++ * The original kpage may be unlocked inside ++ * stable_tree_insert() already. This page ++ * should be unlocked before doing ++ * break_cow(). ++ */ ++ unlock_page(kpage); ++ ++ if (!success1) ++ break_cow(rmap_item); ++ ++ if (!success2) ++ break_cow(tree_rmap_item); ++ ++ } else if (err == MERGE_ERR_COLLI) { ++ BUG_ON(tree_rmap_item->tree_node->count > 1); ++ ++ rmap_item_hash_max(tree_rmap_item, ++ tree_rmap_item->tree_node->hash); ++ ++ hash_max = rmap_item_hash_max(rmap_item, hash); ++ cmp = hash_cmp(hash_max, tree_rmap_item->hash_max); ++ parent = &tree_rmap_item->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto put_up_out; ++ ++ rmap_item->tree_node = tree_rmap_item->tree_node; ++ rmap_item->address |= UNSTABLE_FLAG; ++ rmap_item->hash_round = uksm_hash_round; ++ rb_link_node(&rmap_item->node, parent, new); ++ rb_insert_color(&rmap_item->node, ++ &tree_rmap_item->tree_node->sub_root); ++ rmap_item->tree_node->count++; ++ } else { ++ /* ++ * either one of the page has changed or they collide ++ * at the max hash, we consider them as ill items. ++ */ ++ remove_rmap_item_from_tree(tree_rmap_item); ++ } ++put_up_out: ++ put_page(tree_rmap_item->page); ++ up_read(&tree_rmap_item->slot->vma->vm_mm->mmap_sem); ++ } ++} ++ ++ ++ ++ ++static inline unsigned long get_pool_index(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = (sizeof(struct rmap_list_entry *) * index) >> PAGE_SHIFT; ++ if (pool_index >= slot->pool_size) ++ BUG(); ++ return pool_index; ++} ++ ++static inline unsigned long index_page_offset(unsigned long index) ++{ ++ return offset_in_page(sizeof(struct rmap_list_entry *) * index); ++} ++ ++static inline ++struct rmap_list_entry *get_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index, int need_alloc) ++{ ++ unsigned long pool_index; ++ struct page *page; ++ void *addr; ++ ++ ++ pool_index = get_pool_index(slot, index); ++ if (!slot->rmap_list_pool[pool_index]) { ++ if (!need_alloc) ++ return NULL; ++ ++ page = alloc_page(GFP_KERNEL | __GFP_ZERO | __GFP_NOWARN); ++ if (!page) ++ return NULL; ++ ++ slot->rmap_list_pool[pool_index] = page; ++ } ++ ++ addr = kmap(slot->rmap_list_pool[pool_index]); ++ addr += index_page_offset(index); ++ ++ return addr; ++} ++ ++static inline void put_rmap_list_entry(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ kunmap(slot->rmap_list_pool[pool_index]); ++} ++ ++static inline int entry_is_new(struct rmap_list_entry *entry) ++{ ++ return !entry->item; ++} ++ ++static inline unsigned long get_index_orig_addr(struct vma_slot *slot, ++ unsigned long index) ++{ ++ return slot->vma->vm_start + (index << PAGE_SHIFT); ++} ++ ++static inline unsigned long get_entry_address(struct rmap_list_entry *entry) ++{ ++ unsigned long addr; ++ ++ if (is_addr(entry->addr)) ++ addr = get_clean_addr(entry->addr); ++ else if (entry->item) ++ addr = get_rmap_addr(entry->item); ++ else ++ BUG(); ++ ++ return addr; ++} ++ ++static inline struct rmap_item *get_entry_item(struct rmap_list_entry *entry) ++{ ++ if (is_addr(entry->addr)) ++ return NULL; ++ ++ return entry->item; ++} ++ ++static inline void inc_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ slot->pool_counts[pool_index]++; ++} ++ ++static inline void dec_rmap_list_pool_count(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ BUG_ON(!slot->rmap_list_pool[pool_index]); ++ BUG_ON(!slot->pool_counts[pool_index]); ++ slot->pool_counts[pool_index]--; ++} ++ ++static inline int entry_has_rmap(struct rmap_list_entry *entry) ++{ ++ return !is_addr(entry->addr) && entry->item; ++} ++ ++static inline void swap_entries(struct rmap_list_entry *entry1, ++ unsigned long index1, ++ struct rmap_list_entry *entry2, ++ unsigned long index2) ++{ ++ struct rmap_list_entry tmp; ++ ++ /* swapping two new entries is meaningless */ ++ BUG_ON(entry_is_new(entry1) && entry_is_new(entry2)); ++ ++ tmp = *entry1; ++ *entry1 = *entry2; ++ *entry2 = tmp; ++ ++ if (entry_has_rmap(entry1)) ++ entry1->item->entry_index = index1; ++ ++ if (entry_has_rmap(entry2)) ++ entry2->item->entry_index = index2; ++ ++ if (entry_has_rmap(entry1) && !entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry1->item->slot, index1); ++ dec_rmap_list_pool_count(entry1->item->slot, index2); ++ } else if (!entry_has_rmap(entry1) && entry_has_rmap(entry2)) { ++ inc_rmap_list_pool_count(entry2->item->slot, index2); ++ dec_rmap_list_pool_count(entry2->item->slot, index1); ++ } ++} ++ ++static inline void free_entry_item(struct rmap_list_entry *entry) ++{ ++ unsigned long index; ++ struct rmap_item *item; ++ ++ if (!is_addr(entry->addr)) { ++ BUG_ON(!entry->item); ++ item = entry->item; ++ entry->addr = get_rmap_addr(item); ++ set_is_addr(entry->addr); ++ index = item->entry_index; ++ remove_rmap_item_from_tree(item); ++ dec_rmap_list_pool_count(item->slot, index); ++ free_rmap_item(item); ++ } ++} ++ ++static inline int pool_entry_boundary(unsigned long index) ++{ ++ unsigned long linear_addr; ++ ++ linear_addr = sizeof(struct rmap_list_entry *) * index; ++ return index && !offset_in_page(linear_addr); ++} ++ ++static inline void try_free_last_pool(struct vma_slot *slot, ++ unsigned long index) ++{ ++ unsigned long pool_index; ++ ++ pool_index = get_pool_index(slot, index); ++ if (slot->rmap_list_pool[pool_index] && ++ !slot->pool_counts[pool_index]) { ++ __free_page(slot->rmap_list_pool[pool_index]); ++ slot->rmap_list_pool[pool_index] = NULL; ++ slot->flags |= UKSM_SLOT_NEED_SORT; ++ } ++ ++} ++ ++static inline unsigned long vma_item_index(struct vm_area_struct *vma, ++ struct rmap_item *item) ++{ ++ return (get_rmap_addr(item) - vma->vm_start) >> PAGE_SHIFT; ++} ++ ++static int within_same_pool(struct vma_slot *slot, ++ unsigned long i, unsigned long j) ++{ ++ unsigned long pool_i, pool_j; ++ ++ pool_i = get_pool_index(slot, i); ++ pool_j = get_pool_index(slot, j); ++ ++ return (pool_i == pool_j); ++} ++ ++static void sort_rmap_entry_list(struct vma_slot *slot) ++{ ++ unsigned long i, j; ++ struct rmap_list_entry *entry, *swap_entry; ++ ++ entry = get_rmap_list_entry(slot, 0, 0); ++ for (i = 0; i < slot->pages; ) { ++ ++ if (!entry) ++ goto skip_whole_pool; ++ ++ if (entry_is_new(entry)) ++ goto next_entry; ++ ++ if (is_addr(entry->addr)) { ++ entry->addr = 0; ++ goto next_entry; ++ } ++ ++ j = vma_item_index(slot->vma, entry->item); ++ if (j == i) ++ goto next_entry; ++ ++ if (within_same_pool(slot, i, j)) ++ swap_entry = entry + j - i; ++ else ++ swap_entry = get_rmap_list_entry(slot, j, 1); ++ ++ swap_entries(entry, i, swap_entry, j); ++ if (!within_same_pool(slot, i, j)) ++ put_rmap_list_entry(slot, j); ++ continue; ++ ++skip_whole_pool: ++ i += PAGE_SIZE / sizeof(*entry); ++ if (i < slot->pages) ++ entry = get_rmap_list_entry(slot, i, 0); ++ continue; ++ ++next_entry: ++ if (i >= slot->pages - 1 || ++ !within_same_pool(slot, i, i + 1)) { ++ put_rmap_list_entry(slot, i); ++ if (i + 1 < slot->pages) ++ entry = get_rmap_list_entry(slot, i + 1, 0); ++ } else ++ entry++; ++ i++; ++ continue; ++ } ++ ++ /* free empty pool entries which contain no rmap_item */ ++ /* CAN be simplied to based on only pool_counts when bug freed !!!!! */ ++ for (i = 0; i < slot->pool_size; i++) { ++ unsigned char has_rmap; ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ has_rmap = 0; ++ addr = kmap(slot->rmap_list_pool[i]); ++ BUG_ON(!addr); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ has_rmap = 1; ++ } ++ kunmap(slot->rmap_list_pool[i]); ++ if (!has_rmap) { ++ BUG_ON(slot->pool_counts[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ slot->rmap_list_pool[i] = NULL; ++ } ++ } ++ ++ slot->flags &= ~UKSM_SLOT_NEED_SORT; ++} ++ ++/* ++ * vma_fully_scanned() - if all the pages in this slot have been scanned. ++ */ ++static inline int vma_fully_scanned(struct vma_slot *slot) ++{ ++ return slot->pages_scanned == slot->pages; ++} ++ ++/** ++ * get_next_rmap_item() - Get the next rmap_item in a vma_slot according to ++ * its random permutation. This function is embedded with the random ++ * permutation index management code. ++ */ ++static struct rmap_item *get_next_rmap_item(struct vma_slot *slot, u32 *hash) ++{ ++ unsigned long rand_range, addr, swap_index, scan_index; ++ struct rmap_item *item = NULL; ++ struct rmap_list_entry *scan_entry, *swap_entry = NULL; ++ struct page *page; ++ ++ scan_index = swap_index = slot->pages_scanned % slot->pages; ++ ++ if (pool_entry_boundary(scan_index)) ++ try_free_last_pool(slot, scan_index - 1); ++ ++ if (vma_fully_scanned(slot)) { ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ slot->flags |= UKSM_SLOT_NEED_RERAND; ++ else ++ slot->flags &= ~UKSM_SLOT_NEED_RERAND; ++ if (slot->flags & UKSM_SLOT_NEED_SORT) ++ sort_rmap_entry_list(slot); ++ } ++ ++ scan_entry = get_rmap_list_entry(slot, scan_index, 1); ++ if (!scan_entry) ++ return NULL; ++ ++ if (entry_is_new(scan_entry)) { ++ scan_entry->addr = get_index_orig_addr(slot, scan_index); ++ set_is_addr(scan_entry->addr); ++ } ++ ++ if (slot->flags & UKSM_SLOT_NEED_RERAND) { ++ rand_range = slot->pages - scan_index; ++ BUG_ON(!rand_range); ++ swap_index = scan_index + (prandom_u32() % rand_range); ++ } ++ ++ if (swap_index != scan_index) { ++ swap_entry = get_rmap_list_entry(slot, swap_index, 1); ++ if (entry_is_new(swap_entry)) { ++ swap_entry->addr = get_index_orig_addr(slot, ++ swap_index); ++ set_is_addr(swap_entry->addr); ++ } ++ swap_entries(scan_entry, scan_index, swap_entry, swap_index); ++ } ++ ++ addr = get_entry_address(scan_entry); ++ item = get_entry_item(scan_entry); ++ BUG_ON(addr > slot->vma->vm_end || addr < slot->vma->vm_start); ++ ++ page = follow_page(slot->vma, addr, FOLL_GET); ++ if (IS_ERR_OR_NULL(page)) ++ goto nopage; ++ ++ if (!PageAnon(page)) ++ goto putpage; ++ ++ /*check is zero_page pfn or uksm_zero_page*/ ++ if ((page_to_pfn(page) == zero_pfn) ++ || (page_to_pfn(page) == uksm_zero_pfn)) ++ goto putpage; ++ ++ flush_anon_page(slot->vma, page, addr); ++ flush_dcache_page(page); ++ ++ ++ *hash = page_hash(page, hash_strength, 1); ++ inc_uksm_pages_scanned(); ++ /*if the page content all zero, re-map to zero-page*/ ++ if (find_zero_page_hash(hash_strength, *hash)) { ++ if (!cmp_and_merge_zero_page(slot->vma, page)) { ++ slot->pages_merged++; ++ ++ /* For full-zero pages, no need to create rmap item */ ++ goto putpage; ++ } else { ++ inc_rshash_neg(memcmp_cost / 2); ++ } ++ } ++ ++ if (!item) { ++ item = alloc_rmap_item(); ++ if (item) { ++ /* It has already been zeroed */ ++ item->slot = slot; ++ item->address = addr; ++ item->entry_index = scan_index; ++ scan_entry->item = item; ++ inc_rmap_list_pool_count(slot, scan_index); ++ } else ++ goto putpage; ++ } ++ ++ BUG_ON(item->slot != slot); ++ /* the page may have changed */ ++ item->page = page; ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return item; ++ ++putpage: ++ put_page(page); ++ page = NULL; ++nopage: ++ /* no page, store addr back and free rmap_item if possible */ ++ free_entry_item(scan_entry); ++ put_rmap_list_entry(slot, scan_index); ++ if (swap_entry) ++ put_rmap_list_entry(slot, swap_index); ++ return NULL; ++} ++ ++static inline int in_stable_tree(struct rmap_item *rmap_item) ++{ ++ return rmap_item->address & STABLE_FLAG; ++} ++ ++/** ++ * scan_vma_one_page() - scan the next page in a vma_slot. Called with ++ * mmap_sem locked. ++ */ ++static noinline void scan_vma_one_page(struct vma_slot *slot) ++{ ++ u32 hash; ++ struct mm_struct *mm; ++ struct rmap_item *rmap_item = NULL; ++ struct vm_area_struct *vma = slot->vma; ++ ++ mm = vma->vm_mm; ++ BUG_ON(!mm); ++ BUG_ON(!slot); ++ ++ rmap_item = get_next_rmap_item(slot, &hash); ++ if (!rmap_item) ++ goto out1; ++ ++ if (PageKsm(rmap_item->page) && in_stable_tree(rmap_item)) ++ goto out2; ++ ++ cmp_and_merge_page(rmap_item, hash); ++out2: ++ put_page(rmap_item->page); ++out1: ++ slot->pages_scanned++; ++ slot->this_sampled++; ++ if (slot->fully_scanned_round != fully_scanned_round) ++ scanned_virtual_pages++; ++ ++ if (vma_fully_scanned(slot)) ++ slot->fully_scanned_round = fully_scanned_round; ++} ++ ++static inline unsigned long rung_get_pages(struct scan_rung *rung) ++{ ++ struct slot_tree_node *node; ++ ++ if (!rung->vma_root.rnode) ++ return 0; ++ ++ node = container_of(rung->vma_root.rnode, struct slot_tree_node, snode); ++ ++ return node->size; ++} ++ ++#define RUNG_SAMPLED_MIN 3 ++ ++static inline ++void uksm_calc_rung_step(struct scan_rung *rung, ++ unsigned long page_time, unsigned long ratio) ++{ ++ unsigned long sampled, pages; ++ ++ /* will be fully scanned ? */ ++ if (!rung->cover_msecs) { ++ rung->step = 1; ++ return; ++ } ++ ++ sampled = rung->cover_msecs * (NSEC_PER_MSEC / TIME_RATIO_SCALE) ++ * ratio / page_time; ++ ++ /* ++ * Before we finsish a scan round and expensive per-round jobs, ++ * we need to have a chance to estimate the per page time. So ++ * the sampled number can not be too small. ++ */ ++ if (sampled < RUNG_SAMPLED_MIN) ++ sampled = RUNG_SAMPLED_MIN; ++ ++ pages = rung_get_pages(rung); ++ if (likely(pages > sampled)) ++ rung->step = pages / sampled; ++ else ++ rung->step = 1; ++} ++ ++static inline int step_need_recalc(struct scan_rung *rung) ++{ ++ unsigned long pages, stepmax; ++ ++ pages = rung_get_pages(rung); ++ stepmax = pages / RUNG_SAMPLED_MIN; ++ ++ return pages && (rung->step > pages || ++ (stepmax && rung->step > stepmax)); ++} ++ ++static inline ++void reset_current_scan(struct scan_rung *rung, int finished, int step_recalc) ++{ ++ struct vma_slot *slot; ++ ++ if (finished) ++ rung->flags |= UKSM_RUNG_ROUND_FINISHED; ++ ++ if (step_recalc || step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ slot_iter_index = prandom_u32() % rung->step; ++ BUG_ON(!rung->vma_root.rnode); ++ slot = sradix_tree_next(&rung->vma_root, NULL, 0, slot_iter); ++ BUG_ON(!slot); ++ ++ rung->current_scan = slot; ++ rung->current_offset = slot_iter_index; ++} ++ ++static inline struct sradix_tree_root *slot_get_root(struct vma_slot *slot) ++{ ++ return &slot->rung->vma_root; ++} ++ ++/* ++ * return if resetted. ++ */ ++static int advance_current_scan(struct scan_rung *rung) ++{ ++ unsigned short n; ++ struct vma_slot *slot, *next = NULL; ++ ++ BUG_ON(!rung->vma_root.num); ++ ++ slot = rung->current_scan; ++ n = (slot->pages - rung->current_offset) % rung->step; ++ slot_iter_index = rung->step - n; ++ next = sradix_tree_next(&rung->vma_root, slot->snode, ++ slot->sindex, slot_iter); ++ ++ if (next) { ++ rung->current_offset = slot_iter_index; ++ rung->current_scan = next; ++ return 0; ++ } else { ++ reset_current_scan(rung, 1, 0); ++ return 1; ++ } ++} ++ ++static inline void rung_rm_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ struct sradix_tree_root *root; ++ ++ if (rung->current_scan == slot) ++ advance_current_scan(rung); ++ ++ root = slot_get_root(slot); ++ sradix_tree_delete_from_leaf(root, slot->snode, slot->sindex); ++ slot->snode = NULL; ++ if (step_need_recalc(rung)) { ++ uksm_calc_rung_step(rung, uksm_ema_page_time, rung->cpu_ratio); ++ BUG_ON(step_need_recalc(rung)); ++ } ++ ++ /* In case advance_current_scan loop back to this slot again */ ++ if (rung->vma_root.num && rung->current_scan == slot) ++ reset_current_scan(slot->rung, 1, 0); ++} ++ ++static inline void rung_add_new_slots(struct scan_rung *rung, ++ struct vma_slot **slots, unsigned long num) ++{ ++ int err; ++ struct vma_slot *slot; ++ unsigned long i; ++ struct sradix_tree_root *root = &rung->vma_root; ++ ++ err = sradix_tree_enter(root, (void **)slots, num); ++ BUG_ON(err); ++ ++ for (i = 0; i < num; i++) { ++ slot = slots[i]; ++ slot->rung = rung; ++ BUG_ON(vma_fully_scanned(slot)); ++ } ++ ++ if (rung->vma_root.num == num) ++ reset_current_scan(rung, 0, 1); ++} ++ ++static inline int rung_add_one_slot(struct scan_rung *rung, ++ struct vma_slot *slot) ++{ ++ int err; ++ ++ err = sradix_tree_enter(&rung->vma_root, (void **)&slot, 1); ++ if (err) ++ return err; ++ ++ slot->rung = rung; ++ if (rung->vma_root.num == 1) ++ reset_current_scan(rung, 0, 1); ++ ++ return 0; ++} ++ ++/* ++ * Return true if the slot is deleted from its rung. ++ */ ++static inline int vma_rung_enter(struct vma_slot *slot, struct scan_rung *rung) ++{ ++ struct scan_rung *old_rung = slot->rung; ++ int err; ++ ++ if (old_rung == rung) ++ return 0; ++ ++ rung_rm_slot(slot); ++ err = rung_add_one_slot(rung, slot); ++ if (err) { ++ err = rung_add_one_slot(old_rung, slot); ++ WARN_ON(err); /* OOPS, badly OOM, we lost this slot */ ++ } ++ ++ return 1; ++} ++ ++static inline int vma_rung_up(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[SCAN_LADDER_SIZE-1]) ++ rung++; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++static inline int vma_rung_down(struct vma_slot *slot) ++{ ++ struct scan_rung *rung; ++ ++ rung = slot->rung; ++ if (slot->rung != &uksm_scan_ladder[0]) ++ rung--; ++ ++ return vma_rung_enter(slot, rung); ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->this_sampled; ++ if (!pages) ++ return 0; ++ ++ BUG_ON(slot->pages_scanned == slot->last_scanned); ++ ++ ret = slot->pages_merged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_merged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_merged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * cal_dedup_ratio() - Calculate the deduplication ratio for this slot. ++ */ ++static unsigned long cal_dedup_ratio_old(struct vma_slot *slot) ++{ ++ unsigned long ret; ++ unsigned long pages; ++ ++ pages = slot->pages; ++ if (!pages) ++ return 0; ++ ++ ret = slot->pages_bemerged; ++ ++ /* Thrashing area filtering */ ++ if (ret && uksm_thrash_threshold) { ++ if (slot->pages_cowed * 100 / slot->pages_bemerged ++ > uksm_thrash_threshold) { ++ ret = 0; ++ } else { ++ ret = slot->pages_bemerged - slot->pages_cowed; ++ } ++ } ++ ++ return ret * 100 / pages; ++} ++ ++/** ++ * stable_node_reinsert() - When the hash_strength has been adjusted, the ++ * stable tree need to be restructured, this is the function re-inserting the ++ * stable node. ++ */ ++static inline void stable_node_reinsert(struct stable_node *new_node, ++ struct page *page, ++ struct rb_root *root_treep, ++ struct list_head *tree_node_listp, ++ u32 hash) ++{ ++ struct rb_node **new = &root_treep->rb_node; ++ struct rb_node *parent = NULL; ++ struct stable_node *stable_node; ++ struct tree_node *tree_node; ++ struct page *tree_page; ++ int cmp; ++ ++ while (*new) { ++ int cmp; ++ ++ tree_node = rb_entry(*new, struct tree_node, node); ++ ++ cmp = hash_cmp(hash, tree_node->hash); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else ++ break; ++ } ++ ++ if (*new) { ++ /* find a stable tree node with same first level hash value */ ++ stable_node_hash_max(new_node, page, hash); ++ if (tree_node->count == 1) { ++ stable_node = rb_entry(tree_node->sub_root.rb_node, ++ struct stable_node, node); ++ tree_page = get_uksm_page(stable_node, 1, 0); ++ if (tree_page) { ++ stable_node_hash_max(stable_node, ++ tree_page, hash); ++ put_page(tree_page); ++ ++ /* prepare for stable node insertion */ ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ parent = &stable_node->node; ++ if (cmp < 0) ++ new = &parent->rb_left; ++ else if (cmp > 0) ++ new = &parent->rb_right; ++ else ++ goto failed; ++ ++ goto add_node; ++ } else { ++ /* the only stable_node deleted, the tree node ++ * was not deleted. ++ */ ++ goto tree_node_reuse; ++ } ++ } ++ ++ /* well, search the collision subtree */ ++ new = &tree_node->sub_root.rb_node; ++ parent = NULL; ++ BUG_ON(!*new); ++ while (*new) { ++ int cmp; ++ ++ stable_node = rb_entry(*new, struct stable_node, node); ++ ++ cmp = hash_cmp(new_node->hash_max, ++ stable_node->hash_max); ++ ++ if (cmp < 0) { ++ parent = *new; ++ new = &parent->rb_left; ++ } else if (cmp > 0) { ++ parent = *new; ++ new = &parent->rb_right; ++ } else { ++ /* oh, no, still a collision */ ++ goto failed; ++ } ++ } ++ ++ goto add_node; ++ } ++ ++ /* no tree node found */ ++ tree_node = alloc_tree_node(tree_node_listp); ++ if (!tree_node) { ++ pr_err("UKSM: memory allocation error!\n"); ++ goto failed; ++ } else { ++ tree_node->hash = hash; ++ rb_link_node(&tree_node->node, parent, new); ++ rb_insert_color(&tree_node->node, root_treep); ++ ++tree_node_reuse: ++ /* prepare for stable node insertion */ ++ parent = NULL; ++ new = &tree_node->sub_root.rb_node; ++ } ++ ++add_node: ++ rb_link_node(&new_node->node, parent, new); ++ rb_insert_color(&new_node->node, &tree_node->sub_root); ++ new_node->tree_node = tree_node; ++ tree_node->count++; ++ return; ++ ++failed: ++ /* This can only happen when two nodes have collided ++ * in two levels. ++ */ ++ new_node->tree_node = NULL; ++ return; ++} ++ ++static inline void free_all_tree_nodes(struct list_head *list) ++{ ++ struct tree_node *node, *tmp; ++ ++ list_for_each_entry_safe(node, tmp, list, all_list) { ++ free_tree_node(node); ++ } ++} ++ ++/** ++ * stable_tree_delta_hash() - Delta hash the stable tree from previous hash ++ * strength to the current hash_strength. It re-structures the hole tree. ++ */ ++static inline void stable_tree_delta_hash(u32 prev_hash_strength) ++{ ++ struct stable_node *node, *tmp; ++ struct rb_root *root_new_treep; ++ struct list_head *new_tree_node_listp; ++ ++ stable_tree_index = (stable_tree_index + 1) % 2; ++ root_new_treep = &root_stable_tree[stable_tree_index]; ++ new_tree_node_listp = &stable_tree_node_list[stable_tree_index]; ++ *root_new_treep = RB_ROOT; ++ BUG_ON(!list_empty(new_tree_node_listp)); ++ ++ /* ++ * we need to be safe, the node could be removed by get_uksm_page() ++ */ ++ list_for_each_entry_safe(node, tmp, &stable_node_list, all_list) { ++ void *addr; ++ struct page *node_page; ++ u32 hash; ++ ++ /* ++ * We are completely re-structuring the stable nodes to a new ++ * stable tree. We don't want to touch the old tree unlinks and ++ * old tree_nodes. The old tree_nodes will be freed at once. ++ */ ++ node_page = get_uksm_page(node, 0, 0); ++ if (!node_page) ++ continue; ++ ++ if (node->tree_node) { ++ hash = node->tree_node->hash; ++ ++ addr = kmap_atomic(node_page); ++ ++ hash = delta_hash(addr, prev_hash_strength, ++ hash_strength, hash); ++ kunmap_atomic(addr); ++ } else { ++ /* ++ *it was not inserted to rbtree due to collision in last ++ *round scan. ++ */ ++ hash = page_hash(node_page, hash_strength, 0); ++ } ++ ++ stable_node_reinsert(node, node_page, root_new_treep, ++ new_tree_node_listp, hash); ++ put_page(node_page); ++ } ++ ++ root_stable_treep = root_new_treep; ++ free_all_tree_nodes(stable_tree_node_listp); ++ BUG_ON(!list_empty(stable_tree_node_listp)); ++ stable_tree_node_listp = new_tree_node_listp; ++} ++ ++static inline void inc_hash_strength(unsigned long delta) ++{ ++ hash_strength += 1 << delta; ++ if (hash_strength > HASH_STRENGTH_MAX) ++ hash_strength = HASH_STRENGTH_MAX; ++} ++ ++static inline void dec_hash_strength(unsigned long delta) ++{ ++ unsigned long change = 1 << delta; ++ ++ if (hash_strength <= change + 1) ++ hash_strength = 1; ++ else ++ hash_strength -= change; ++} ++ ++static inline void inc_hash_strength_delta(void) ++{ ++ hash_strength_delta++; ++ if (hash_strength_delta > HASH_STRENGTH_DELTA_MAX) ++ hash_strength_delta = HASH_STRENGTH_DELTA_MAX; ++} ++ ++static inline unsigned long get_current_neg_ratio(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ ++ if (!neg) ++ return 0; ++ ++ if (!pos || neg > pos) ++ return 100; ++ ++ if (neg > div64_u64(U64_MAX, 100)) ++ pos = div64_u64(pos, 100); ++ else ++ neg *= 100; ++ ++ return div64_u64(neg, pos); ++} ++ ++static inline unsigned long get_current_benefit(void) ++{ ++ u64 pos = benefit.pos; ++ u64 neg = benefit.neg; ++ u64 scanned = benefit.scanned; ++ ++ if (neg > pos) ++ return 0; ++ ++ return div64_u64((pos - neg), scanned); ++} ++ ++static inline int judge_rshash_direction(void) ++{ ++ u64 current_neg_ratio, stable_benefit; ++ u64 current_benefit, delta = 0; ++ int ret = STILL; ++ ++ /* ++ * Try to probe a value after the boot, and in case the system ++ * are still for a long time. ++ */ ++ if ((fully_scanned_round & 0xFFULL) == 10) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ current_neg_ratio = get_current_neg_ratio(); ++ ++ if (current_neg_ratio == 0) { ++ rshash_neg_cont_zero++; ++ if (rshash_neg_cont_zero > 2) ++ return GO_DOWN; ++ else ++ return STILL; ++ } ++ rshash_neg_cont_zero = 0; ++ ++ if (current_neg_ratio > 90) { ++ ret = GO_UP; ++ goto out; ++ } ++ ++ current_benefit = get_current_benefit(); ++ stable_benefit = rshash_state.stable_benefit; ++ ++ if (!stable_benefit) { ++ ret = OBSCURE; ++ goto out; ++ } ++ ++ if (current_benefit > stable_benefit) ++ delta = current_benefit - stable_benefit; ++ else if (current_benefit < stable_benefit) ++ delta = stable_benefit - current_benefit; ++ ++ delta = div64_u64(100 * delta, stable_benefit); ++ ++ if (delta > 50) { ++ rshash_cont_obscure++; ++ if (rshash_cont_obscure > 2) ++ return OBSCURE; ++ else ++ return STILL; ++ } ++ ++out: ++ rshash_cont_obscure = 0; ++ return ret; ++} ++ ++/** ++ * rshash_adjust() - The main function to control the random sampling state ++ * machine for hash strength adapting. ++ * ++ * return true if hash_strength has changed. ++ */ ++static inline int rshash_adjust(void) ++{ ++ unsigned long prev_hash_strength = hash_strength; ++ ++ if (!encode_benefit()) ++ return 0; ++ ++ switch (rshash_state.state) { ++ case RSHASH_STILL: ++ switch (judge_rshash_direction()) { ++ case GO_UP: ++ if (rshash_state.pre_direct == GO_DOWN) ++ hash_strength_delta = 0; ++ ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_UP; ++ break; ++ ++ case GO_DOWN: ++ if (rshash_state.pre_direct == GO_UP) ++ hash_strength_delta = 0; ++ ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.pre_direct = GO_DOWN; ++ break; ++ ++ case OBSCURE: ++ rshash_state.stable_point = hash_strength; ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYDOWN; ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ break; ++ ++ case STILL: ++ break; ++ default: ++ BUG(); ++ } ++ break; ++ ++ case RSHASH_TRYDOWN: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.stable_benefit) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > ++ rshash_state.turn_benefit_down) { ++ rshash_state.turn_point_down = hash_strength; ++ rshash_state.turn_benefit_down = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_UP || ++ hash_strength == 1) { ++ hash_strength = rshash_state.stable_point; ++ hash_strength_delta = 0; ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ rshash_state.lookup_window_index = 0; ++ rshash_state.state = RSHASH_TRYUP; ++ hash_strength_delta = 0; ++ } else { ++ dec_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ break; ++ ++ case RSHASH_TRYUP: ++ if (rshash_state.lookup_window_index++ % 5 == 0) ++ rshash_state.below_count = 0; ++ ++ if (get_current_benefit() < rshash_state.turn_benefit_down) ++ rshash_state.below_count++; ++ else if (get_current_benefit() > rshash_state.turn_benefit_up) { ++ rshash_state.turn_point_up = hash_strength; ++ rshash_state.turn_benefit_up = get_current_benefit(); ++ } ++ ++ if (rshash_state.below_count >= 3 || ++ judge_rshash_direction() == GO_DOWN || ++ hash_strength == HASH_STRENGTH_MAX) { ++ hash_strength = rshash_state.turn_benefit_up > ++ rshash_state.turn_benefit_down ? ++ rshash_state.turn_point_up : ++ rshash_state.turn_point_down; ++ ++ rshash_state.state = RSHASH_PRE_STILL; ++ } else { ++ inc_hash_strength(hash_strength_delta); ++ inc_hash_strength_delta(); ++ } ++ ++ break; ++ ++ case RSHASH_NEW: ++ case RSHASH_PRE_STILL: ++ rshash_state.stable_benefit = get_current_benefit(); ++ rshash_state.state = RSHASH_STILL; ++ hash_strength_delta = 0; ++ break; ++ default: ++ BUG(); ++ } ++ ++ /* rshash_neg = rshash_pos = 0; */ ++ reset_benefit(); ++ ++ if (prev_hash_strength != hash_strength) ++ stable_tree_delta_hash(prev_hash_strength); ++ ++ return prev_hash_strength != hash_strength; ++} ++ ++/** ++ * round_update_ladder() - The main function to do update of all the ++ * adjustments whenever a scan round is finished. ++ */ ++static noinline void round_update_ladder(void) ++{ ++ int i; ++ unsigned long dedup; ++ struct vma_slot *slot, *tmp_slot; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) ++ uksm_scan_ladder[i].flags &= ~UKSM_RUNG_ROUND_FINISHED; ++ ++ list_for_each_entry_safe(slot, tmp_slot, &vma_slot_dedup, dedup_list) { ++ ++ /* slot may be rung_rm_slot() when mm exits */ ++ if (slot->snode) { ++ dedup = cal_dedup_ratio_old(slot); ++ if (dedup && dedup >= uksm_abundant_threshold) ++ vma_rung_up(slot); ++ } ++ ++ slot->pages_bemerged = 0; ++ slot->pages_cowed = 0; ++ ++ list_del_init(&slot->dedup_list); ++ } ++} ++ ++static void uksm_del_vma_slot(struct vma_slot *slot) ++{ ++ int i, j; ++ struct rmap_list_entry *entry; ++ ++ if (slot->snode) { ++ /* ++ * In case it just failed when entering the rung, it's not ++ * necessary. ++ */ ++ rung_rm_slot(slot); ++ } ++ ++ if (!list_empty(&slot->dedup_list)) ++ list_del(&slot->dedup_list); ++ ++ if (!slot->rmap_list_pool || !slot->pool_counts) { ++ /* In case it OOMed in uksm_vma_enter() */ ++ goto out; ++ } ++ ++ for (i = 0; i < slot->pool_size; i++) { ++ void *addr; ++ ++ if (!slot->rmap_list_pool[i]) ++ continue; ++ ++ addr = kmap(slot->rmap_list_pool[i]); ++ for (j = 0; j < PAGE_SIZE / sizeof(*entry); j++) { ++ entry = (struct rmap_list_entry *)addr + j; ++ if (is_addr(entry->addr)) ++ continue; ++ if (!entry->item) ++ continue; ++ ++ remove_rmap_item_from_tree(entry->item); ++ free_rmap_item(entry->item); ++ slot->pool_counts[i]--; ++ } ++ BUG_ON(slot->pool_counts[i]); ++ kunmap(slot->rmap_list_pool[i]); ++ __free_page(slot->rmap_list_pool[i]); ++ } ++ kfree(slot->rmap_list_pool); ++ kfree(slot->pool_counts); ++ ++out: ++ slot->rung = NULL; ++ if (slot->flags & UKSM_SLOT_IN_UKSM) { ++ BUG_ON(uksm_pages_total < slot->pages); ++ uksm_pages_total -= slot->pages; ++ } ++ ++ if (slot->fully_scanned_round == fully_scanned_round) ++ scanned_virtual_pages -= slot->pages; ++ else ++ scanned_virtual_pages -= slot->pages_scanned; ++ free_vma_slot(slot); ++} ++ ++ ++#define SPIN_LOCK_PERIOD 32 ++static struct vma_slot *cleanup_slots[SPIN_LOCK_PERIOD]; ++static inline void cleanup_vma_slots(void) ++{ ++ struct vma_slot *slot; ++ int i; ++ ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_del)) { ++ slot = list_entry(vma_slot_del.next, ++ struct vma_slot, slot_list); ++ list_del(&slot->slot_list); ++ cleanup_slots[i++] = slot; ++ if (i == SPIN_LOCK_PERIOD) { ++ spin_unlock(&vma_slot_list_lock); ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++ i = 0; ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ spin_unlock(&vma_slot_list_lock); ++ ++ while (--i >= 0) ++ uksm_del_vma_slot(cleanup_slots[i]); ++} ++ ++/* ++ * Expotional moving average formula ++ */ ++static inline unsigned long ema(unsigned long curr, unsigned long last_ema) ++{ ++ /* ++ * For a very high burst, even the ema cannot work well, a false very ++ * high per-page time estimation can result in feedback in very high ++ * overhead of context switch and rung update -- this will then lead ++ * to higher per-paper time, this may not converge. ++ * ++ * Instead, we try to approach this value in a binary manner. ++ */ ++ if (curr > last_ema * 10) ++ return last_ema * 2; ++ ++ return (EMA_ALPHA * curr + (100 - EMA_ALPHA) * last_ema) / 100; ++} ++ ++/* ++ * convert cpu ratio in 1/TIME_RATIO_SCALE configured by user to ++ * nanoseconds based on current uksm_sleep_jiffies. ++ */ ++static inline unsigned long cpu_ratio_to_nsec(unsigned int ratio) ++{ ++ return NSEC_PER_USEC * jiffies_to_usecs(uksm_sleep_jiffies) / ++ (TIME_RATIO_SCALE - ratio) * ratio; ++} ++ ++ ++static inline unsigned long rung_real_ratio(int cpu_time_ratio) ++{ ++ unsigned long ret; ++ ++ BUG_ON(!cpu_time_ratio); ++ ++ if (cpu_time_ratio > 0) ++ ret = cpu_time_ratio; ++ else ++ ret = (unsigned long)(-cpu_time_ratio) * ++ uksm_max_cpu_percentage / 100UL; ++ ++ return ret ? ret : 1; ++} ++ ++static noinline void uksm_calc_scan_pages(void) ++{ ++ struct scan_rung *ladder = uksm_scan_ladder; ++ unsigned long sleep_usecs, nsecs; ++ unsigned long ratio; ++ int i; ++ unsigned long per_page; ++ ++ if (uksm_ema_page_time > 100000 || ++ (((unsigned long) uksm_eval_round & (256UL - 1)) == 0UL)) ++ uksm_ema_page_time = UKSM_PAGE_TIME_DEFAULT; ++ ++ per_page = uksm_ema_page_time; ++ BUG_ON(!per_page); ++ ++ /* ++ * For every 8 eval round, we try to probe a uksm_sleep_jiffies value ++ * based on saved user input. ++ */ ++ if (((unsigned long) uksm_eval_round & (8UL - 1)) == 0UL) ++ uksm_sleep_jiffies = uksm_sleep_saved; ++ ++ /* We require a rung scan at least 1 page in a period. */ ++ nsecs = per_page; ++ ratio = rung_real_ratio(ladder[0].cpu_ratio); ++ if (cpu_ratio_to_nsec(ratio) < nsecs) { ++ sleep_usecs = nsecs * (TIME_RATIO_SCALE - ratio) / ratio ++ / NSEC_PER_USEC; ++ uksm_sleep_jiffies = usecs_to_jiffies(sleep_usecs) + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ ratio = rung_real_ratio(ladder[i].cpu_ratio); ++ ladder[i].pages_to_scan = cpu_ratio_to_nsec(ratio) / ++ per_page; ++ BUG_ON(!ladder[i].pages_to_scan); ++ uksm_calc_rung_step(&ladder[i], per_page, ratio); ++ } ++} ++ ++/* ++ * From the scan time of this round (ns) to next expected min sleep time ++ * (ms), be careful of the possible overflows. ratio is taken from ++ * rung_real_ratio() ++ */ ++static inline ++unsigned int scan_time_to_sleep(unsigned long long scan_time, unsigned long ratio) ++{ ++ scan_time >>= 20; /* to msec level now */ ++ BUG_ON(scan_time > (ULONG_MAX / TIME_RATIO_SCALE)); ++ ++ return (unsigned int) ((unsigned long) scan_time * ++ (TIME_RATIO_SCALE - ratio) / ratio); ++} ++ ++#define __round_mask(x, y) ((__typeof__(x))((y)-1)) ++#define round_up(x, y) ((((x)-1) | __round_mask(x, y))+1) ++ ++static void uksm_vma_enter(struct vma_slot **slots, unsigned long num) ++{ ++ struct scan_rung *rung; ++ ++ rung = &uksm_scan_ladder[0]; ++ rung_add_new_slots(rung, slots, num); ++} ++ ++static struct vma_slot *batch_slots[SLOT_TREE_NODE_STORE_SIZE]; ++ ++static void uksm_enter_all_slots(void) ++{ ++ struct vma_slot *slot; ++ unsigned long index; ++ struct list_head empty_vma_list; ++ int i; ++ ++ i = 0; ++ index = 0; ++ INIT_LIST_HEAD(&empty_vma_list); ++ ++ spin_lock(&vma_slot_list_lock); ++ while (!list_empty(&vma_slot_new)) { ++ slot = list_entry(vma_slot_new.next, ++ struct vma_slot, slot_list); ++ ++ if (!slot->vma->anon_vma) { ++ list_move(&slot->slot_list, &empty_vma_list); ++ } else if (vma_can_enter(slot->vma)) { ++ batch_slots[index++] = slot; ++ list_del_init(&slot->slot_list); ++ } else { ++ list_move(&slot->slot_list, &vma_slot_noadd); ++ } ++ ++ if (++i == SPIN_LOCK_PERIOD || ++ (index && !(index % SLOT_TREE_NODE_STORE_SIZE))) { ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index && !(index % SLOT_TREE_NODE_STORE_SIZE)) { ++ uksm_vma_enter(batch_slots, index); ++ index = 0; ++ } ++ i = 0; ++ cond_resched(); ++ spin_lock(&vma_slot_list_lock); ++ } ++ } ++ ++ list_splice(&empty_vma_list, &vma_slot_new); ++ ++ spin_unlock(&vma_slot_list_lock); ++ ++ if (index) ++ uksm_vma_enter(batch_slots, index); ++ ++} ++ ++static inline int rung_round_finished(struct scan_rung *rung) ++{ ++ return rung->flags & UKSM_RUNG_ROUND_FINISHED; ++} ++ ++static inline void judge_slot(struct vma_slot *slot) ++{ ++ struct scan_rung *rung = slot->rung; ++ unsigned long dedup; ++ int deleted; ++ ++ dedup = cal_dedup_ratio(slot); ++ if (vma_fully_scanned(slot) && uksm_thrash_threshold) ++ deleted = vma_rung_enter(slot, &uksm_scan_ladder[0]); ++ else if (dedup && dedup >= uksm_abundant_threshold) ++ deleted = vma_rung_up(slot); ++ else ++ deleted = vma_rung_down(slot); ++ ++ slot->pages_merged = 0; ++ slot->pages_cowed = 0; ++ slot->this_sampled = 0; ++ ++ if (vma_fully_scanned(slot)) ++ slot->pages_scanned = 0; ++ ++ slot->last_scanned = slot->pages_scanned; ++ ++ /* If its deleted in above, then rung was already advanced. */ ++ if (!deleted) ++ advance_current_scan(rung); ++} ++ ++ ++static inline int hash_round_finished(void) ++{ ++ if (scanned_virtual_pages > (uksm_pages_total >> 2)) { ++ scanned_virtual_pages = 0; ++ if (uksm_pages_scanned) ++ fully_scanned_round++; ++ ++ return 1; ++ } else { ++ return 0; ++ } ++} ++ ++#define UKSM_MMSEM_BATCH 5 ++#define BUSY_RETRY 100 ++ ++/** ++ * uksm_do_scan() - the main worker function. ++ */ ++static noinline void uksm_do_scan(void) ++{ ++ struct vma_slot *slot, *iter; ++ struct mm_struct *busy_mm; ++ unsigned char round_finished, all_rungs_emtpy; ++ int i, err, mmsem_batch; ++ unsigned long pcost; ++ long long delta_exec; ++ unsigned long vpages, max_cpu_ratio; ++ unsigned long long start_time, end_time, scan_time; ++ unsigned int expected_jiffies; ++ ++ might_sleep(); ++ ++ vpages = 0; ++ ++ start_time = task_sched_runtime(current); ++ max_cpu_ratio = 0; ++ mmsem_batch = 0; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE;) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ unsigned long ratio; ++ int busy_retry; ++ ++ if (!rung->pages_to_scan) { ++ i++; ++ continue; ++ } ++ ++ if (!rung->vma_root.num) { ++ rung->pages_to_scan = 0; ++ i++; ++ continue; ++ } ++ ++ ratio = rung_real_ratio(rung->cpu_ratio); ++ if (ratio > max_cpu_ratio) ++ max_cpu_ratio = ratio; ++ ++ busy_retry = BUSY_RETRY; ++ /* ++ * Do not consider rung_round_finished() here, just used up the ++ * rung->pages_to_scan quota. ++ */ ++ while (rung->pages_to_scan && rung->vma_root.num && ++ likely(!freezing(current))) { ++ int reset = 0; ++ ++ slot = rung->current_scan; ++ ++ BUG_ON(vma_fully_scanned(slot)); ++ ++ if (mmsem_batch) ++ err = 0; ++ else ++ err = try_down_read_slot_mmap_sem(slot); ++ ++ if (err == -ENOENT) { ++rm_slot: ++ rung_rm_slot(slot); ++ continue; ++ } ++ ++ busy_mm = slot->mm; ++ ++ if (err == -EBUSY) { ++ /* skip other vmas on the same mm */ ++ do { ++ reset = advance_current_scan(rung); ++ iter = rung->current_scan; ++ busy_retry--; ++ if (iter->vma->vm_mm != busy_mm || ++ !busy_retry || reset) ++ break; ++ } while (1); ++ ++ if (iter->vma->vm_mm != busy_mm) { ++ continue; ++ } else { ++ /* scan round finsished */ ++ break; ++ } ++ } ++ ++ BUG_ON(!vma_can_enter(slot->vma)); ++ if (uksm_test_exit(slot->vma->vm_mm)) { ++ mmsem_batch = 0; ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ goto rm_slot; ++ } ++ ++ if (mmsem_batch) ++ mmsem_batch--; ++ else ++ mmsem_batch = UKSM_MMSEM_BATCH; ++ ++ /* Ok, we have take the mmap_sem, ready to scan */ ++ scan_vma_one_page(slot); ++ rung->pages_to_scan--; ++ vpages++; ++ ++ if (rung->current_offset + rung->step > slot->pages - 1 ++ || vma_fully_scanned(slot)) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ judge_slot(slot); ++ mmsem_batch = 0; ++ } else { ++ rung->current_offset += rung->step; ++ if (!mmsem_batch) ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ } ++ ++ busy_retry = BUSY_RETRY; ++ cond_resched(); ++ } ++ ++ if (mmsem_batch) { ++ up_read(&slot->vma->vm_mm->mmap_sem); ++ mmsem_batch = 0; ++ } ++ ++ if (freezing(current)) ++ break; ++ ++ cond_resched(); ++ } ++ end_time = task_sched_runtime(current); ++ delta_exec = end_time - start_time; ++ ++ if (freezing(current)) ++ return; ++ ++ cleanup_vma_slots(); ++ uksm_enter_all_slots(); ++ ++ round_finished = 1; ++ all_rungs_emtpy = 1; ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ struct scan_rung *rung = &uksm_scan_ladder[i]; ++ ++ if (rung->vma_root.num) { ++ all_rungs_emtpy = 0; ++ if (!rung_round_finished(rung)) ++ round_finished = 0; ++ } ++ } ++ ++ if (all_rungs_emtpy) ++ round_finished = 0; ++ ++ if (round_finished) { ++ round_update_ladder(); ++ uksm_eval_round++; ++ ++ if (hash_round_finished() && rshash_adjust()) { ++ /* Reset the unstable root iff hash strength changed */ ++ uksm_hash_round++; ++ root_unstable_tree = RB_ROOT; ++ free_all_tree_nodes(&unstable_tree_node_list); ++ } ++ ++ /* ++ * A number of pages can hang around indefinitely on per-cpu ++ * pagevecs, raised page count preventing write_protect_page ++ * from merging them. Though it doesn't really matter much, ++ * it is puzzling to see some stuck in pages_volatile until ++ * other activity jostles them out, and they also prevented ++ * LTP's KSM test from succeeding deterministically; so drain ++ * them here (here rather than on entry to uksm_do_scan(), ++ * so we don't IPI too often when pages_to_scan is set low). ++ */ ++ lru_add_drain_all(); ++ } ++ ++ ++ if (vpages && delta_exec > 0) { ++ pcost = (unsigned long) delta_exec / vpages; ++ if (likely(uksm_ema_page_time)) ++ uksm_ema_page_time = ema(pcost, uksm_ema_page_time); ++ else ++ uksm_ema_page_time = pcost; ++ } ++ ++ uksm_calc_scan_pages(); ++ uksm_sleep_real = uksm_sleep_jiffies; ++ /* in case of radical cpu bursts, apply the upper bound */ ++ end_time = task_sched_runtime(current); ++ if (max_cpu_ratio && end_time > start_time) { ++ scan_time = end_time - start_time; ++ expected_jiffies = msecs_to_jiffies( ++ scan_time_to_sleep(scan_time, max_cpu_ratio)); ++ ++ if (expected_jiffies > uksm_sleep_real) ++ uksm_sleep_real = expected_jiffies; ++ ++ /* We have a 1 second up bound for responsiveness. */ ++ if (jiffies_to_msecs(uksm_sleep_real) > MSEC_PER_SEC) ++ uksm_sleep_real = msecs_to_jiffies(1000); ++ } ++ ++ return; ++} ++ ++static int ksmd_should_run(void) ++{ ++ return uksm_run & UKSM_RUN_MERGE; ++} ++ ++static int uksm_scan_thread(void *nothing) ++{ ++ set_freezable(); ++ set_user_nice(current, 5); ++ ++ while (!kthread_should_stop()) { ++ mutex_lock(&uksm_thread_mutex); ++ if (ksmd_should_run()) ++ uksm_do_scan(); ++ mutex_unlock(&uksm_thread_mutex); ++ ++ try_to_freeze(); ++ ++ if (ksmd_should_run()) { ++ schedule_timeout_interruptible(uksm_sleep_real); ++ uksm_sleep_times++; ++ } else { ++ wait_event_freezable(uksm_thread_wait, ++ ksmd_should_run() || kthread_should_stop()); ++ } ++ } ++ return 0; ++} ++ ++void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) ++{ ++ struct stable_node *stable_node; ++ struct node_vma *node_vma; ++ struct rmap_item *rmap_item; ++ int search_new_forks = 0; ++ unsigned long address; ++ ++ VM_BUG_ON_PAGE(!PageKsm(page), page); ++ VM_BUG_ON_PAGE(!PageLocked(page), page); ++ ++ stable_node = page_stable_node(page); ++ if (!stable_node) ++ return; ++again: ++ hlist_for_each_entry(node_vma, &stable_node->hlist, hlist) { ++ hlist_for_each_entry(rmap_item, &node_vma->rmap_hlist, hlist) { ++ struct anon_vma *anon_vma = rmap_item->anon_vma; ++ struct anon_vma_chain *vmac; ++ struct vm_area_struct *vma; ++ ++ cond_resched(); ++ anon_vma_lock_read(anon_vma); ++ anon_vma_interval_tree_foreach(vmac, &anon_vma->rb_root, ++ 0, ULONG_MAX) { ++ cond_resched(); ++ vma = vmac->vma; ++ address = get_rmap_addr(rmap_item); ++ ++ if (address < vma->vm_start || ++ address >= vma->vm_end) ++ continue; ++ ++ if ((rmap_item->slot->vma == vma) == ++ search_new_forks) ++ continue; ++ ++ if (rwc->invalid_vma && rwc->invalid_vma(vma, rwc->arg)) ++ continue; ++ ++ if (!rwc->rmap_one(page, vma, address, rwc->arg)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ ++ if (rwc->done && rwc->done(page)) { ++ anon_vma_unlock_read(anon_vma); ++ return; ++ } ++ } ++ anon_vma_unlock_read(anon_vma); ++ } ++ } ++ if (!search_new_forks++) ++ goto again; ++} ++ ++#ifdef CONFIG_MIGRATION ++/* Common ksm interface but may be specific to uksm */ ++void ksm_migrate_page(struct page *newpage, struct page *oldpage) ++{ ++ struct stable_node *stable_node; ++ ++ VM_BUG_ON_PAGE(!PageLocked(oldpage), oldpage); ++ VM_BUG_ON_PAGE(!PageLocked(newpage), newpage); ++ VM_BUG_ON(newpage->mapping != oldpage->mapping); ++ ++ stable_node = page_stable_node(newpage); ++ if (stable_node) { ++ VM_BUG_ON(stable_node->kpfn != page_to_pfn(oldpage)); ++ stable_node->kpfn = page_to_pfn(newpage); ++ /* ++ * newpage->mapping was set in advance; now we need smp_wmb() ++ * to make sure that the new stable_node->kpfn is visible ++ * to get_ksm_page() before it can see that oldpage->mapping ++ * has gone stale (or that PageSwapCache has been cleared). ++ */ ++ smp_wmb(); ++ set_page_stable_node(oldpage, NULL); ++ } ++} ++#endif /* CONFIG_MIGRATION */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++static struct stable_node *uksm_check_stable_tree(unsigned long start_pfn, ++ unsigned long end_pfn) ++{ ++ struct rb_node *node; ++ ++ for (node = rb_first(root_stable_treep); node; node = rb_next(node)) { ++ struct stable_node *stable_node; ++ ++ stable_node = rb_entry(node, struct stable_node, node); ++ if (stable_node->kpfn >= start_pfn && ++ stable_node->kpfn < end_pfn) ++ return stable_node; ++ } ++ return NULL; ++} ++ ++static int uksm_memory_callback(struct notifier_block *self, ++ unsigned long action, void *arg) ++{ ++ struct memory_notify *mn = arg; ++ struct stable_node *stable_node; ++ ++ switch (action) { ++ case MEM_GOING_OFFLINE: ++ /* ++ * Keep it very simple for now: just lock out ksmd and ++ * MADV_UNMERGEABLE while any memory is going offline. ++ * mutex_lock_nested() is necessary because lockdep was alarmed ++ * that here we take uksm_thread_mutex inside notifier chain ++ * mutex, and later take notifier chain mutex inside ++ * uksm_thread_mutex to unlock it. But that's safe because both ++ * are inside mem_hotplug_mutex. ++ */ ++ mutex_lock_nested(&uksm_thread_mutex, SINGLE_DEPTH_NESTING); ++ break; ++ ++ case MEM_OFFLINE: ++ /* ++ * Most of the work is done by page migration; but there might ++ * be a few stable_nodes left over, still pointing to struct ++ * pages which have been offlined: prune those from the tree. ++ */ ++ while ((stable_node = uksm_check_stable_tree(mn->start_pfn, ++ mn->start_pfn + mn->nr_pages)) != NULL) ++ remove_node_from_stable_tree(stable_node, 1, 1); ++ /* fallthrough */ ++ ++ case MEM_CANCEL_OFFLINE: ++ mutex_unlock(&uksm_thread_mutex); ++ break; ++ } ++ return NOTIFY_OK; ++} ++#endif /* CONFIG_MEMORY_HOTREMOVE */ ++ ++#ifdef CONFIG_SYSFS ++/* ++ * This all compiles without CONFIG_SYSFS, but is a waste of space. ++ */ ++ ++#define UKSM_ATTR_RO(_name) \ ++ static struct kobj_attribute _name##_attr = __ATTR_RO(_name) ++#define UKSM_ATTR(_name) \ ++ static struct kobj_attribute _name##_attr = \ ++ __ATTR(_name, 0644, _name##_show, _name##_store) ++ ++static ssize_t max_cpu_percentage_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_max_cpu_percentage); ++} ++ ++static ssize_t max_cpu_percentage_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long max_cpu_percentage; ++ int err; ++ ++ err = kstrtoul(buf, 10, &max_cpu_percentage); ++ if (err || max_cpu_percentage > 100) ++ return -EINVAL; ++ ++ if (max_cpu_percentage == 100) ++ max_cpu_percentage = 99; ++ else if (max_cpu_percentage < 10) ++ max_cpu_percentage = 10; ++ ++ uksm_max_cpu_percentage = max_cpu_percentage; ++ ++ return count; ++} ++UKSM_ATTR(max_cpu_percentage); ++ ++static ssize_t sleep_millisecs_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", jiffies_to_msecs(uksm_sleep_jiffies)); ++} ++ ++static ssize_t sleep_millisecs_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ unsigned long msecs; ++ int err; ++ ++ err = kstrtoul(buf, 10, &msecs); ++ if (err || msecs > MSEC_PER_SEC) ++ return -EINVAL; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(msecs); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ return count; ++} ++UKSM_ATTR(sleep_millisecs); ++ ++ ++static ssize_t cpu_governor_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ int i; ++ ++ buf[0] = '\0'; ++ for (i = 0; i < n ; i++) { ++ if (uksm_cpu_governor == i) ++ strcat(buf, "["); ++ ++ strcat(buf, uksm_cpu_governor_str[i]); ++ ++ if (uksm_cpu_governor == i) ++ strcat(buf, "]"); ++ ++ strcat(buf, " "); ++ } ++ strcat(buf, "\n"); ++ ++ return strlen(buf); ++} ++ ++static inline void init_performance_values(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ struct uksm_cpu_preset_s *preset = uksm_cpu_preset + uksm_cpu_governor; ++ ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ rung->cpu_ratio = preset->cpu_ratio[i]; ++ rung->cover_msecs = preset->cover_msecs[i]; ++ } ++ ++ uksm_max_cpu_percentage = preset->max_cpu; ++} ++ ++static ssize_t cpu_governor_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int n = sizeof(uksm_cpu_governor_str) / sizeof(char *); ++ ++ for (n--; n >= 0 ; n--) { ++ if (!strncmp(buf, uksm_cpu_governor_str[n], ++ strlen(uksm_cpu_governor_str[n]))) ++ break; ++ } ++ ++ if (n < 0) ++ return -EINVAL; ++ else ++ uksm_cpu_governor = n; ++ ++ init_performance_values(); ++ ++ return count; ++} ++UKSM_ATTR(cpu_governor); ++ ++static ssize_t run_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_run); ++} ++ ++static ssize_t run_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > UINT_MAX) ++ return -EINVAL; ++ if (flags > UKSM_RUN_MERGE) ++ return -EINVAL; ++ ++ mutex_lock(&uksm_thread_mutex); ++ if (uksm_run != flags) ++ uksm_run = flags; ++ mutex_unlock(&uksm_thread_mutex); ++ ++ if (flags & UKSM_RUN_MERGE) ++ wake_up_interruptible(&uksm_thread_wait); ++ ++ return count; ++} ++UKSM_ATTR(run); ++ ++static ssize_t abundant_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_abundant_threshold); ++} ++ ++static ssize_t abundant_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_abundant_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(abundant_threshold); ++ ++static ssize_t thrash_threshold_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", uksm_thrash_threshold); ++} ++ ++static ssize_t thrash_threshold_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ unsigned long flags; ++ ++ err = kstrtoul(buf, 10, &flags); ++ if (err || flags > 99) ++ return -EINVAL; ++ ++ uksm_thrash_threshold = flags; ++ ++ return count; ++} ++UKSM_ATTR(thrash_threshold); ++ ++static ssize_t cpu_ratios_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ if (rung->cpu_ratio > 0) ++ size = sprintf(p, "%d ", rung->cpu_ratio); ++ else ++ size = sprintf(p, "MAX/%d ", ++ TIME_RATIO_SCALE / -rung->cpu_ratio); ++ ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t cpu_ratios_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, cpuratios[SCAN_LADDER_SIZE], err; ++ unsigned long value; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ++ p = kzalloc(count, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE - 1) { ++ end = strchr(p, ' '); ++ if (!end) ++ return -EINVAL; ++ ++ *end = '\0'; ++ } ++ ++ if (strstr(p, "MAX/")) { ++ p = strchr(p, '/') + 1; ++ err = kstrtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = -(int) (TIME_RATIO_SCALE / value); ++ } else { ++ err = kstrtoul(p, 10, &value); ++ if (err || value > TIME_RATIO_SCALE || !value) ++ return -EINVAL; ++ ++ cpuratios[i] = value; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cpu_ratio = cpuratios[i]; ++ } ++ ++ return count; ++} ++UKSM_ATTR(cpu_ratios); ++ ++static ssize_t eval_intervals_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ int i, size; ++ struct scan_rung *rung; ++ char *p = buf; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ size = sprintf(p, "%u ", rung->cover_msecs); ++ p += size; ++ } ++ ++ *p++ = '\n'; ++ *p = '\0'; ++ ++ return p - buf; ++} ++ ++static ssize_t eval_intervals_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int i, err; ++ unsigned long values[SCAN_LADDER_SIZE]; ++ struct scan_rung *rung; ++ char *p, *end = NULL; ++ ssize_t ret = count; ++ ++ p = kzalloc(count + 2, GFP_KERNEL); ++ if (!p) ++ return -ENOMEM; ++ ++ memcpy(p, buf, count); ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ if (i != SCAN_LADDER_SIZE - 1) { ++ end = strchr(p, ' '); ++ if (!end) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ *end = '\0'; ++ } ++ ++ err = kstrtoul(p, 10, &values[i]); ++ if (err) { ++ ret = -EINVAL; ++ goto out; ++ } ++ ++ p = end + 1; ++ } ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = &uksm_scan_ladder[i]; ++ ++ rung->cover_msecs = values[i]; ++ } ++ ++out: ++ kfree(p); ++ return ret; ++} ++UKSM_ATTR(eval_intervals); ++ ++static ssize_t ema_per_page_time_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_ema_page_time); ++} ++UKSM_ATTR_RO(ema_per_page_time); ++ ++static ssize_t pages_shared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_shared); ++} ++UKSM_ATTR_RO(pages_shared); ++ ++static ssize_t pages_sharing_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_sharing); ++} ++UKSM_ATTR_RO(pages_sharing); ++ ++static ssize_t pages_unshared_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", uksm_pages_unshared); ++} ++UKSM_ATTR_RO(pages_unshared); ++ ++static ssize_t full_scans_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", fully_scanned_round); ++} ++UKSM_ATTR_RO(full_scans); ++ ++static ssize_t pages_scanned_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ unsigned long base = 0; ++ u64 delta, ret; ++ ++ if (pages_scanned_stored) { ++ base = pages_scanned_base; ++ ret = pages_scanned_stored; ++ delta = uksm_pages_scanned >> base; ++ if (CAN_OVERFLOW_U64(ret, delta)) { ++ ret >>= 1; ++ delta >>= 1; ++ base++; ++ ret += delta; ++ } ++ } else { ++ ret = uksm_pages_scanned; ++ } ++ ++ while (ret > ULONG_MAX) { ++ ret >>= 1; ++ base++; ++ } ++ ++ if (base) ++ return sprintf(buf, "%lu * 2^%lu\n", (unsigned long)ret, base); ++ else ++ return sprintf(buf, "%lu\n", (unsigned long)ret); ++} ++UKSM_ATTR_RO(pages_scanned); ++ ++static ssize_t hash_strength_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%lu\n", hash_strength); ++} ++UKSM_ATTR_RO(hash_strength); ++ ++static ssize_t sleep_times_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%llu\n", uksm_sleep_times); ++} ++UKSM_ATTR_RO(sleep_times); ++ ++ ++static struct attribute *uksm_attrs[] = { ++ &max_cpu_percentage_attr.attr, ++ &sleep_millisecs_attr.attr, ++ &cpu_governor_attr.attr, ++ &run_attr.attr, ++ &ema_per_page_time_attr.attr, ++ &pages_shared_attr.attr, ++ &pages_sharing_attr.attr, ++ &pages_unshared_attr.attr, ++ &full_scans_attr.attr, ++ &pages_scanned_attr.attr, ++ &hash_strength_attr.attr, ++ &sleep_times_attr.attr, ++ &thrash_threshold_attr.attr, ++ &abundant_threshold_attr.attr, ++ &cpu_ratios_attr.attr, ++ &eval_intervals_attr.attr, ++ NULL, ++}; ++ ++static struct attribute_group uksm_attr_group = { ++ .attrs = uksm_attrs, ++ .name = "uksm", ++}; ++#endif /* CONFIG_SYSFS */ ++ ++static inline void init_scan_ladder(void) ++{ ++ int i; ++ struct scan_rung *rung; ++ ++ for (i = 0; i < SCAN_LADDER_SIZE; i++) { ++ rung = uksm_scan_ladder + i; ++ slot_tree_init_root(&rung->vma_root); ++ } ++ ++ init_performance_values(); ++ uksm_calc_scan_pages(); ++} ++ ++static inline int cal_positive_negative_costs(void) ++{ ++ struct page *p1, *p2; ++ unsigned char *addr1, *addr2; ++ unsigned long i, time_start, hash_cost; ++ unsigned long loopnum = 0; ++ ++ /*IMPORTANT: volatile is needed to prevent over-optimization by gcc. */ ++ volatile u32 hash; ++ volatile int ret; ++ ++ p1 = alloc_page(GFP_KERNEL); ++ if (!p1) ++ return -ENOMEM; ++ ++ p2 = alloc_page(GFP_KERNEL); ++ if (!p2) ++ return -ENOMEM; ++ ++ addr1 = kmap_atomic(p1); ++ addr2 = kmap_atomic(p2); ++ memset(addr1, prandom_u32(), PAGE_SIZE); ++ memcpy(addr2, addr1, PAGE_SIZE); ++ ++ /* make sure that the two pages differ in last byte */ ++ addr2[PAGE_SIZE-1] = ~addr2[PAGE_SIZE-1]; ++ kunmap_atomic(addr2); ++ kunmap_atomic(addr1); ++ ++ time_start = jiffies; ++ while (jiffies - time_start < 100) { ++ for (i = 0; i < 100; i++) ++ hash = page_hash(p1, HASH_STRENGTH_FULL, 0); ++ loopnum += 100; ++ } ++ hash_cost = (jiffies - time_start); ++ ++ time_start = jiffies; ++ for (i = 0; i < loopnum; i++) ++ ret = pages_identical(p1, p2); ++ memcmp_cost = HASH_STRENGTH_FULL * (jiffies - time_start); ++ memcmp_cost /= hash_cost; ++ pr_info("UKSM: relative memcmp_cost = %lu " ++ "hash=%u cmp_ret=%d.\n", ++ memcmp_cost, hash, ret); ++ ++ __free_page(p1); ++ __free_page(p2); ++ return 0; ++} ++ ++static int init_zeropage_hash_table(void) ++{ ++ struct page *page; ++ char *addr; ++ int i; ++ ++ page = alloc_page(GFP_KERNEL); ++ if (!page) ++ return -ENOMEM; ++ ++ addr = kmap_atomic(page); ++ memset(addr, 0, PAGE_SIZE); ++ kunmap_atomic(addr); ++ ++ zero_hash_table = kmalloc_array(HASH_STRENGTH_MAX, sizeof(u32), ++ GFP_KERNEL); ++ if (!zero_hash_table) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_MAX; i++) ++ zero_hash_table[i] = page_hash(page, i, 0); ++ ++ __free_page(page); ++ ++ return 0; ++} ++ ++static inline int init_random_sampling(void) ++{ ++ unsigned long i; ++ ++ random_nums = kmalloc(PAGE_SIZE, GFP_KERNEL); ++ if (!random_nums) ++ return -ENOMEM; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) ++ random_nums[i] = i; ++ ++ for (i = 0; i < HASH_STRENGTH_FULL; i++) { ++ unsigned long rand_range, swap_index, tmp; ++ ++ rand_range = HASH_STRENGTH_FULL - i; ++ swap_index = i + prandom_u32() % rand_range; ++ tmp = random_nums[i]; ++ random_nums[i] = random_nums[swap_index]; ++ random_nums[swap_index] = tmp; ++ } ++ ++ rshash_state.state = RSHASH_NEW; ++ rshash_state.below_count = 0; ++ rshash_state.lookup_window_index = 0; ++ ++ return cal_positive_negative_costs(); ++} ++ ++static int __init uksm_slab_init(void) ++{ ++ rmap_item_cache = UKSM_KMEM_CACHE(rmap_item, 0); ++ if (!rmap_item_cache) ++ goto out; ++ ++ stable_node_cache = UKSM_KMEM_CACHE(stable_node, 0); ++ if (!stable_node_cache) ++ goto out_free1; ++ ++ node_vma_cache = UKSM_KMEM_CACHE(node_vma, 0); ++ if (!node_vma_cache) ++ goto out_free2; ++ ++ vma_slot_cache = UKSM_KMEM_CACHE(vma_slot, 0); ++ if (!vma_slot_cache) ++ goto out_free3; ++ ++ tree_node_cache = UKSM_KMEM_CACHE(tree_node, 0); ++ if (!tree_node_cache) ++ goto out_free4; ++ ++ return 0; ++ ++out_free4: ++ kmem_cache_destroy(vma_slot_cache); ++out_free3: ++ kmem_cache_destroy(node_vma_cache); ++out_free2: ++ kmem_cache_destroy(stable_node_cache); ++out_free1: ++ kmem_cache_destroy(rmap_item_cache); ++out: ++ return -ENOMEM; ++} ++ ++static void __init uksm_slab_free(void) ++{ ++ kmem_cache_destroy(stable_node_cache); ++ kmem_cache_destroy(rmap_item_cache); ++ kmem_cache_destroy(node_vma_cache); ++ kmem_cache_destroy(vma_slot_cache); ++ kmem_cache_destroy(tree_node_cache); ++} ++ ++/* Common interface to ksm, different to it. */ ++int ksm_madvise(struct vm_area_struct *vma, unsigned long start, ++ unsigned long end, int advice, unsigned long *vm_flags) ++{ ++ int err; ++ ++ switch (advice) { ++ case MADV_MERGEABLE: ++ return 0; /* just ignore the advice */ ++ ++ case MADV_UNMERGEABLE: ++ if (!(*vm_flags & VM_MERGEABLE) || !uksm_flags_can_scan(*vm_flags)) ++ return 0; /* just ignore the advice */ ++ ++ if (vma->anon_vma) { ++ err = unmerge_uksm_pages(vma, start, end); ++ if (err) ++ return err; ++ } ++ ++ uksm_remove_vma(vma); ++ *vm_flags &= ~VM_MERGEABLE; ++ break; ++ } ++ ++ return 0; ++} ++ ++/* Common interface to ksm, actually the same. */ ++struct page *ksm_might_need_to_copy(struct page *page, ++ struct vm_area_struct *vma, unsigned long address) ++{ ++ struct anon_vma *anon_vma = page_anon_vma(page); ++ struct page *new_page; ++ ++ if (PageKsm(page)) { ++ if (page_stable_node(page)) ++ return page; /* no need to copy it */ ++ } else if (!anon_vma) { ++ return page; /* no need to copy it */ ++ } else if (anon_vma->root == vma->anon_vma->root && ++ page->index == linear_page_index(vma, address)) { ++ return page; /* still no need to copy it */ ++ } ++ if (!PageUptodate(page)) ++ return page; /* let do_swap_page report the error */ ++ ++ new_page = alloc_page_vma(GFP_HIGHUSER_MOVABLE, vma, address); ++ if (new_page) { ++ copy_user_highpage(new_page, page, address, vma); ++ ++ SetPageDirty(new_page); ++ __SetPageUptodate(new_page); ++ __SetPageLocked(new_page); ++ } ++ ++ return new_page; ++} ++ ++static int __init uksm_init(void) ++{ ++ struct task_struct *uksm_thread; ++ int err; ++ ++ uksm_sleep_jiffies = msecs_to_jiffies(100); ++ uksm_sleep_saved = uksm_sleep_jiffies; ++ ++ slot_tree_init(); ++ init_scan_ladder(); ++ ++ ++ err = init_random_sampling(); ++ if (err) ++ goto out_free2; ++ ++ err = uksm_slab_init(); ++ if (err) ++ goto out_free1; ++ ++ err = init_zeropage_hash_table(); ++ if (err) ++ goto out_free0; ++ ++ uksm_thread = kthread_run(uksm_scan_thread, NULL, "uksmd"); ++ if (IS_ERR(uksm_thread)) { ++ pr_err("uksm: creating kthread failed\n"); ++ err = PTR_ERR(uksm_thread); ++ goto out_free; ++ } ++ ++#ifdef CONFIG_SYSFS ++ err = sysfs_create_group(mm_kobj, &uksm_attr_group); ++ if (err) { ++ pr_err("uksm: register sysfs failed\n"); ++ kthread_stop(uksm_thread); ++ goto out_free; ++ } ++#else ++ uksm_run = UKSM_RUN_MERGE; /* no way for user to start it */ ++ ++#endif /* CONFIG_SYSFS */ ++ ++#ifdef CONFIG_MEMORY_HOTREMOVE ++ /* ++ * Choose a high priority since the callback takes uksm_thread_mutex: ++ * later callbacks could only be taking locks which nest within that. ++ */ ++ hotplug_memory_notifier(uksm_memory_callback, 100); ++#endif ++ return 0; ++ ++out_free: ++ kfree(zero_hash_table); ++out_free0: ++ uksm_slab_free(); ++out_free1: ++ kfree(random_nums); ++out_free2: ++ kfree(uksm_scan_ladder); ++ return err; ++} ++ ++#ifdef MODULE ++subsys_initcall(ksm_init); ++#else ++late_initcall(uksm_init); ++#endif ++ +diff -Nur a/mm/vmstat.c b/mm/vmstat.c +--- a/mm/vmstat.c 2019-02-06 16:30:16.000000000 +0000 ++++ b/mm/vmstat.c 2019-02-09 17:23:06.736864024 +0000 +@@ -1163,6 +1163,9 @@ + "nr_written", + "", /* nr_indirectly_reclaimable */ + ++#ifdef CONFIG_UKSM ++ "nr_uksm_zero_pages", ++#endif + /* enum writeback_stat_item counters */ + "nr_dirty_threshold", + "nr_dirty_background_threshold", diff --git a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.90.ebuild b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.95.ebuild index e3ff2748..318e5775 100644 --- a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.90.ebuild +++ b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.14.95.ebuild @@ -7,6 +7,7 @@ inherit eutils EXTRAVERSION="redcore-lts" KV_FULL="${PV}-${EXTRAVERSION}" +KV_MAJOR="4.14" DESCRIPTION="Official Redcore Linux Kernel Image" HOMEPAGE="https://redcorelinux.org" @@ -32,32 +33,34 @@ DEPEND=" >=sys-kernel/linux-firmware-20180314" RDEPEND="${DEPEND}" -PATCHES=( "${FILESDIR}"/introduce-NUMA-identity-node-sched-domain.patch - "${FILESDIR}"/k10temp-add-ZEN-support.patch - "${FILESDIR}"/mute-pps_state_mismatch.patch - "${FILESDIR}"/restore-SD_PREFER_SIBLING-on-MC-domains.patch - "${FILESDIR}"/Revert-ath10k-activate-user-space-firmware-loading.patch - "${FILESDIR}"/linux-hardened.patch - "${FILESDIR}"/uksm-linux-hardened.patch - "${FILESDIR}"/0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch - "${FILESDIR}"/0002-Make-preemptible-kernel-default.patch - "${FILESDIR}"/0003-Expose-vmsplit-for-our-poor-32-bit-users.patch - "${FILESDIR}"/0004-Create-highres-timeout-variants-of-schedule_timeout-.patch - "${FILESDIR}"/0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch - "${FILESDIR}"/0006-Convert-msleep-to-use-hrtimers-when-active.patch - "${FILESDIR}"/0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch - "${FILESDIR}"/0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch - "${FILESDIR}"/0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch - "${FILESDIR}"/0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch - "${FILESDIR}"/0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch - "${FILESDIR}"/0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch - "${FILESDIR}"/0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch - "${FILESDIR}"/0014-Swap-sucks.patch - "${FILESDIR}"/0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch - "${FILESDIR}"/0016-unfuck-MuQSS-on-linux-4_14_15+.patch - "${FILESDIR}"/0017-unfuck-MuQSS-on-linux-4_14_75+.patch - "${FILESDIR}"/0001-BFQ-v8r12-20171108.patch - "${FILESDIR}"/0002-BFQ-v8r12-20180404.patch ) +PATCHES=( + "${FILESDIR}"/"${KV_MAJOR}"-introduce-NUMA-identity-node-sched-domain.patch + "${FILESDIR}"/"${KV_MAJOR}"-k10temp-add-ZEN-support.patch + "${FILESDIR}"/"${KV_MAJOR}"-mute-pps_state_mismatch.patch + "${FILESDIR}"/"${KV_MAJOR}"-restore-SD_PREFER_SIBLING-on-MC-domains.patch + "${FILESDIR}"/"${KV_MAJOR}"-Revert-ath10k-activate-user-space-firmware-loading.patch + "${FILESDIR}"/"${KV_MAJOR}"-linux-hardened.patch + "${FILESDIR}"/"${KV_MAJOR}"-uksm-linux-hardened.patch + "${FILESDIR}"/"${KV_MAJOR}"-0001-MuQSS-version-0.162-CPU-scheduler-linux-hardened.patch + "${FILESDIR}"/"${KV_MAJOR}"-0002-Make-preemptible-kernel-default.patch + "${FILESDIR}"/"${KV_MAJOR}"-0003-Expose-vmsplit-for-our-poor-32-bit-users.patch + "${FILESDIR}"/"${KV_MAJOR}"-0004-Create-highres-timeout-variants-of-schedule_timeout-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0005-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0006-Convert-msleep-to-use-hrtimers-when-active.patch + "${FILESDIR}"/"${KV_MAJOR}"-0007-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch + "${FILESDIR}"/"${KV_MAJOR}"-0008-Replace-all-calls-to-schedule_timeout_interruptible-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0009-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch + "${FILESDIR}"/"${KV_MAJOR}"-0010-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch + "${FILESDIR}"/"${KV_MAJOR}"-0011-Make-hrtimer-granularity-and-minimum-hrtimeout-confi.patch + "${FILESDIR}"/"${KV_MAJOR}"-0012-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch + "${FILESDIR}"/"${KV_MAJOR}"-0013-Make-threaded-IRQs-optionally-the-default-which-can-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0014-Swap-sucks.patch + "${FILESDIR}"/"${KV_MAJOR}"-0015-MuQSS.c-needs-irq_regs.h-to-use-get_irq_regs.patch + "${FILESDIR}"/"${KV_MAJOR}"-0016-unfuck-MuQSS-on-linux-4_14_15+.patch + "${FILESDIR}"/"${KV_MAJOR}"-0017-unfuck-MuQSS-on-linux-4_14_75+.patch + "${FILESDIR}"/"${KV_MAJOR}"-0001-BFQ-v8r12-20171108.patch + "${FILESDIR}"/"${KV_MAJOR}"-0002-BFQ-v8r12-20180404.patch +) S="${WORKDIR}"/linux-"${PV}" @@ -73,7 +76,7 @@ src_prepare() { default emake mrproper sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile - cp "${FILESDIR}"/"${EXTRAVERSION}"-amd64.config .config + cp "${FILESDIR}"/"${KV_MAJOR}"-"${EXTRAVERSION}"-amd64.config .config rm -rf $(find . -type f|grep -F \.orig) } diff --git a/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20.ebuild b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20.ebuild new file mode 100644 index 00000000..1d2bace9 --- /dev/null +++ b/sys-kernel/linux-image-redcore-lts/linux-image-redcore-lts-4.19.20.ebuild @@ -0,0 +1,167 @@ +# Copyright 1999-2017 Gentoo Foundation +# Distributed under the terms of the GNU General Public License v2 + +EAPI=6 + +inherit eutils + +EXTRAVERSION="redcore-lts" +KV_FULL="${PV}-${EXTRAVERSION}" +KV_MAJOR="4.19" + +DESCRIPTION="Official Redcore Linux Kernel Image" +HOMEPAGE="https://redcorelinux.org" +SRC_URI="https://cdn.kernel.org/pub/linux/kernel/v4.x/linux-${PV}.tar.xz" + +KEYWORDS="amd64" +LICENSE="GPL-2" +SLOT="${PV}" +IUSE="+cryptsetup +dmraid +dracut +dkms +mdadm" + +RESTRICT="binchecks strip mirror" +DEPEND=" + app-arch/lz4 + app-arch/xz-utils + sys-devel/autoconf + sys-devel/bc + sys-devel/make + cryptsetup? ( sys-fs/cryptsetup ) + dmraid? ( sys-fs/dmraid ) + dracut? ( >=sys-kernel/dracut-0.44-r8 ) + dkms? ( sys-kernel/dkms ~sys-kernel/linux-sources-redcore-lts-${PV} ) + mdadm? ( sys-fs/mdadm ) + >=sys-kernel/linux-firmware-20180314" +RDEPEND="${DEPEND}" + +PATCHES=( + "${FILESDIR}"/"${KV_MAJOR}"-ata-fix-NCQ-LOG-strings-and-move-to-debug.patch + "${FILESDIR}"/"${KV_MAJOR}"-ath10k-drop-WARN_ON-added-in-cd93b83ad927b2c7979e0add0343ace59328b461.patch + "${FILESDIR}"/"${KV_MAJOR}"-drop_ancient-and-wrong-msg.patch + "${FILESDIR}"/"${KV_MAJOR}"-enable_alx_wol.patch + "${FILESDIR}"/"${KV_MAJOR}"-mute-pps_state_mismatch.patch + "${FILESDIR}"/"${KV_MAJOR}"-nouveau-pascal-backlight.patch + "${FILESDIR}"/"${KV_MAJOR}"-radeon_dp_aux_transfer_native-no-ratelimited_debug.patch + "${FILESDIR}"/"${KV_MAJOR}"-revert-patches-causing-instant-reboot.patch + "${FILESDIR}"/"${KV_MAJOR}"-linux-hardened.patch + "${FILESDIR}"/"${KV_MAJOR}"-uksm-linux-hardened.patch + "${FILESDIR}"/"${KV_MAJOR}"-bfq-sq-mq-v9r1-2K190204-rc1.patch + "${FILESDIR}"/"${KV_MAJOR}"-0001-MultiQueue-Skiplist-Scheduler-version-v0.180-linux-hardened.patch + "${FILESDIR}"/"${KV_MAJOR}"-0002-Fix-Werror-build-failure-in-tools.patch + "${FILESDIR}"/"${KV_MAJOR}"-0003-Make-preemptible-kernel-default.patch + "${FILESDIR}"/"${KV_MAJOR}"-0004-Expose-vmsplit-for-our-poor-32-bit-users.patch + "${FILESDIR}"/"${KV_MAJOR}"-0005-Create-highres-timeout-variants-of-schedule_timeout-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0006-Special-case-calls-of-schedule_timeout-1-to-use-the-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0007-Convert-msleep-to-use-hrtimers-when-active.patch + "${FILESDIR}"/"${KV_MAJOR}"-0008-Replace-all-schedule-timeout-1-with-schedule_min_hrt.patch + "${FILESDIR}"/"${KV_MAJOR}"-0009-Replace-all-calls-to-schedule_timeout_interruptible-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0010-Replace-all-calls-to-schedule_timeout_uninterruptibl.patch + "${FILESDIR}"/"${KV_MAJOR}"-0011-Don-t-use-hrtimer-overlay-when-pm_freezing-since-som.patch + "${FILESDIR}"/"${KV_MAJOR}"-0012-Make-threaded-IRQs-optionally-the-default-which-can-.patch + "${FILESDIR}"/"${KV_MAJOR}"-0013-Reinstate-default-Hz-of-100-in-combination-with-MuQS.patch + "${FILESDIR}"/"${KV_MAJOR}"-0014-Swap-sucks.patch + "${FILESDIR}"/"${KV_MAJOR}"-0015-unfuck-MuQSS-on-linux-4_19_10+.patch +) + +S="${WORKDIR}"/linux-"${PV}" + +pkg_setup() { + export KBUILD_BUILD_USER="nexus" + export KBUILD_BUILD_HOST="nexus.redcorelinux.org" + + export REAL_ARCH="$ARCH" + unset ARCH ; unset LDFLAGS #will interfere with Makefile if set +} + +src_prepare() { + default + emake mrproper + sed -ri "s|^(EXTRAVERSION =).*|\1 -${EXTRAVERSION}|" Makefile + cp "${FILESDIR}"/"${KV_MAJOR}"-"${EXTRAVERSION}"-amd64.config .config + rm -rf $(find . -type f|grep -F \.orig) +} + +src_compile() { + emake prepare modules_prepare bzImage modules +} + +src_install() { + dodir boot + insinto boot + newins .config config-"${KV_FULL}" + newins System.map System.map-"${KV_FULL}" + newins arch/x86/boot/bzImage vmlinuz-"${KV_FULL}" + + dodir usr/src/linux-"${KV_FULL}" + insinto usr/src/linux-"${KV_FULL}" + doins Module.symvers + doins System.map + exeinto usr/src/linux-"${KV_FULL}" + doexe vmlinux + + emake INSTALL_MOD_PATH="${D}" modules_install + + rm -f "${D}"lib/modules/"${KV_FULL}"/build + rm -f "${D}"lib/modules/"${KV_FULL}"/source + export local KSYMS + for KSYMS in build source ; do + dosym ../../../usr/src/linux-"${KV_FULL}" lib/modules/"${KV_FULL}"/"${KSYMS}" + done +} + +_grub2_update_grubcfg() { + if [[ -x $(which grub2-mkconfig) ]]; then + elog "Updating GRUB-2 bootloader configuration, please wait" + grub2-mkconfig -o "${ROOT}"boot/grub/grub.cfg + else + elog "It looks like you're not using GRUB-2, you must update bootloader configuration by hand" + fi +} + +_dracut_initrd_create() { + if [[ -x $(which dracut) ]]; then + elog "Generating initrd for "${KV_FULL}", please wait" + addpredict /etc/ld.so.cache~ + dracut -N -f --kver="${KV_FULL}" "${ROOT}"boot/initrd-"${KV_FULL}" + else + elog "It looks like you're not using dracut, you must generate an initrd by hand" + fi +} + +_dracut_initrd_delete() { + rm -rf "${ROOT}"boot/initrd-"${KV_FULL}" +} + +_dkms_modules_delete() { + if [[ -x $(which dkms) ]] ; then + export local DKMSMOD + for DKMSMOD in $(dkms status | cut -d " " -f1,2 | sed -e 's/,//g' | sed -e 's/ /\//g' | sed -e 's/://g') ; do + dkms remove "${DKMSMOD}" -k "${KV_FULL}" + done + fi +} + +_kernel_modules_delete() { + rm -rf "${ROOT}"lib/modules/"${KV_FULL}" +} + +pkg_postinst() { + if [ $(stat -c %d:%i /) == $(stat -c %d:%i /proc/1/root/.) ]; then + if use dracut; then + _dracut_initrd_create + fi + _grub2_update_grubcfg + fi +} + +pkg_postrm() { + if [ $(stat -c %d:%i /) == $(stat -c %d:%i /proc/1/root/.) ]; then + if use dracut; then + _dracut_initrd_delete + fi + _grub2_update_grubcfg + fi + if use dkms; then + _dkms_modules_delete + fi + _kernel_modules_delete +} |