From c15ee3945bfefa1a62143fb606a1f865fdbac94d Mon Sep 17 00:00:00 2001 From: Sangbum Kim Date: Fri, 28 Jul 2017 12:57:24 +0900 Subject: [PATCH] bump up --- PKGBUILD | 4 +- config.saved.x86_64 | 168 +- linux-spica.install | 2 +- ...onfig-build-bits-for-BFQ-v7r11-4.10..patch | 103 - ...e-the-BFQ-v7r11-I-O-sched-for-4.10.0.patch | 7109 ------------- ...rly-Queue-Merge-EQM-to-BFQ-v7r11-for.patch | 1101 -- ...-for-4.10.0-into-BFQ-v8r8-for-4.10.0.patch | 9187 ----------------- ...izations_for_gcc_v4.9+_kernel_v3.15+.patch | 533 - ...f_broadcasted_mce_after_system_panic.patch | 43 - 9 files changed, 127 insertions(+), 18123 deletions(-) delete mode 100644 patches/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.10..patch delete mode 100644 patches/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.10.0.patch delete mode 100644 patches/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch delete mode 100644 patches/0004-Turn-BFQ-v7r11-for-4.10.0-into-BFQ-v8r8-for-4.10.0.patch delete mode 100644 patches/enable_additional_cpu_optimizations_for_gcc_v4.9+_kernel_v3.15+.patch delete mode 100644 patches/mce_Keep_quiet_in_case_of_broadcasted_mce_after_system_panic.patch diff --git a/PKGBUILD b/PKGBUILD index 67906a2..21c9ae4 100644 --- a/PKGBUILD +++ b/PKGBUILD @@ -9,7 +9,7 @@ pkgname=$pkgbase pkgdesc="The Linux Kernel and modules from Linus' git tree" depends=('coreutils' 'linux-firmware' 'module-init-tools' 'mkinitcpio') -pkgver=4.11.rc7 +pkgver=4.13.rc1 pkgrel=1 url="http://www.kernel.org/" arch=(i686 x86_64) @@ -24,7 +24,7 @@ sha256sums=('SKIP') # set _gitrev to a git revision (man gitrevisions) like a tag, a commit sha1 # hash or a branch name to build from this tree instead of master -_gitrev="v4.10.12" +_gitrev="v4.12.4" #################################################################### # KERNEL CONFIG FILES diff --git a/config.saved.x86_64 b/config.saved.x86_64 index 5867120..2c0a382 100644 --- a/config.saved.x86_64 +++ b/config.saved.x86_64 @@ -1,6 +1,6 
@@ # # Automatically generated file; DO NOT EDIT. -# Linux/x86 4.10.12 Kernel Configuration +# Linux/x86 4.12.4 Kernel Configuration # CONFIG_64BIT=y CONFIG_X86_64=y @@ -41,7 +41,6 @@ CONFIG_HAVE_INTEL_TXT=y CONFIG_X86_64_SMP=y CONFIG_ARCH_SUPPORTS_UPROBES=y CONFIG_FIX_EARLYCON_MEM=y -CONFIG_DEBUG_RODATA=y CONFIG_PGTABLE_LEVELS=4 CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config" CONFIG_IRQ_WORK=y @@ -128,21 +127,23 @@ CONFIG_BSD_PROCESS_ACCT_V3=y CONFIG_TREE_RCU=y CONFIG_RCU_EXPERT=y CONFIG_SRCU=y +# CONFIG_CLASSIC_SRCU is not set +CONFIG_TREE_SRCU=y # CONFIG_TASKS_RCU is not set CONFIG_RCU_STALL_COMMON=y +CONFIG_RCU_NEED_SEGCBLIST=y CONFIG_RCU_FANOUT=64 CONFIG_RCU_FANOUT_LEAF=16 # CONFIG_RCU_FAST_NO_HZ is not set # CONFIG_TREE_RCU_TRACE is not set CONFIG_RCU_KTHREAD_PRIO=0 # CONFIG_RCU_NOCB_CPU is not set -# CONFIG_RCU_EXPEDITE_BOOT is not set CONFIG_BUILD_BIN2C=y CONFIG_IKCONFIG=y CONFIG_IKCONFIG_PROC=y CONFIG_LOG_BUF_SHIFT=19 CONFIG_LOG_CPU_MAX_BUF_SHIFT=12 -CONFIG_NMI_LOG_BUF_SHIFT=13 +CONFIG_PRINTK_SAFE_LOG_BUF_SHIFT=12 CONFIG_HAVE_UNSTABLE_SCHED_CLOCK=y CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH=y @@ -161,6 +162,7 @@ CONFIG_FAIR_GROUP_SCHED=y # CONFIG_CFS_BANDWIDTH is not set # CONFIG_RT_GROUP_SCHED is not set CONFIG_CGROUP_PIDS=y +CONFIG_CGROUP_RDMA=y CONFIG_CGROUP_FREEZER=y CONFIG_CGROUP_HUGETLB=y CONFIG_CPUSETS=y @@ -189,7 +191,6 @@ CONFIG_RD_GZIP=y # CONFIG_RD_XZ is not set # CONFIG_RD_LZO is not set CONFIG_RD_LZ4=y -CONFIG_INITRAMFS_COMPRESSION=".gz" CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE=y # CONFIG_CC_OPTIMIZE_FOR_SIZE is not set CONFIG_SYSCTL=y @@ -223,6 +224,7 @@ CONFIG_PCI_QUIRKS=y CONFIG_MEMBARRIER=y CONFIG_EMBEDDED=y CONFIG_HAVE_PERF_EVENTS=y +# CONFIG_PC104 is not set # # Kernel Performance Events And Counters @@ -231,6 +233,7 @@ CONFIG_PERF_EVENTS=y # CONFIG_DEBUG_PERF_USE_VMALLOC is not set CONFIG_VM_EVENT_COUNTERS=y # CONFIG_SLUB_DEBUG is not set +# CONFIG_SLUB_MEMCG_SYSFS_ON is not set # 
CONFIG_COMPAT_BRK is not set # CONFIG_SLAB is not set CONFIG_SLUB=y @@ -256,6 +259,7 @@ CONFIG_HAVE_NMI=y CONFIG_HAVE_ARCH_TRACEHOOK=y CONFIG_HAVE_DMA_CONTIGUOUS=y CONFIG_GENERIC_SMP_IDLE_THREAD=y +CONFIG_ARCH_HAS_SET_MEMORY=y CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT=y CONFIG_HAVE_REGS_AND_STACK_ACCESS_API=y CONFIG_HAVE_CLK=y @@ -285,6 +289,7 @@ CONFIG_HAVE_CONTEXT_TRACKING=y CONFIG_HAVE_VIRT_CPU_ACCOUNTING_GEN=y CONFIG_HAVE_IRQ_TIME_ACCOUNTING=y CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE=y +CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD=y CONFIG_HAVE_ARCH_HUGE_VMAP=y CONFIG_HAVE_ARCH_SOFT_DIRTY=y CONFIG_MODULES_USE_ELF_RELA=y @@ -300,6 +305,11 @@ CONFIG_HAVE_STACK_VALIDATION=y # CONFIG_CPU_NO_EFFICIENT_FFS is not set CONFIG_HAVE_ARCH_VMAP_STACK=y CONFIG_VMAP_STACK=y +# CONFIG_ARCH_OPTIONAL_KERNEL_RWX is not set +# CONFIG_ARCH_OPTIONAL_KERNEL_RWX_DEFAULT is not set +CONFIG_ARCH_HAS_STRICT_KERNEL_RWX=y +CONFIG_STRICT_KERNEL_RWX=y +CONFIG_ARCH_HAS_STRICT_MODULE_RWX=y # # GCOV-based kernel profiling @@ -311,15 +321,18 @@ CONFIG_BASE_SMALL=0 # CONFIG_MODULES is not set CONFIG_MODULES_TREE_LOOKUP=y CONFIG_BLOCK=y +CONFIG_BLK_SCSI_REQUEST=y CONFIG_BLK_DEV_BSG=y # CONFIG_BLK_DEV_BSGLIB is not set # CONFIG_BLK_DEV_INTEGRITY is not set # CONFIG_BLK_DEV_ZONED is not set CONFIG_BLK_DEV_THROTTLING=y +# CONFIG_BLK_DEV_THROTTLING_LOW is not set # CONFIG_BLK_CMDLINE_PARSER is not set CONFIG_BLK_WBT=y CONFIG_BLK_WBT_SQ=y CONFIG_BLK_WBT_MQ=y +# CONFIG_BLK_SED_OPAL is not set # # Partition Types @@ -350,13 +363,15 @@ CONFIG_BLK_MQ_PCI=y # IO Schedulers # CONFIG_IOSCHED_NOOP=y -# CONFIG_IOSCHED_DEADLINE is not set +CONFIG_IOSCHED_DEADLINE=y # CONFIG_IOSCHED_CFQ is not set +CONFIG_DEFAULT_DEADLINE=y +# CONFIG_DEFAULT_NOOP is not set +CONFIG_DEFAULT_IOSCHED="deadline" +CONFIG_MQ_IOSCHED_DEADLINE=y +CONFIG_MQ_IOSCHED_KYBER=y CONFIG_IOSCHED_BFQ=y CONFIG_BFQ_GROUP_IOSCHED=y -CONFIG_DEFAULT_BFQ=y -# CONFIG_DEFAULT_NOOP is not set -CONFIG_DEFAULT_IOSCHED="bfq" CONFIG_PADATA=y CONFIG_ASN1=y 
CONFIG_INLINE_SPIN_UNLOCK_IRQ=y @@ -449,9 +464,9 @@ CONFIG_X86_LOCAL_APIC=y CONFIG_X86_IO_APIC=y CONFIG_X86_REROUTE_FOR_BROKEN_BOOT_IRQS=y CONFIG_X86_MCE=y +# CONFIG_X86_MCELOG_LEGACY is not set CONFIG_X86_MCE_INTEL=y CONFIG_X86_MCE_THRESHOLD=y -# CONFIG_X86_MCE_INJECT is not set CONFIG_X86_THERMAL_VECTOR=y # @@ -569,9 +584,7 @@ CONFIG_X86_NEED_RELOCS=y CONFIG_PHYSICAL_ALIGN=0x1000000 CONFIG_RANDOMIZE_MEMORY=y CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING=0x0 -CONFIG_HOTPLUG_CPU=y -# CONFIG_BOOTPARAM_HOTPLUG_CPU0 is not set -# CONFIG_DEBUG_HOTPLUG_CPU0 is not set +# CONFIG_HOTPLUG_CPU is not set # CONFIG_LEGACY_VSYSCALL_NATIVE is not set # CONFIG_LEGACY_VSYSCALL_EMULATE is not set CONFIG_LEGACY_VSYSCALL_NONE=y @@ -609,7 +622,6 @@ CONFIG_ACPI_PROCESSOR_IDLE=y CONFIG_ACPI_CPPC_LIB=y CONFIG_ACPI_PROCESSOR=y CONFIG_ACPI_IPMI=y -CONFIG_ACPI_HOTPLUG_CPU=y CONFIG_ACPI_PROCESSOR_AGGREGATOR=y CONFIG_ACPI_THERMAL=y CONFIG_ACPI_NUMA=y @@ -697,8 +709,9 @@ CONFIG_PCIE_ECRC=y CONFIG_PCIEAER_INJECT=y CONFIG_PCIEASPM=y # CONFIG_PCIEASPM_DEBUG is not set -CONFIG_PCIEASPM_DEFAULT=y +# CONFIG_PCIEASPM_DEFAULT is not set # CONFIG_PCIEASPM_POWERSAVE is not set +CONFIG_PCIEASPM_POWER_SUPERSAVE=y # CONFIG_PCIEASPM_PERFORMANCE is not set CONFIG_PCIE_PME=y # CONFIG_PCIE_DPC is not set @@ -722,10 +735,24 @@ CONFIG_HOTPLUG_PCI_ACPI=y CONFIG_HOTPLUG_PCI_SHPC=y # -# PCI host controller drivers +# DesignWare PCI Core Support # # CONFIG_PCIE_DW_PLAT is not set + +# +# PCI host controller drivers +# # CONFIG_VMD is not set + +# +# PCI Endpoint +# +# CONFIG_PCI_ENDPOINT is not set + +# +# PCI switch controller drivers +# +# CONFIG_PCI_SW_SWITCHTEC is not set # CONFIG_ISA_BUS is not set CONFIG_ISA_DMA_API=y # CONFIG_PCCARD is not set @@ -744,7 +771,6 @@ CONFIG_BINFMT_MISC=y # CONFIG_IA32_EMULATION is not set # CONFIG_X86_X32 is not set CONFIG_X86_DEV_DMA_OPS=y -CONFIG_PMC_ATOM=y CONFIG_NET=y CONFIG_NET_INGRESS=y CONFIG_NET_EGRESS=y @@ -757,6 +783,7 @@ CONFIG_PACKET_DIAG=y CONFIG_UNIX=y 
CONFIG_UNIX_DIAG=y CONFIG_XFRM=y +CONFIG_XFRM_OFFLOAD=y CONFIG_XFRM_ALGO=y # CONFIG_XFRM_USER is not set # CONFIG_XFRM_SUB_POLICY is not set @@ -822,6 +849,7 @@ CONFIG_IPV6_ROUTE_INFO=y CONFIG_IPV6_OPTIMISTIC_DAD=y CONFIG_INET6_AH=y CONFIG_INET6_ESP=y +CONFIG_INET6_ESP_OFFLOAD=y CONFIG_INET6_IPCOMP=y CONFIG_IPV6_MIP6=y CONFIG_IPV6_ILA=y @@ -1010,6 +1038,7 @@ CONFIG_BRIDGE=y CONFIG_BRIDGE_IGMP_SNOOPING=y CONFIG_BRIDGE_VLAN_FILTERING=y CONFIG_HAVE_NET_DSA=y +# CONFIG_NET_DSA is not set CONFIG_VLAN_8021Q=y CONFIG_VLAN_8021Q_GVRP=y CONFIG_VLAN_8021Q_MVRP=y @@ -1052,6 +1081,7 @@ CONFIG_NET_SCH_HHF=y CONFIG_NET_SCH_PIE=y CONFIG_NET_SCH_INGRESS=y CONFIG_NET_SCH_PLUG=y +# CONFIG_NET_SCH_DEFAULT is not set # # Classification @@ -1084,6 +1114,7 @@ CONFIG_NET_ACT_POLICE=y CONFIG_NET_ACT_GACT=y CONFIG_GACT_PROB=y CONFIG_NET_ACT_MIRRED=y +# CONFIG_NET_ACT_SAMPLE is not set CONFIG_NET_ACT_IPT=y CONFIG_NET_ACT_NAT=y CONFIG_NET_ACT_PEDIT=y @@ -1136,9 +1167,12 @@ CONFIG_FIB_RULES=y # CONFIG_CAIF is not set # CONFIG_CEPH_LIB is not set # CONFIG_NFC is not set +# CONFIG_PSAMPLE is not set +# CONFIG_NET_IFE is not set CONFIG_LWTUNNEL=y CONFIG_LWTUNNEL_BPF=y CONFIG_DST_CACHE=y +CONFIG_GRO_CELLS=y CONFIG_NET_DEVLINK=y CONFIG_MAY_USE_DEVLINK=y CONFIG_HAVE_EBPF_JIT=y @@ -1207,7 +1241,6 @@ CONFIG_BLK_DEV_CRYPTOLOOP=y # CONFIG_BLK_DEV_RAM is not set # CONFIG_CDROM_PKTCDVD is not set # CONFIG_ATA_OVER_ETH is not set -# CONFIG_BLK_DEV_HD is not set # CONFIG_BLK_DEV_RBD is not set # CONFIG_BLK_DEV_RSXX is not set # CONFIG_BLK_DEV_NVME is not set @@ -1237,6 +1270,7 @@ CONFIG_BLK_DEV_CRYPTOLOOP=y # CONFIG_DS1682 is not set # CONFIG_USB_SWITCH_FSA9480 is not set # CONFIG_SRAM is not set +# CONFIG_PCI_ENDPOINT_TEST is not set # CONFIG_C2PORT is not set # @@ -1246,6 +1280,7 @@ CONFIG_EEPROM_AT24=y # CONFIG_EEPROM_LEGACY is not set # CONFIG_EEPROM_MAX6875 is not set # CONFIG_EEPROM_93CX6 is not set +# CONFIG_EEPROM_IDT_89HPESX is not set # CONFIG_CB710_CORE is not set # @@ -1378,6 +1413,7 @@ 
CONFIG_NET_CORE=y # CONFIG_NET_TEAM is not set # CONFIG_MACVLAN is not set CONFIG_IPVLAN=y +CONFIG_IPVTAP=y # CONFIG_VXLAN is not set # CONFIG_GENEVE is not set # CONFIG_GTP is not set @@ -1386,6 +1422,7 @@ CONFIG_MACSEC=y # CONFIG_NETPOLL is not set # CONFIG_NET_POLL_CONTROLLER is not set # CONFIG_TUN is not set +CONFIG_TAP=y # CONFIG_TUN_VNET_CROSS_LE is not set # CONFIG_VETH is not set # CONFIG_NLMON is not set @@ -1408,6 +1445,7 @@ CONFIG_ETHERNET=y # CONFIG_ALTERA_TSE is not set # CONFIG_NET_VENDOR_AMAZON is not set # CONFIG_NET_VENDOR_AMD is not set +# CONFIG_NET_VENDOR_AQUANTIA is not set # CONFIG_NET_VENDOR_ARC is not set # CONFIG_NET_VENDOR_ATHEROS is not set # CONFIG_NET_VENDOR_AURORA is not set @@ -1469,25 +1507,20 @@ CONFIG_R8169=y # CONFIG_NET_VENDOR_SMSC is not set # CONFIG_NET_VENDOR_STMICRO is not set # CONFIG_NET_VENDOR_SUN is not set -# CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_NET_VENDOR_TEHUTI is not set # CONFIG_NET_VENDOR_TI is not set # CONFIG_NET_VENDOR_VIA is not set # CONFIG_NET_VENDOR_WIZNET is not set +# CONFIG_NET_VENDOR_SYNOPSYS is not set # CONFIG_FDDI is not set # CONFIG_HIPPI is not set # CONFIG_NET_SB1000 is not set +CONFIG_MDIO_DEVICE=y +# CONFIG_MDIO_BITBANG is not set +# CONFIG_MDIO_THUNDER is not set CONFIG_PHYLIB=y # CONFIG_LED_TRIGGER_PHY is not set -# -# MDIO bus device drivers -# -# CONFIG_MDIO_BCM_UNIMAC is not set -# CONFIG_MDIO_BITBANG is not set -# CONFIG_MDIO_OCTEON is not set -# CONFIG_MDIO_THUNDER is not set - # # MII PHY device drivers # @@ -1575,6 +1608,7 @@ CONFIG_INPUT_KEYBOARD=y # CONFIG_KEYBOARD_SAMSUNG is not set # CONFIG_KEYBOARD_STOWAWAY is not set # CONFIG_KEYBOARD_SUNKBD is not set +# CONFIG_KEYBOARD_TM2_TOUCHKEY is not set # CONFIG_KEYBOARD_XTKBD is not set # CONFIG_INPUT_MOUSE is not set # CONFIG_INPUT_JOYSTICK is not set @@ -1586,7 +1620,6 @@ CONFIG_INPUT_MISC=y # CONFIG_INPUT_E3X0_BUTTON is not set CONFIG_INPUT_PCSPKR=y # CONFIG_INPUT_MMA8450 is not set -# CONFIG_INPUT_MPU3050 is not set # 
CONFIG_INPUT_APANEL is not set # CONFIG_INPUT_ATLAS_BTNS is not set # CONFIG_INPUT_ATI_REMOTE2 is not set @@ -1651,6 +1684,7 @@ CONFIG_SERIAL_8250_PNP=y CONFIG_SERIAL_8250_CONSOLE=y CONFIG_SERIAL_8250_DMA=y CONFIG_SERIAL_8250_PCI=y +# CONFIG_SERIAL_8250_EXAR is not set CONFIG_SERIAL_8250_NR_UARTS=4 CONFIG_SERIAL_8250_RUNTIME_UARTS=4 # CONFIG_SERIAL_8250_EXTENDED is not set @@ -1675,6 +1709,7 @@ CONFIG_SERIAL_CORE_CONSOLE=y # CONFIG_SERIAL_ARC is not set # CONFIG_SERIAL_RP2 is not set # CONFIG_SERIAL_FSL_LPUART is not set +# CONFIG_SERIAL_DEV_BUS is not set # CONFIG_TTY_PRINTK is not set CONFIG_IPMI_HANDLER=y CONFIG_IPMI_PANIC_EVENT=y @@ -1823,6 +1858,7 @@ CONFIG_PINCTRL=y # CONFIG_DEBUG_PINCTRL is not set # CONFIG_PINCTRL_CHERRYVIEW is not set # CONFIG_PINCTRL_BROXTON is not set +# CONFIG_PINCTRL_GEMINILAKE is not set # CONFIG_PINCTRL_SUNRISEPOINT is not set # CONFIG_GPIOLIB is not set # CONFIG_W1 is not set @@ -1837,6 +1873,7 @@ CONFIG_POWER_SUPPLY=y # CONFIG_BATTERY_DS2781 is not set # CONFIG_BATTERY_DS2782 is not set # CONFIG_BATTERY_SBS is not set +# CONFIG_CHARGER_SBS is not set # CONFIG_BATTERY_BQ27XXX is not set # CONFIG_BATTERY_MAX17040 is not set # CONFIG_BATTERY_MAX17042 is not set @@ -1872,6 +1909,7 @@ CONFIG_HWMON_VID=y # CONFIG_SENSORS_K10TEMP is not set # CONFIG_SENSORS_APPLESMC is not set # CONFIG_SENSORS_ASB100 is not set +CONFIG_SENSORS_ASPEED=y # CONFIG_SENSORS_ATXP1 is not set # CONFIG_SENSORS_DS620 is not set # CONFIG_SENSORS_DS1621 is not set @@ -1952,6 +1990,7 @@ CONFIG_SENSORS_NCT6775=y # CONFIG_SENSORS_SCH56XX_COMMON is not set # CONFIG_SENSORS_SCH5627 is not set # CONFIG_SENSORS_SCH5636 is not set +# CONFIG_SENSORS_STTS751 is not set # CONFIG_SENSORS_SMM665 is not set # CONFIG_SENSORS_ADC128D818 is not set # CONFIG_SENSORS_ADS1015 is not set @@ -1988,6 +2027,7 @@ CONFIG_SENSORS_NCT6775=y CONFIG_SENSORS_ACPI_POWER=y # CONFIG_SENSORS_ATK0110 is not set CONFIG_THERMAL=y +CONFIG_THERMAL_EMERGENCY_POWEROFF_DELAY_MS=0 CONFIG_THERMAL_HWMON=y 
CONFIG_THERMAL_WRITABLE_TRIPS=y CONFIG_THERMAL_DEFAULT_GOV_STEP_WISE=y @@ -2057,6 +2097,7 @@ CONFIG_WATCHDOG_SYSFS=y # CONFIG_MACHZ_WDT is not set # CONFIG_SBC_EPX_C3_WATCHDOG is not set # CONFIG_NI903X_WDT is not set +# CONFIG_NIC7018_WDT is not set # # PCI-based Watchdog Cards @@ -2107,6 +2148,7 @@ CONFIG_MFD_CORE=y # CONFIG_MFD_INTEL_QUARK_I2C_GPIO is not set CONFIG_LPC_ICH=y # CONFIG_LPC_SCH is not set +# CONFIG_INTEL_SOC_PMIC_BXTWC is not set # CONFIG_MFD_INTEL_LPSS_ACPI is not set # CONFIG_MFD_INTEL_LPSS_PCI is not set # CONFIG_MFD_JANZ_CMODIO is not set @@ -2141,6 +2183,7 @@ CONFIG_LPC_ICH=y # CONFIG_MFD_TI_AM335X_TSCADC is not set # CONFIG_MFD_LP3943 is not set # CONFIG_MFD_LP8788 is not set +# CONFIG_MFD_TI_LMU is not set # CONFIG_MFD_PALMAS is not set # CONFIG_TPS6105X is not set # CONFIG_TPS6507X is not set @@ -2176,9 +2219,11 @@ CONFIG_VGA_ARB_MAX_GPUS=16 CONFIG_DRM=y CONFIG_DRM_DP_AUX_CHARDEV=y # CONFIG_DRM_DEBUG_MM is not set +# CONFIG_DRM_DEBUG_MM_SELFTEST is not set CONFIG_DRM_KMS_HELPER=y CONFIG_DRM_KMS_FB_HELPER=y CONFIG_DRM_FBDEV_EMULATION=y +CONFIG_DRM_FBDEV_OVERALLOC=100 # CONFIG_DRM_LOAD_EDID_FIRMWARE is not set CONFIG_DRM_TTM=y @@ -2212,7 +2257,9 @@ CONFIG_DRM_BRIDGE=y # # CONFIG_DRM_ANALOGIX_ANX78XX is not set # CONFIG_DRM_HISI_HIBMC is not set +# CONFIG_DRM_TINYDRM is not set # CONFIG_DRM_LEGACY is not set +# CONFIG_DRM_LIB_RANDOM is not set # # Frame buffer Devices @@ -2230,6 +2277,7 @@ CONFIG_FB_CFB_IMAGEBLIT=y CONFIG_FB_SYS_FILLRECT=y CONFIG_FB_SYS_COPYAREA=y CONFIG_FB_SYS_IMAGEBLIT=y +# CONFIG_FB_PROVIDE_GET_FB_UNMAPPED_AREA is not set # CONFIG_FB_FOREIGN_ENDIAN is not set CONFIG_FB_SYS_FOPS=y CONFIG_FB_DEFERRED_IO=y @@ -2296,6 +2344,7 @@ CONFIG_HDMI=y CONFIG_VGA_CONSOLE=y CONFIG_VGACON_SOFT_SCROLLBACK=y CONFIG_VGACON_SOFT_SCROLLBACK_SIZE=512 +CONFIG_VGACON_SOFT_SCROLLBACK_PERSISTENT_ENABLE_BY_DEFAULT=y CONFIG_DUMMY_CONSOLE=y CONFIG_DUMMY_CONSOLE_COLUMNS=80 CONFIG_DUMMY_CONSOLE_ROWS=25 @@ -2318,9 +2367,11 @@ CONFIG_HID_GENERIC=y # 
Special HID drivers # # CONFIG_HID_A4TECH is not set +# CONFIG_HID_ACCUTOUCH is not set # CONFIG_HID_ACRUX is not set # CONFIG_HID_APPLE is not set # CONFIG_HID_APPLEIR is not set +# CONFIG_HID_ASUS is not set # CONFIG_HID_AUREAL is not set # CONFIG_HID_BELKIN is not set # CONFIG_HID_BETOP_FF is not set @@ -2355,6 +2406,7 @@ CONFIG_HID_GENERIC=y # CONFIG_HID_MICROSOFT is not set # CONFIG_HID_MONTEREY is not set # CONFIG_HID_MULTITOUCH is not set +# CONFIG_HID_NTI is not set # CONFIG_HID_NTRIG is not set # CONFIG_HID_ORTEK is not set # CONFIG_HID_PANTHERLORD is not set @@ -2407,6 +2459,7 @@ CONFIG_USB_SUPPORT=y CONFIG_USB_COMMON=y CONFIG_USB_ARCH_HAS_HCD=y CONFIG_USB=y +# CONFIG_USB_PCI is not set CONFIG_USB_ANNOUNCE_NEW_DEVICES=y # @@ -2426,7 +2479,6 @@ CONFIG_USB_DYNAMIC_MINORS=y # # CONFIG_USB_C67X00_HCD is not set CONFIG_USB_XHCI_HCD=y -CONFIG_USB_XHCI_PCI=y # CONFIG_USB_XHCI_PLATFORM is not set # CONFIG_USB_EHCI_HCD is not set # CONFIG_USB_OXU210HP_HCD is not set @@ -2434,7 +2486,6 @@ CONFIG_USB_XHCI_PCI=y # CONFIG_USB_ISP1362_HCD is not set # CONFIG_USB_FOTG210_HCD is not set # CONFIG_USB_OHCI_HCD is not set -# CONFIG_USB_UHCI_HCD is not set # CONFIG_USB_SL811_HCD is not set # CONFIG_USB_R8A66597_HCD is not set # CONFIG_USB_HCD_TEST_MODE is not set @@ -2510,6 +2561,7 @@ CONFIG_USB_UAS=y # CONFIG_USB_ISIGHTFW is not set # CONFIG_USB_YUREX is not set # CONFIG_USB_EZUSB_FX2 is not set +# CONFIG_USB_HUB_USB251XB is not set # CONFIG_USB_HSIC_USB3503 is not set # CONFIG_USB_HSIC_USB4604 is not set # CONFIG_USB_LINK_LAYER_TEST is not set @@ -2523,6 +2575,10 @@ CONFIG_USB_UAS=y # CONFIG_NOP_USB_XCEIV is not set # CONFIG_USB_ISP1301 is not set # CONFIG_USB_GADGET is not set + +# +# USB Power Delivery and Type-C drivers +# # CONFIG_USB_LED_TRIG is not set # CONFIG_USB_ULPI_BUS is not set # CONFIG_UWB is not set @@ -2531,6 +2587,7 @@ CONFIG_USB_UAS=y CONFIG_NEW_LEDS=y CONFIG_LEDS_CLASS=y CONFIG_LEDS_CLASS_FLASH=y +CONFIG_LEDS_BRIGHTNESS_HW_CHANGED=y # # LED drivers @@ 
-2586,7 +2643,6 @@ CONFIG_EDAC_SUPPORT=y CONFIG_EDAC=y # CONFIG_EDAC_LEGACY_SYSFS is not set # CONFIG_EDAC_DEBUG is not set -CONFIG_EDAC_MM_EDAC=y # CONFIG_EDAC_GHES is not set # CONFIG_EDAC_E752X is not set # CONFIG_EDAC_I82975X is not set @@ -2601,6 +2657,7 @@ CONFIG_EDAC_MM_EDAC=y # CONFIG_EDAC_I7300 is not set CONFIG_EDAC_SBRIDGE=y # CONFIG_EDAC_SKX is not set +# CONFIG_EDAC_PND2 is not set CONFIG_RTC_LIB=y CONFIG_RTC_MC146818_LIB=y CONFIG_RTC_CLASS=y @@ -2726,7 +2783,9 @@ CONFIG_DCA=y # # Microsoft Hyper-V guest support # +# CONFIG_HYPERV_TSCPAGE is not set CONFIG_STAGING=y +# CONFIG_COMEDI is not set # CONFIG_RTS5208 is not set # CONFIG_FB_SM750 is not set # CONFIG_FB_XGI is not set @@ -2746,10 +2805,17 @@ CONFIG_STAGING=y # CONFIG_UNISYSSPAR is not set # CONFIG_MOST is not set # CONFIG_GREYBUS is not set + +# +# USB Power Delivery and Type-C drivers +# +# CONFIG_TYPEC_TCPM is not set CONFIG_X86_PLATFORM_DEVICES=y # CONFIG_ACERHDF is not set # CONFIG_ALIENWARE_WMI is not set +# CONFIG_DELL_WMI is not set # CONFIG_DELL_WMI_AIO is not set +# CONFIG_DELL_WMI_LED is not set # CONFIG_DELL_SMO8800 is not set # CONFIG_FUJITSU_TABLET is not set # CONFIG_HP_ACCEL is not set @@ -2764,6 +2830,7 @@ CONFIG_ACPI_WMI=y # CONFIG_TOSHIBA_HAPS is not set # CONFIG_TOSHIBA_WMI is not set # CONFIG_ACPI_CMPC is not set +# CONFIG_INTEL_CHT_INT33FE is not set # CONFIG_INTEL_HID_EVENT is not set # CONFIG_INTEL_VBTN is not set CONFIG_INTEL_IPS=y @@ -2780,6 +2847,8 @@ CONFIG_INTEL_PUNIT_IPC=y # CONFIG_INTEL_TELEMETRY is not set # CONFIG_MLX_PLATFORM is not set # CONFIG_MLX_CPLD_PLATFORM is not set +CONFIG_INTEL_TURBO_MAX_3=y +CONFIG_PMC_ATOM=y # CONFIG_CHROME_PLATFORMS is not set CONFIG_CLKDEV_LOOKUP=y CONFIG_HAVE_CLK_PREPARE=y @@ -2794,13 +2863,6 @@ CONFIG_COMMON_CLK=y # CONFIG_COMMON_CLK_NXP is not set # CONFIG_COMMON_CLK_PXA is not set # CONFIG_COMMON_CLK_PIC32 is not set -# CONFIG_COMMON_CLK_MT2701 is not set -# CONFIG_COMMON_CLK_MT2701_MMSYS is not set -# 
CONFIG_COMMON_CLK_MT2701_IMGSYS is not set -# CONFIG_COMMON_CLK_MT2701_VDECSYS is not set -# CONFIG_COMMON_CLK_MT2701_HIFSYS is not set -# CONFIG_COMMON_CLK_MT2701_ETHSYS is not set -# CONFIG_COMMON_CLK_MT2701_BDPSYS is not set # # Hardware Spinlock drivers @@ -2851,8 +2913,13 @@ CONFIG_IRQ_REMAP=y # # Broadcom SoC drivers # + +# +# i.MX SoC drivers +# # CONFIG_SUNXI_SRAM is not set # CONFIG_SOC_TI is not set +# CONFIG_SOC_ZTE is not set # CONFIG_PM_DEVFREQ is not set # CONFIG_EXTCON is not set # CONFIG_MEMORY is not set @@ -2892,6 +2959,7 @@ CONFIG_ND_BLK=y CONFIG_ND_CLAIM=y CONFIG_ND_BTT=y CONFIG_BTT=y +CONFIG_DAX=y # CONFIG_DEV_DAX is not set CONFIG_NVMEM=y # CONFIG_STM is not set @@ -2902,10 +2970,14 @@ CONFIG_NVMEM=y # # CONFIG_FPGA is not set +# +# FSI support +# +# CONFIG_FSI is not set + # # Firmware Drivers # -# CONFIG_ARM_SCPI_PROTOCOL is not set # CONFIG_EDD is not set CONFIG_FIRMWARE_MEMMAP=y # CONFIG_DELL_RBU is not set @@ -2986,7 +3058,8 @@ CONFIG_FANOTIFY=y # CONFIG_QUOTA is not set # CONFIG_QUOTACTL is not set CONFIG_AUTOFS4_FS=y -# CONFIG_FUSE_FS is not set +CONFIG_FUSE_FS=y +CONFIG_CUSE=y CONFIG_OVERLAY_FS=y CONFIG_OVERLAY_FS_REDIRECT_DIR=y @@ -3177,6 +3250,7 @@ CONFIG_STACK_VALIDATION=y # CONFIG_DEBUG_FORCE_WEAK_PER_CPU is not set CONFIG_MAGIC_SYSRQ=y CONFIG_MAGIC_SYSRQ_DEFAULT_ENABLE=0x1 +CONFIG_MAGIC_SYSRQ_SERIAL=y CONFIG_DEBUG_KERNEL=y # @@ -3185,12 +3259,14 @@ CONFIG_DEBUG_KERNEL=y # CONFIG_PAGE_EXTENSION is not set # CONFIG_DEBUG_PAGEALLOC is not set # CONFIG_PAGE_POISONING is not set +# CONFIG_DEBUG_RODATA_TEST is not set # CONFIG_DEBUG_OBJECTS is not set # CONFIG_SLUB_STATS is not set CONFIG_HAVE_DEBUG_KMEMLEAK=y # CONFIG_DEBUG_KMEMLEAK is not set # CONFIG_DEBUG_STACK_USAGE is not set # CONFIG_DEBUG_VM is not set +CONFIG_ARCH_HAS_DEBUG_VIRTUAL=y # CONFIG_DEBUG_VIRTUAL is not set # CONFIG_DEBUG_MEMORY_INIT is not set # CONFIG_DEBUG_PER_CPU_MAPS is not set @@ -3223,7 +3299,6 @@ CONFIG_PANIC_TIMEOUT=15 # CONFIG_SCHEDSTATS is not set # 
CONFIG_SCHED_STACK_END_CHECK is not set # CONFIG_DEBUG_TIMEKEEPING is not set -CONFIG_TIMER_STATS=y # # Lock Debugging (spinlocks, mutexes, etc...) @@ -3238,6 +3313,7 @@ CONFIG_TIMER_STATS=y # CONFIG_DEBUG_ATOMIC_SLEEP is not set # CONFIG_DEBUG_LOCKING_API_SELFTESTS is not set # CONFIG_LOCK_TORTURE_TEST is not set +# CONFIG_WW_MUTEX_SELFTEST is not set # CONFIG_STACKTRACE is not set # CONFIG_DEBUG_KOBJECT is not set # CONFIG_DEBUG_BUGVERBOSE is not set @@ -3260,7 +3336,6 @@ CONFIG_RCU_CPU_STALL_TIMEOUT=21 # CONFIG_RCU_EQS_DEBUG is not set # CONFIG_DEBUG_WQ_FORCE_RR_CPU is not set # CONFIG_DEBUG_BLOCK_EXT_DEVT is not set -# CONFIG_CPU_HOTPLUG_STATE_CONTROL is not set # CONFIG_NOTIFIER_ERROR_INJECTION is not set # CONFIG_FAULT_INJECTION is not set # CONFIG_LATENCYTOP is not set @@ -3280,6 +3355,7 @@ CONFIG_TRACING_SUPPORT=y # Runtime Testing # # CONFIG_TEST_LIST_SORT is not set +# CONFIG_TEST_SORT is not set # CONFIG_BACKTRACE_SELF_TEST is not set # CONFIG_RBTREE_TEST is not set # CONFIG_ATOMIC64_SELFTEST is not set @@ -3309,7 +3385,6 @@ CONFIG_X86_VERBOSE_BOOTUP=y CONFIG_X86_PTDUMP_CORE=y # CONFIG_X86_PTDUMP is not set # CONFIG_EFI_PGT_DUMP is not set -# CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_WX=y # CONFIG_DOUBLEFAULT is not set # CONFIG_DEBUG_TLBFLUSH is not set @@ -3345,9 +3420,9 @@ CONFIG_SECURITY_DMESG_RESTRICT=y CONFIG_SECURITYFS=y CONFIG_INTEL_TXT=y CONFIG_HAVE_HARDENED_USERCOPY_ALLOCATOR=y -CONFIG_HAVE_ARCH_HARDENED_USERCOPY=y CONFIG_HARDENED_USERCOPY=y CONFIG_HARDENED_USERCOPY_PAGESPAN=y +# CONFIG_STATIC_USERMODEHELPER is not set CONFIG_DEFAULT_SECURITY_DAC=y CONFIG_DEFAULT_SECURITY="" CONFIG_XOR_BLOCKS=y @@ -3458,6 +3533,7 @@ CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL=y # Ciphers # CONFIG_CRYPTO_AES=y +CONFIG_CRYPTO_AES_TI=y CONFIG_CRYPTO_AES_X86_64=y CONFIG_CRYPTO_AES_NI_INTEL=y CONFIG_CRYPTO_ANUBIS=y @@ -3541,6 +3617,8 @@ CONFIG_SYSTEM_TRUSTED_KEYRING=y CONFIG_SYSTEM_TRUSTED_KEYS="" # CONFIG_SYSTEM_EXTRA_CERTIFICATE is not set 
CONFIG_SECONDARY_TRUSTED_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_KEYRING=y +CONFIG_SYSTEM_BLACKLIST_HASH_LIST="" CONFIG_HAVE_KVM=y # CONFIG_VIRTUALIZATION is not set # CONFIG_BINARY_PRINTF is not set @@ -3609,6 +3687,8 @@ CONFIG_ASSOCIATIVE_ARRAY=y CONFIG_HAS_IOMEM=y CONFIG_HAS_IOPORT_MAP=y CONFIG_HAS_DMA=y +# CONFIG_DMA_NOOP_OPS is not set +# CONFIG_DMA_VIRT_OPS is not set CONFIG_CHECK_SIGNATURE=y CONFIG_CPUMASK_OFFSTACK=y CONFIG_CPU_RMAP=y diff --git a/linux-spica.install b/linux-spica.install index c3c0483..a38c800 100644 --- a/linux-spica.install +++ b/linux-spica.install @@ -1,5 +1,5 @@ pkgname=linux-spica -kernver=4.10.12spica-dirty +kernver=4.12.4spica-dirty #bootdevice="BOOT_IMAGE=/boot/vmlinuz-$pkgname root=UUID=d670564f-2cb3-4981-9d51-6ed9c1327d47" #option="rw quiet clocksource=hpet initrd=EFI/spi-ca/initrd intel_iommu=on pci-stub.ids=1002:683f,1002:aab0 vfio_iommu_type1.allow_unsafe_interrupts=1,kvm.ignore_msrs=1" #option="rw quiet clocksource=hpet initrd=EFI/spi-ca/initrd quiet intremap=no_x2apic_optout zswap.enabled=1 zswap.max_pool_percent=25 zswap.compressor=lz4" diff --git a/patches/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.10..patch b/patches/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.10..patch deleted file mode 100644 index 45f4fd2..0000000 --- a/patches/0001-block-cgroups-kconfig-build-bits-for-BFQ-v7r11-4.10..patch +++ /dev/null @@ -1,103 +0,0 @@ -From 8500f47272575b4616beb487c483019248d8c501 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Tue, 7 Apr 2015 13:39:12 +0200 -Subject: [PATCH 1/4] block: cgroups, kconfig, build bits for BFQ-v7r11-4.10.0 - -Update Kconfig.iosched and do the related Makefile changes to include -kernel configuration options for BFQ. Also increase the number of -policies supported by the blkio controller so that BFQ can add its -own. 
- -Signed-off-by: Paolo Valente -Signed-off-by: Arianna Avanzini ---- - block/Kconfig.iosched | 32 ++++++++++++++++++++++++++++++++ - block/Makefile | 1 + - include/linux/blkdev.h | 2 +- - 3 files changed, 34 insertions(+), 1 deletion(-) - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 421bef9..0ee5f0f 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -39,6 +39,27 @@ config CFQ_GROUP_IOSCHED - ---help--- - Enable group IO scheduling in CFQ. - -+config IOSCHED_BFQ -+ tristate "BFQ I/O scheduler" -+ default n -+ ---help--- -+ The BFQ I/O scheduler tries to distribute bandwidth among -+ all processes according to their weights. -+ It aims at distributing the bandwidth as desired, independently of -+ the disk parameters and with any workload. It also tries to -+ guarantee low latency to interactive and soft real-time -+ applications. If compiled built-in (saying Y here), BFQ can -+ be configured to support hierarchical scheduling. -+ -+config CGROUP_BFQIO -+ bool "BFQ hierarchical scheduling support" -+ depends on CGROUPS && IOSCHED_BFQ=y -+ default n -+ ---help--- -+ Enable hierarchical scheduling in BFQ, using the cgroups -+ filesystem interface. The name of the subsystem will be -+ bfqio. -+ - choice - prompt "Default I/O scheduler" - default DEFAULT_CFQ -@@ -52,6 +73,16 @@ choice - config DEFAULT_CFQ - bool "CFQ" if IOSCHED_CFQ=y - -+ config DEFAULT_BFQ -+ bool "BFQ" if IOSCHED_BFQ=y -+ help -+ Selects BFQ as the default I/O scheduler which will be -+ used by default for all block devices. -+ The BFQ I/O scheduler aims at distributing the bandwidth -+ as desired, independently of the disk parameters and with -+ any workload. It also tries to guarantee low latency to -+ interactive and soft real-time applications. 
-+ - config DEFAULT_NOOP - bool "No-op" - -@@ -61,6 +92,7 @@ config DEFAULT_IOSCHED - string - default "deadline" if DEFAULT_DEADLINE - default "cfq" if DEFAULT_CFQ -+ default "bfq" if DEFAULT_BFQ - default "noop" if DEFAULT_NOOP - - endmenu -diff --git a/block/Makefile b/block/Makefile -index a827f98..3b14703 100644 ---- a/block/Makefile -+++ b/block/Makefile -@@ -18,6 +18,7 @@ obj-$(CONFIG_BLK_DEV_THROTTLING) += blk-throttle.o - obj-$(CONFIG_IOSCHED_NOOP) += noop-iosched.o - obj-$(CONFIG_IOSCHED_DEADLINE) += deadline-iosched.o - obj-$(CONFIG_IOSCHED_CFQ) += cfq-iosched.o -+obj-$(CONFIG_IOSCHED_BFQ) += bfq-iosched.o - - obj-$(CONFIG_BLOCK_COMPAT) += compat_ioctl.o - obj-$(CONFIG_BLK_CMDLINE_PARSER) += cmdline-parser.o -diff --git a/include/linux/blkdev.h b/include/linux/blkdev.h -index 1ca8e8f..8e2d6ed 100644 ---- a/include/linux/blkdev.h -+++ b/include/linux/blkdev.h -@@ -47,7 +47,7 @@ struct rq_wb; - * Maximum number of blkcg policies allowed to be registered concurrently. - * Defined here to simplify include dependency. - */ --#define BLKCG_MAX_POLS 2 -+#define BLKCG_MAX_POLS 3 - - typedef void (rq_end_io_fn)(struct request *, int); - --- -2.10.0 - diff --git a/patches/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.10.0.patch b/patches/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.10.0.patch deleted file mode 100644 index 0812a57..0000000 --- a/patches/0002-block-introduce-the-BFQ-v7r11-I-O-sched-for-4.10.0.patch +++ /dev/null @@ -1,7109 +0,0 @@ -From 2f56e91506b329ffc29d0f184924ad0123c9ba9e Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Thu, 9 May 2013 19:10:02 +0200 -Subject: [PATCH 2/4] block: introduce the BFQ-v7r11 I/O sched for 4.10.0 - -The general structure is borrowed from CFQ, as much of the code for -handling I/O contexts. Over time, several useful features have been -ported from CFQ as well (details in the changelog in README.BFQ). 
A -(bfq_)queue is associated to each task doing I/O on a device, and each -time a scheduling decision has to be made a queue is selected and served -until it expires. - - - Slices are given in the service domain: tasks are assigned - budgets, measured in number of sectors. Once got the disk, a task - must however consume its assigned budget within a configurable - maximum time (by default, the maximum possible value of the - budgets is automatically computed to comply with this timeout). - This allows the desired latency vs "throughput boosting" tradeoff - to be set. - - - Budgets are scheduled according to a variant of WF2Q+, implemented - using an augmented rb-tree to take eligibility into account while - preserving an O(log N) overall complexity. - - - A low-latency tunable is provided; if enabled, both interactive - and soft real-time applications are guaranteed a very low latency. - - - Latency guarantees are preserved also in the presence of NCQ. - - - Also with flash-based devices, a high throughput is achieved - while still preserving latency guarantees. - - - BFQ features Early Queue Merge (EQM), a sort of fusion of the - cooperating-queue-merging and the preemption mechanisms present - in CFQ. EQM is in fact a unified mechanism that tries to get a - sequential read pattern, and hence a high throughput, with any - set of processes performing interleaved I/O over a contiguous - sequence of sectors. - - - BFQ supports full hierarchical scheduling, exporting a cgroups - interface. Since each node has a full scheduler, each group can - be assigned its own weight. - - - If the cgroups interface is not used, only I/O priorities can be - assigned to processes, with ioprio values mapped to weights - with the relation weight = IOPRIO_BE_NR - ioprio. - - - ioprio classes are served in strict priority order, i.e., lower - priority queues are not served as long as there are higher - priority queues. 
Among queues in the same class the bandwidth is - distributed in proportion to the weight of each queue. A very - thin extra bandwidth is however guaranteed to the Idle class, to - prevent it from starving. - -Signed-off-by: Paolo Valente -Signed-off-by: Arianna Avanzini ---- - block/Kconfig.iosched | 6 +- - block/bfq-cgroup.c | 1186 ++++++++++++++++ - block/bfq-ioc.c | 36 + - block/bfq-iosched.c | 3763 +++++++++++++++++++++++++++++++++++++++++++++++++ - block/bfq-sched.c | 1199 ++++++++++++++++ - block/bfq.h | 801 +++++++++++ - 6 files changed, 6987 insertions(+), 4 deletions(-) - create mode 100644 block/bfq-cgroup.c - create mode 100644 block/bfq-ioc.c - create mode 100644 block/bfq-iosched.c - create mode 100644 block/bfq-sched.c - create mode 100644 block/bfq.h - -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index 0ee5f0f..f78cd1a 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -51,14 +51,12 @@ config IOSCHED_BFQ - applications. If compiled built-in (saying Y here), BFQ can - be configured to support hierarchical scheduling. - --config CGROUP_BFQIO -+config BFQ_GROUP_IOSCHED - bool "BFQ hierarchical scheduling support" - depends on CGROUPS && IOSCHED_BFQ=y - default n - ---help--- -- Enable hierarchical scheduling in BFQ, using the cgroups -- filesystem interface. The name of the subsystem will be -- bfqio. -+ Enable hierarchical scheduling in BFQ, using the blkio controller. - - choice - prompt "Default I/O scheduler" -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -new file mode 100644 -index 0000000..8b08a57 ---- /dev/null -+++ b/block/bfq-cgroup.c -@@ -0,0 +1,1186 @@ -+/* -+ * BFQ: CGROUPS support. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ */ -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ -+/* bfqg stats flags */ -+enum bfqg_stats_flags { -+ BFQG_stats_waiting = 0, -+ BFQG_stats_idling, -+ BFQG_stats_empty, -+}; -+ -+#define BFQG_FLAG_FNS(name) \ -+static void bfqg_stats_mark_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags |= (1 << BFQG_stats_##name); \ -+} \ -+static void bfqg_stats_clear_##name(struct bfqg_stats *stats) \ -+{ \ -+ stats->flags &= ~(1 << BFQG_stats_##name); \ -+} \ -+static int bfqg_stats_##name(struct bfqg_stats *stats) \ -+{ \ -+ return (stats->flags & (1 << BFQG_stats_##name)) != 0; \ -+} \ -+ -+BFQG_FLAG_FNS(waiting) -+BFQG_FLAG_FNS(idling) -+BFQG_FLAG_FNS(empty) -+#undef BFQG_FLAG_FNS -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_update_group_wait_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_waiting(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_group_wait_time)) -+ blkg_stat_add(&stats->group_wait_time, -+ now - stats->start_group_wait_time); -+ bfqg_stats_clear_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. */ -+static void bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_waiting(stats)) -+ return; -+ if (bfqg == curr_bfqg) -+ return; -+ stats->start_group_wait_time = sched_clock(); -+ bfqg_stats_mark_waiting(stats); -+} -+ -+/* This should be called with the queue_lock held. 
*/ -+static void bfqg_stats_end_empty_time(struct bfqg_stats *stats) -+{ -+ unsigned long long now; -+ -+ if (!bfqg_stats_empty(stats)) -+ return; -+ -+ now = sched_clock(); -+ if (time_after64(now, stats->start_empty_time)) -+ blkg_stat_add(&stats->empty_time, -+ now - stats->start_empty_time); -+ bfqg_stats_clear_empty(stats); -+} -+ -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg) -+{ -+ blkg_stat_add(&bfqg->stats.dequeue, 1); -+} -+ -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (blkg_rwstat_total(&stats->queued)) -+ return; -+ -+ /* -+ * group is already marked empty. This can happen if bfqq got new -+ * request in parent group and moved to this group while being added -+ * to service tree. Just ignore the event and move on. -+ */ -+ if (bfqg_stats_empty(stats)) -+ return; -+ -+ stats->start_empty_time = sched_clock(); -+ bfqg_stats_mark_empty(stats); -+} -+ -+static void bfqg_stats_update_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ if (bfqg_stats_idling(stats)) { -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, stats->start_idle_time)) -+ blkg_stat_add(&stats->idle_time, -+ now - stats->start_idle_time); -+ bfqg_stats_clear_idling(stats); -+ } -+} -+ -+static void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ stats->start_idle_time = sched_clock(); -+ bfqg_stats_mark_idling(stats); -+} -+ -+static void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ -+ blkg_stat_add(&stats->avg_queue_size_sum, -+ blkg_rwstat_total(&stats->queued)); -+ blkg_stat_add(&stats->avg_queue_size_samples, 1); -+ bfqg_stats_update_group_wait_time(stats); -+} -+ -+static struct blkcg_policy blkcg_policy_bfq; -+ -+/* -+ * blk-cgroup policy-related handlers -+ * The following functions help in converting 
between blk-cgroup -+ * internal structures and BFQ-specific structures. -+ */ -+ -+static struct bfq_group *pd_to_bfqg(struct blkg_policy_data *pd) -+{ -+ return pd ? container_of(pd, struct bfq_group, pd) : NULL; -+} -+ -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg) -+{ -+ return pd_to_blkg(&bfqg->pd); -+} -+ -+static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) -+{ -+ struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); -+ -+ BUG_ON(!pd); -+ -+ return pd_to_bfqg(pd); -+} -+ -+/* -+ * bfq_group handlers -+ * The following functions help in navigating the bfq_group hierarchy -+ * by allowing to find the parent of a bfq_group or the bfq_group -+ * associated to a bfq_queue. -+ */ -+ -+static struct bfq_group *bfqg_parent(struct bfq_group *bfqg) -+{ -+ struct blkcg_gq *pblkg = bfqg_to_blkg(bfqg)->parent; -+ -+ return pblkg ? blkg_to_bfqg(pblkg) : NULL; -+} -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ return group_entity ? container_of(group_entity, struct bfq_group, -+ entity) : -+ bfqq->bfqd->root_group; -+} -+ -+/* -+ * The following two functions handle get and put of a bfq_group by -+ * wrapping the related blk-cgroup hooks. 
-+ */ -+ -+static void bfqg_get(struct bfq_group *bfqg) -+{ -+ return blkg_get(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_put(struct bfq_group *bfqg) -+{ -+ return blkg_put(bfqg_to_blkg(bfqg)); -+} -+ -+static void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, -+ int rw) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, rw, 1); -+ bfqg_stats_end_empty_time(&bfqg->stats); -+ if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) -+ bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); -+} -+ -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) -+{ -+ blkg_rwstat_add(&bfqg->stats.queued, rw, -1); -+} -+ -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) -+{ -+ blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -+} -+ -+static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, -+ uint64_t bytes, int rw) -+{ -+ blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); -+ blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); -+ blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); -+} -+ -+static void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, int rw) -+{ -+ struct bfqg_stats *stats = &bfqg->stats; -+ unsigned long long now = sched_clock(); -+ -+ if (time_after64(now, io_start_time)) -+ blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); -+ if (time_after64(io_start_time, start_time)) -+ blkg_rwstat_add(&stats->wait_time, rw, -+ io_start_time - start_time); -+} -+ -+/* @stats = 0 */ -+static void bfqg_stats_reset(struct bfqg_stats *stats) -+{ -+ if (!stats) -+ return; -+ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_reset(&stats->service_bytes); -+ blkg_rwstat_reset(&stats->serviced); -+ blkg_rwstat_reset(&stats->merged); -+ blkg_rwstat_reset(&stats->service_time); -+ blkg_rwstat_reset(&stats->wait_time); -+ blkg_stat_reset(&stats->time); -+ blkg_stat_reset(&stats->unaccounted_time); -+ 
blkg_stat_reset(&stats->avg_queue_size_sum); -+ blkg_stat_reset(&stats->avg_queue_size_samples); -+ blkg_stat_reset(&stats->dequeue); -+ blkg_stat_reset(&stats->group_wait_time); -+ blkg_stat_reset(&stats->idle_time); -+ blkg_stat_reset(&stats->empty_time); -+} -+ -+/* @to += @from */ -+static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -+{ -+ if (!to || !from) -+ return; -+ -+ /* queued stats shouldn't be cleared */ -+ blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); -+ blkg_rwstat_add_aux(&to->serviced, &from->serviced); -+ blkg_rwstat_add_aux(&to->merged, &from->merged); -+ blkg_rwstat_add_aux(&to->service_time, &from->service_time); -+ blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); -+ blkg_stat_add_aux(&from->time, &from->time); -+ blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); -+ blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); -+ blkg_stat_add_aux(&to->avg_queue_size_samples, -+ &from->avg_queue_size_samples); -+ blkg_stat_add_aux(&to->dequeue, &from->dequeue); -+ blkg_stat_add_aux(&to->group_wait_time, &from->group_wait_time); -+ blkg_stat_add_aux(&to->idle_time, &from->idle_time); -+ blkg_stat_add_aux(&to->empty_time, &from->empty_time); -+} -+ -+/* -+ * Transfer @bfqg's stats to its parent's dead_stats so that the ancestors' -+ * recursive stats can still account for the amount used by this bfqg after -+ * it's gone. 
-+ */ -+static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) -+{ -+ struct bfq_group *parent; -+ -+ if (!bfqg) /* root_group */ -+ return; -+ -+ parent = bfqg_parent(bfqg); -+ -+ lockdep_assert_held(bfqg_to_blkg(bfqg)->q->queue_lock); -+ -+ if (unlikely(!parent)) -+ return; -+ -+ bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); -+ bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); -+ bfqg_stats_reset(&bfqg->stats); -+ bfqg_stats_reset(&bfqg->dead_stats); -+} -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ bfqg_get(bfqg); -+ } -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static void bfqg_stats_exit(struct bfqg_stats *stats) -+{ -+ blkg_rwstat_exit(&stats->service_bytes); -+ blkg_rwstat_exit(&stats->serviced); -+ blkg_rwstat_exit(&stats->merged); -+ blkg_rwstat_exit(&stats->service_time); -+ blkg_rwstat_exit(&stats->wait_time); -+ blkg_rwstat_exit(&stats->queued); -+ blkg_stat_exit(&stats->sectors); -+ blkg_stat_exit(&stats->time); -+ blkg_stat_exit(&stats->unaccounted_time); -+ blkg_stat_exit(&stats->avg_queue_size_sum); -+ blkg_stat_exit(&stats->avg_queue_size_samples); -+ blkg_stat_exit(&stats->dequeue); -+ blkg_stat_exit(&stats->group_wait_time); -+ blkg_stat_exit(&stats->idle_time); -+ blkg_stat_exit(&stats->empty_time); -+} -+ -+static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) -+{ -+ if (blkg_rwstat_init(&stats->service_bytes, gfp) || -+ blkg_rwstat_init(&stats->serviced, gfp) || -+ blkg_rwstat_init(&stats->merged, gfp) || -+ blkg_rwstat_init(&stats->service_time, gfp) || -+ blkg_rwstat_init(&stats->wait_time, gfp) || -+ blkg_rwstat_init(&stats->queued, gfp) || -+ blkg_stat_init(&stats->sectors, gfp) || 
-+ blkg_stat_init(&stats->time, gfp) || -+ blkg_stat_init(&stats->unaccounted_time, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_sum, gfp) || -+ blkg_stat_init(&stats->avg_queue_size_samples, gfp) || -+ blkg_stat_init(&stats->dequeue, gfp) || -+ blkg_stat_init(&stats->group_wait_time, gfp) || -+ blkg_stat_init(&stats->idle_time, gfp) || -+ blkg_stat_init(&stats->empty_time, gfp)) { -+ bfqg_stats_exit(stats); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+static struct bfq_group_data *cpd_to_bfqgd(struct blkcg_policy_data *cpd) -+{ -+ return cpd ? container_of(cpd, struct bfq_group_data, pd) : NULL; -+} -+ -+static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) -+{ -+ return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); -+} -+ -+static void bfq_cpd_init(struct blkcg_policy_data *cpd) -+{ -+ struct bfq_group_data *d = cpd_to_bfqgd(cpd); -+ -+ d->weight = BFQ_DEFAULT_GRP_WEIGHT; -+} -+ -+static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -+{ -+ struct bfq_group *bfqg; -+ -+ bfqg = kzalloc_node(sizeof(*bfqg), gfp, node); -+ if (!bfqg) -+ return NULL; -+ -+ if (bfqg_stats_init(&bfqg->stats, gfp) || -+ bfqg_stats_init(&bfqg->dead_stats, gfp)) { -+ kfree(bfqg); -+ return NULL; -+ } -+ -+ return &bfqg->pd; -+} -+ -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) -+{ -+ struct bfq_entity *entity; -+ -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; -+} -+ -+static void bfq_pd_init(struct blkg_policy_data *pd) -+{ -+ struct blkcg_gq *blkg = pd_to_blkg(pd); -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ struct bfq_data *bfqd = blkg->q->elevator->elevator_data; -+ struct bfq_entity *entity = &bfqg->entity; -+ struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); -+ -+ entity->orig_weight = entity->weight = entity->new_weight = d->weight; -+ entity->my_sched_data = 
&bfqg->sched_data; -+ bfqg->my_entity = entity; /* -+ * the root_group's will be set to NULL -+ * in bfq_init_queue() -+ */ -+ bfqg->bfqd = bfqd; -+ bfqg->active_entities = 0; -+} -+ -+static void bfq_pd_free(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_exit(&bfqg->stats); -+ bfqg_stats_exit(&bfqg->dead_stats); -+ -+ return kfree(bfqg); -+} -+ -+/* offset delta from bfqg->stats to bfqg->dead_stats */ -+static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - -+ offsetof(struct bfq_group, stats); -+ -+/* to be used by recursive prfill, sums live and dead stats recursively */ -+static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = 0; -+ -+ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -+ sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -+ off + dead_stats_off_delta); -+ return sum; -+} -+ -+/* to be used by recursive prfill, sums live and dead rwstats recursively */ -+static struct blkg_rwstat -+bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat a, b; -+ -+ a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -+ b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -+ off + dead_stats_off_delta); -+ blkg_rwstat_add_aux(&a, &b); -+ return a; -+} -+ -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ -+ bfqg_stats_reset(&bfqg->stats); -+ bfqg_stats_reset(&bfqg->dead_stats); -+} -+ -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) -+{ -+ struct request_queue *q = bfqd->queue; -+ struct bfq_group *bfqg = NULL, *parent; -+ struct bfq_entity *entity = NULL; -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ /* avoid lookup for the common case where there's no blkcg */ -+ if (blkcg == &blkcg_root) { -+ bfqg = bfqd->root_group; -+ } else { -+ 
struct blkcg_gq *blkg; -+ -+ blkg = blkg_lookup_create(blkcg, q); -+ if (!IS_ERR(blkg)) -+ bfqg = blkg_to_bfqg(blkg); -+ else /* fallback to root_group */ -+ bfqg = bfqd->root_group; -+ } -+ -+ BUG_ON(!bfqg); -+ -+ /* -+ * Update chain of bfq_groups as we might be handling a leaf group -+ * which, along with some of its relatives, has not been hooked yet -+ * to the private hierarchy of BFQ. -+ */ -+ entity = &bfqg->entity; -+ for_each_entity(entity) { -+ bfqg = container_of(entity, struct bfq_group, entity); -+ BUG_ON(!bfqg); -+ if (bfqg != bfqd->root_group) { -+ parent = bfqg_parent(bfqg); -+ if (!parent) -+ parent = bfqd->root_group; -+ BUG_ON(!parent); -+ bfq_group_set_parent(bfqg, parent); -+ } -+ } -+ -+ return bfqg; -+} -+ -+/** -+ * bfq_bfqq_move - migrate @bfqq to @bfqg. -+ * @bfqd: queue descriptor. -+ * @bfqq: the queue to move. -+ * @entity: @bfqq's entity. -+ * @bfqg: the group to move to. -+ * -+ * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -+ * it on the new one. Avoid putting the entity on the old group idle tree. -+ * -+ * Must be called under the queue lock; the cgroup owning @bfqg must -+ * not disappear (by now this just means that we are called under -+ * rcu_read_lock()). -+ */ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_entity *entity, struct bfq_group *bfqg) -+{ -+ int busy, resume; -+ -+ busy = bfq_bfqq_busy(bfqq); -+ resume = !RB_EMPTY_ROOT(&bfqq->sort_list); -+ -+ BUG_ON(resume && !entity->on_st); -+ BUG_ON(busy && !resume && entity->on_st && -+ bfqq != bfqd->in_service_queue); -+ -+ if (busy) { -+ BUG_ON(atomic_read(&bfqq->ref) < 2); -+ -+ if (!resume) -+ bfq_del_bfqq_busy(bfqd, bfqq, 0); -+ else -+ bfq_deactivate_bfqq(bfqd, bfqq, 0); -+ } else if (entity->on_st) -+ bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ bfqg_put(bfqq_group(bfqq)); -+ -+ /* -+ * Here we use a reference to bfqg. 
We don't need a refcounter -+ * as the cgroup reference will not be dropped, so that its -+ * destroy() callback will not be invoked. -+ */ -+ entity->parent = bfqg->my_entity; -+ entity->sched_data = &bfqg->sched_data; -+ bfqg_get(bfqg); -+ -+ if (busy) { -+ if (resume) -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+ -+ if (!bfqd->in_service_queue && !bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+} -+ -+/** -+ * __bfq_bic_change_cgroup - move @bic to @cgroup. -+ * @bfqd: the queue descriptor. -+ * @bic: the bic to move. -+ * @blkcg: the blk-cgroup to move to. -+ * -+ * Move bic to blkcg, assuming that bfqd->queue is locked; the caller -+ * has to make sure that the reference to cgroup is valid across the call. -+ * -+ * NOTE: an alternative approach might have been to store the current -+ * cgroup in bfqq and getting a reference to it, reducing the lookup -+ * time here, at the price of slightly more complex code. -+ */ -+static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic, -+ struct blkcg *blkcg) -+{ -+ struct bfq_queue *async_bfqq = bic_to_bfqq(bic, 0); -+ struct bfq_queue *sync_bfqq = bic_to_bfqq(bic, 1); -+ struct bfq_group *bfqg; -+ struct bfq_entity *entity; -+ -+ lockdep_assert_held(bfqd->queue->queue_lock); -+ -+ bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ if (async_bfqq) { -+ entity = &async_bfqq->entity; -+ -+ if (entity->sched_data != &bfqg->sched_data) { -+ bic_set_bfqq(bic, NULL, 0); -+ bfq_log_bfqq(bfqd, async_bfqq, -+ "bic_change_group: %p %d", -+ async_bfqq, atomic_read(&async_bfqq->ref)); -+ bfq_put_queue(async_bfqq); -+ } -+ } -+ -+ if (sync_bfqq) { -+ entity = &sync_bfqq->entity; -+ if (entity->sched_data != &bfqg->sched_data) -+ bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); -+ } -+ -+ return bfqg; -+} -+ -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct blkcg *blkcg; -+ struct bfq_group *bfqg = NULL; -+ uint64_t 
id; -+ -+ rcu_read_lock(); -+ blkcg = bio_blkcg(bio); -+ id = blkcg->css.serial_nr; -+ rcu_read_unlock(); -+ -+ /* -+ * Check whether blkcg has changed. The condition may trigger -+ * spuriously on a newly created cic but there's no harm. -+ */ -+ if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) -+ return; -+ -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); -+ BUG_ON(!bfqg); -+ bic->blkcg_id = id; -+} -+ -+/** -+ * bfq_flush_idle_tree - deactivate any entity on the idle tree of @st. -+ * @st: the service tree being flushed. -+ */ -+static void bfq_flush_idle_tree(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entity = st->first_idle; -+ -+ for (; entity ; entity = st->first_idle) -+ __bfq_deactivate_entity(entity, 0); -+} -+ -+/** -+ * bfq_reparent_leaf_entity - move leaf entity to the root_group. -+ * @bfqd: the device data structure with the root group. -+ * @entity: the entity to move. -+ */ -+static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(!bfqq); -+ bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); -+} -+ -+/** -+ * bfq_reparent_active_entities - move to the root group all active -+ * entities. -+ * @bfqd: the device data structure with the root group. -+ * @bfqg: the group to move from. -+ * @st: the service tree with the entities. -+ * -+ * Needs queue_lock to be taken and reference to be valid over the call. 
-+ */ -+static void bfq_reparent_active_entities(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ struct bfq_service_tree *st) -+{ -+ struct rb_root *active = &st->active; -+ struct bfq_entity *entity = NULL; -+ -+ if (!RB_EMPTY_ROOT(&st->active)) -+ entity = bfq_entity_of(rb_first(active)); -+ -+ for (; entity ; entity = bfq_entity_of(rb_first(active))) -+ bfq_reparent_leaf_entity(bfqd, entity); -+ -+ if (bfqg->sched_data.in_service_entity) -+ bfq_reparent_leaf_entity(bfqd, -+ bfqg->sched_data.in_service_entity); -+} -+ -+/** -+ * bfq_destroy_group - destroy @bfqg. -+ * @bfqg: the group being destroyed. -+ * -+ * Destroy @bfqg, making sure that it is not referenced from its parent. -+ * blkio already grabs the queue_lock for us, so no need to use RCU-based magic -+ */ -+static void bfq_pd_offline(struct blkg_policy_data *pd) -+{ -+ struct bfq_service_tree *st; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; -+ struct bfq_entity *entity; -+ int i; -+ -+ BUG_ON(!pd); -+ bfqg = pd_to_bfqg(pd); -+ BUG_ON(!bfqg); -+ bfqd = bfqg->bfqd; -+ BUG_ON(bfqd && !bfqd->root_group); -+ -+ entity = bfqg->my_entity; -+ -+ if (!entity) /* root group */ -+ return; -+ -+ /* -+ * Empty all service_trees belonging to this group before -+ * deactivating the group itself. -+ */ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) { -+ BUG_ON(!bfqg->sched_data.service_tree); -+ st = bfqg->sched_data.service_tree + i; -+ /* -+ * The idle tree may still contain bfq_queues belonging -+ * to exited task because they never migrated to a different -+ * cgroup from the one being destroyed now. No one else -+ * can access them so it's safe to act without any lock. -+ */ -+ bfq_flush_idle_tree(st); -+ -+ /* -+ * It may happen that some queues are still active -+ * (busy) upon group destruction (if the corresponding -+ * processes have been forced to terminate). We move -+ * all the leaf entities corresponding to these queues -+ * to the root_group. 
-+ * Also, it may happen that the group has an entity -+ * in service, which is disconnected from the active -+ * tree: it must be moved, too. -+ * There is no need to put the sync queues, as the -+ * scheduler has taken no reference. -+ */ -+ bfq_reparent_active_entities(bfqd, bfqg, st); -+ BUG_ON(!RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(!RB_EMPTY_ROOT(&st->idle)); -+ } -+ BUG_ON(bfqg->sched_data.next_in_service); -+ BUG_ON(bfqg->sched_data.in_service_entity); -+ -+ __bfq_deactivate_entity(entity, 0); -+ bfq_put_async_queues(bfqd, bfqg); -+ BUG_ON(entity->tree); -+ -+ bfqg_stats_xfer_dead(bfqg); -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ struct blkcg_gq *blkg; -+ -+ list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ bfq_end_wr_async_queues(bfqd, bfqg); -+ } -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, -+ struct cftype *cftype) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ int ret = -EINVAL; -+ -+ spin_lock_irq(&blkcg->lock); -+ ret = bfqgd->weight; -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) -+{ -+ struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ -+ spin_lock_irq(&blkcg->lock); -+ seq_printf(sf, "%u\n", bfqgd->weight); -+ spin_unlock_irq(&blkcg->lock); -+ -+ return 0; -+} -+ -+static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) -+{ -+ struct blkcg *blkcg = css_to_blkcg(css); -+ struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ struct blkcg_gq *blkg; -+ int ret = -EINVAL; -+ -+ if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) -+ return ret; -+ -+ ret = 0; -+ spin_lock_irq(&blkcg->lock); -+ bfqgd->weight = (unsigned short)val; 
-+ hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { -+ struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ -+ if (!bfqg) -+ continue; -+ /* -+ * Setting the prio_changed flag of the entity -+ * to 1 with new_weight == weight would re-set -+ * the value of the weight to its ioprio mapping. -+ * Set the flag only if necessary. -+ */ -+ if ((unsigned short)val != bfqg->entity.new_weight) { -+ bfqg->entity.new_weight = (unsigned short)val; -+ /* -+ * Make sure that the above new value has been -+ * stored in bfqg->entity.new_weight before -+ * setting the prio_changed flag. In fact, -+ * this flag may be read asynchronously (in -+ * critical sections protected by a different -+ * lock than that held here), and finding this -+ * flag set may cause the execution of the code -+ * for updating parameters whose value may -+ * depend also on bfqg->entity.new_weight (in -+ * __bfq_entity_update_weight_prio). -+ * This barrier makes sure that the new value -+ * of bfqg->entity.new_weight is correctly -+ * seen in that code. 
-+ */ -+ smp_wmb(); -+ bfqg->entity.prio_changed = 1; -+ } -+ } -+ spin_unlock_irq(&blkcg->lock); -+ -+ return ret; -+} -+ -+static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) -+{ -+ /* First unsigned long found in the file is used */ -+ return bfqio_cgroup_weight_write(of_css(of), NULL, -+ simple_strtoull(strim(buf), NULL, 0)); -+} -+ -+static int bfqg_print_stat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_stat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), blkg_prfill_rwstat, -+ &blkcg_policy_bfq, seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ u64 sum = bfqg_stat_pd_recursive_sum(pd, off); -+ -+ return __blkg_prfill_u64(sf, pd, sum); -+} -+ -+static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); -+ -+ return __blkg_prfill_rwstat(sf, pd, &sum); -+} -+ -+static int bfqg_print_stat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_stat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, false); -+ return 0; -+} -+ -+static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_rwstat_recursive, &blkcg_policy_bfq, -+ seq_cft(sf)->private, true); -+ return 0; -+} -+ -+static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct bfq_group *bfqg = pd_to_bfqg(pd); -+ u64 samples = blkg_stat_read(&bfqg->stats.avg_queue_size_samples); -+ u64 v = 0; -+ -+ if (samples) { -+ v = 
blkg_stat_read(&bfqg->stats.avg_queue_size_sum); -+ v = div64_u64(v, samples); -+ } -+ __blkg_prfill_u64(sf, pd, v); -+ return 0; -+} -+ -+/* print avg_queue_size */ -+static int bfqg_print_avg_queue_size(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_avg_queue_size, &blkcg_policy_bfq, -+ 0, false); -+ return 0; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ int ret; -+ -+ ret = blkcg_activate_policy(bfqd->queue, &blkcg_policy_bfq); -+ if (ret) -+ return NULL; -+ -+ return blkg_to_bfqg(bfqd->queue->root_blkg); -+} -+ -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); -+} -+ -+static struct cftype bfqio_files_dfl[] = { -+ { -+ .name = "weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ .seq_show = bfqio_cgroup_weight_read_dfl, -+ .write = bfqio_cgroup_weight_write_dfl, -+ }, -+ {} /* terminate */ -+}; -+ -+static struct cftype bfqio_files[] = { -+ { -+ .name = "bfq.weight", -+ .read_u64 = bfqio_cgroup_weight_read, -+ .write_u64 = bfqio_cgroup_weight_write, -+ }, -+ /* statistics, cover only the tasks in the bfqg */ -+ { -+ .name = "bfq.time", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.sectors", -+ .private = offsetof(struct bfq_group, stats.sectors), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.io_service_bytes", -+ .private = offsetof(struct bfq_group, stats.service_bytes), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_serviced", -+ .private = offsetof(struct bfq_group, stats.serviced), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_service_time", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ 
.seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_wait_time", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_merged", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ { -+ .name = "bfq.io_queued", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat, -+ }, -+ -+ /* the same statictics which cover the bfqg and its descendants */ -+ { -+ .name = "bfq.time_recursive", -+ .private = offsetof(struct bfq_group, stats.time), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.sectors_recursive", -+ .private = offsetof(struct bfq_group, stats.sectors), -+ .seq_show = bfqg_print_stat_recursive, -+ }, -+ { -+ .name = "bfq.io_service_bytes_recursive", -+ .private = offsetof(struct bfq_group, stats.service_bytes), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_serviced_recursive", -+ .private = offsetof(struct bfq_group, stats.serviced), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_service_time_recursive", -+ .private = offsetof(struct bfq_group, stats.service_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_wait_time_recursive", -+ .private = offsetof(struct bfq_group, stats.wait_time), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_merged_recursive", -+ .private = offsetof(struct bfq_group, stats.merged), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.io_queued_recursive", -+ .private = offsetof(struct bfq_group, stats.queued), -+ .seq_show = bfqg_print_rwstat_recursive, -+ }, -+ { -+ .name = "bfq.avg_queue_size", -+ .seq_show = bfqg_print_avg_queue_size, -+ }, -+ { -+ .name = "bfq.group_wait_time", -+ .private = offsetof(struct bfq_group, stats.group_wait_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.idle_time", -+ .private = 
offsetof(struct bfq_group, stats.idle_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.empty_time", -+ .private = offsetof(struct bfq_group, stats.empty_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.dequeue", -+ .private = offsetof(struct bfq_group, stats.dequeue), -+ .seq_show = bfqg_print_stat, -+ }, -+ { -+ .name = "bfq.unaccounted_time", -+ .private = offsetof(struct bfq_group, stats.unaccounted_time), -+ .seq_show = bfqg_print_stat, -+ }, -+ { } /* terminate */ -+}; -+ -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfqio_files_dfl, -+ .legacy_cftypes = bfqio_files, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+}; -+ -+#else -+ -+static void bfq_init_entity(struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ entity->weight = entity->new_weight; -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) { -+ bfqq->ioprio = bfqq->new_ioprio; -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ } -+ entity->sched_data = &bfqg->sched_data; -+} -+ -+static struct bfq_group * -+bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ return bfqd->root_group; -+} -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_entity *entity, -+ struct bfq_group *bfqg) -+{ -+} -+ -+static void bfq_end_wr_async(struct bfq_data *bfqd) -+{ -+ bfq_end_wr_async_queues(bfqd, bfqd->root_group); -+} -+ -+static void bfq_disconnect_groups(struct bfq_data *bfqd) -+{ -+ bfq_put_async_queues(bfqd, bfqd->root_group); -+} -+ -+static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -+ struct blkcg 
*blkcg) -+{ -+ return bfqd->root_group; -+} -+ -+static struct bfq_group * -+bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) -+{ -+ struct bfq_group *bfqg; -+ int i; -+ -+ bfqg = kmalloc_node(sizeof(*bfqg), GFP_KERNEL | __GFP_ZERO, node); -+ if (!bfqg) -+ return NULL; -+ -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ bfqg->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ -+ return bfqg; -+} -+#endif -diff --git a/block/bfq-ioc.c b/block/bfq-ioc.c -new file mode 100644 -index 0000000..fb7bb8f ---- /dev/null -+++ b/block/bfq-ioc.c -@@ -0,0 +1,36 @@ -+/* -+ * BFQ: I/O context handling. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+/** -+ * icq_to_bic - convert iocontext queue structure to bfq_io_cq. -+ * @icq: the iocontext queue. -+ */ -+static struct bfq_io_cq *icq_to_bic(struct io_cq *icq) -+{ -+ /* bic->icq is the first member, %NULL will convert to %NULL */ -+ return container_of(icq, struct bfq_io_cq, icq); -+} -+ -+/** -+ * bfq_bic_lookup - search into @ioc a bic associated to @bfqd. -+ * @bfqd: the lookup key. -+ * @ioc: the io_context of the process doing I/O. -+ * -+ * Queue lock must be held. -+ */ -+static struct bfq_io_cq *bfq_bic_lookup(struct bfq_data *bfqd, -+ struct io_context *ioc) -+{ -+ if (ioc) -+ return icq_to_bic(ioc_lookup_icq(ioc, bfqd->queue)); -+ return NULL; -+} -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -new file mode 100644 -index 0000000..85e2169 ---- /dev/null -+++ b/block/bfq-iosched.c -@@ -0,0 +1,3763 @@ -+/* -+ * Budget Fair Queueing (BFQ) disk scheduler. -+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ * -+ * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ -+ * file. 
-+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based on -+ * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, -+ * measured in number of sectors, to processes instead of time slices. The -+ * device is not granted to the in-service process for a given time slice, -+ * but until it has exhausted its assigned budget. This change from the time -+ * to the service domain allows BFQ to distribute the device throughput -+ * among processes as desired, without any distortion due to ZBR, workload -+ * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, -+ * called B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated to processes. Thanks to the -+ * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to -+ * I/O-bound processes issuing sequential requests (to boost the -+ * throughput), and yet guarantee a low latency to interactive and soft -+ * real-time applications. -+ * -+ * BFQ is described in [1], where also a reference to the initial, more -+ * theoretical paper on BFQ can be found. The interested reader can find -+ * in the latter paper full details on the main algorithm, as well as -+ * formulas of the guarantees and formal proofs of all the properties. -+ * With respect to the version of BFQ presented in these papers, this -+ * implementation adds a few more heuristics, such as the one that -+ * guarantees a low latency to soft real-time applications, and a -+ * hierarchical extension based on H-WF2Q+. -+ * -+ * B-WF2Q+ is based on WF2Q+, that is described in [2], together with -+ * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) -+ * complexity derives from the one introduced with EEVDF in [3]. -+ * -+ * [1] P. Valente and M. 
Andreolini, ``Improving Application Responsiveness -+ * with the BFQ Disk I/O Scheduler'', -+ * Proceedings of the 5th Annual International Systems and Storage -+ * Conference (SYSTOR '12), June 2012. -+ * -+ * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf -+ * -+ * [2] Jon C.R. Bennett and H. Zhang, ``Hierarchical Packet Fair Queueing -+ * Algorithms,'' IEEE/ACM Transactions on Networking, 5(5):675-689, -+ * Oct 1997. -+ * -+ * http://www.cs.cmu.edu/~hzhang/papers/TON-97-Oct.ps.gz -+ * -+ * [3] I. Stoica and H. Abdel-Wahab, ``Earliest Eligible Virtual Deadline -+ * First: A Flexible and Accurate Mechanism for Proportional Share -+ * Resource Allocation,'' technical report. -+ * -+ * http://www.cs.berkeley.edu/~istoica/papers/eevdf-tr-95.pdf -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "bfq.h" -+#include "blk.h" -+ -+/* Expiration time of sync (0) and async (1) requests, in jiffies. */ -+static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; -+ -+/* Maximum backwards seek, in KiB. */ -+static const int bfq_back_max = 16 * 1024; -+ -+/* Penalty of a backwards seek, in number of sectors. */ -+static const int bfq_back_penalty = 2; -+ -+/* Idling period duration, in jiffies. */ -+static int bfq_slice_idle = HZ / 125; -+ -+/* Minimum number of assigned budgets for which stats are safe to compute. */ -+static const int bfq_stats_min_budgets = 194; -+ -+/* Default maximum budget values, in sectors and number of requests. */ -+static const int bfq_default_max_budget = 16 * 1024; -+static const int bfq_max_budget_async_rq = 4; -+ -+/* -+ * Async to sync throughput distribution is controlled as follows: -+ * when an async request is served, the entity is charged the number -+ * of sectors of the request, multiplied by the factor below -+ */ -+static const int bfq_async_charge_factor = 10; -+ -+/* Default timeout values, in jiffies, approximating CFQ defaults. 
*/ -+static const int bfq_timeout_sync = HZ / 8; -+static int bfq_timeout_async = HZ / 25; -+ -+struct kmem_cache *bfq_pool; -+ -+/* Below this threshold (in ms), we consider thinktime immediate. */ -+#define BFQ_MIN_TT 2 -+ -+/* hw_tag detection: parallel requests threshold and min samples needed. */ -+#define BFQ_HW_QUEUE_THRESHOLD 4 -+#define BFQ_HW_QUEUE_SAMPLES 32 -+ -+#define BFQQ_SEEK_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) -+ -+/* Min samples used for peak rate estimation (for autotuning). */ -+#define BFQ_PEAK_RATE_SAMPLES 32 -+ -+/* Shift used for peak rate fixed precision calculations. */ -+#define BFQ_RATE_SHIFT 16 -+ -+/* -+ * By default, BFQ computes the duration of the weight raising for -+ * interactive applications automatically, using the following formula: -+ * duration = (R / r) * T, where r is the peak rate of the device, and -+ * R and T are two reference parameters. -+ * In particular, R is the peak rate of the reference device (see below), -+ * and T is a reference time: given the systems that are likely to be -+ * installed on the reference device according to its speed class, T is -+ * about the maximum time needed, under BFQ and while reading two files in -+ * parallel, to load typical large applications on these systems. -+ * In practice, the slower/faster the device at hand is, the more/less it -+ * takes to load applications with respect to the reference device. -+ * Accordingly, the longer/shorter BFQ grants weight raising to interactive -+ * applications. -+ * -+ * BFQ uses four different reference pairs (R, T), depending on: -+ * . whether the device is rotational or non-rotational; -+ * . whether the device is slow, such as old or portable HDDs, as well as -+ * SD cards, or fast, such as newer HDDs and SSDs. -+ * -+ * The device's speed class is dynamically (re)detected in -+ * bfq_update_peak_rate() every time the estimated peak rate is updated. 
-+ * -+ * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] -+ * are the reference values for a slow/fast rotational device, whereas -+ * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for -+ * a slow/fast non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. -+ * Both the reference peak rates and the thresholds are measured in -+ * sectors/usec, left-shifted by BFQ_RATE_SHIFT. -+ */ -+static int R_slow[2] = {1536, 10752}; -+static int R_fast[2] = {17415, 34791}; -+/* -+ * To improve readability, a conversion function is used to initialize the -+ * following arrays, which entails that they can be initialized only in a -+ * function. -+ */ -+static int T_slow[2]; -+static int T_fast[2]; -+static int device_speed_thresh[2]; -+ -+#define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ -+ { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) -+ -+#define RQ_BIC(rq) ((struct bfq_io_cq *) (rq)->elv.priv[0]) -+#define RQ_BFQQ(rq) ((rq)->elv.priv[1]) -+ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd); -+ -+#include "bfq-ioc.c" -+#include "bfq-sched.c" -+#include "bfq-cgroup.c" -+ -+#define bfq_class_idle(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_IDLE) -+#define bfq_class_rt(bfqq) ((bfqq)->ioprio_class == IOPRIO_CLASS_RT) -+ -+#define bfq_sample_valid(samples) ((samples) > 80) -+ -+/* -+ * We regard a request as SYNC, if either it's a read or has the SYNC bit -+ * set (in which case it could also be a direct WRITE). -+ */ -+static int bfq_bio_sync(struct bio *bio) -+{ -+ if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) -+ return 1; -+ -+ return 0; -+} -+ -+/* -+ * Scheduler run of queue, if there are requests pending and no one in the -+ * driver that will restart queueing. 
-+ */ -+static void bfq_schedule_dispatch(struct bfq_data *bfqd) -+{ -+ if (bfqd->queued != 0) { -+ bfq_log(bfqd, "schedule dispatch"); -+ kblockd_schedule_work(&bfqd->unplug_work); -+ } -+} -+ -+/* -+ * Lifted from AS - choose which of rq1 and rq2 that is best served now. -+ * We choose the request that is closesr to the head right now. Distance -+ * behind the head is penalized and only allowed to a certain extent. -+ */ -+static struct request *bfq_choose_req(struct bfq_data *bfqd, -+ struct request *rq1, -+ struct request *rq2, -+ sector_t last) -+{ -+ sector_t s1, s2, d1 = 0, d2 = 0; -+ unsigned long back_max; -+#define BFQ_RQ1_WRAP 0x01 /* request 1 wraps */ -+#define BFQ_RQ2_WRAP 0x02 /* request 2 wraps */ -+ unsigned int wrap = 0; /* bit mask: requests behind the disk head? */ -+ -+ if (!rq1 || rq1 == rq2) -+ return rq2; -+ if (!rq2) -+ return rq1; -+ -+ if (rq_is_sync(rq1) && !rq_is_sync(rq2)) -+ return rq1; -+ else if (rq_is_sync(rq2) && !rq_is_sync(rq1)) -+ return rq2; -+ if ((rq1->cmd_flags & REQ_META) && !(rq2->cmd_flags & REQ_META)) -+ return rq1; -+ else if ((rq2->cmd_flags & REQ_META) && !(rq1->cmd_flags & REQ_META)) -+ return rq2; -+ -+ s1 = blk_rq_pos(rq1); -+ s2 = blk_rq_pos(rq2); -+ -+ /* -+ * By definition, 1KiB is 2 sectors. -+ */ -+ back_max = bfqd->bfq_back_max * 2; -+ -+ /* -+ * Strict one way elevator _except_ in the case where we allow -+ * short backward seeks which are biased as twice the cost of a -+ * similar forward seek. -+ */ -+ if (s1 >= last) -+ d1 = s1 - last; -+ else if (s1 + back_max >= last) -+ d1 = (last - s1) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ1_WRAP; -+ -+ if (s2 >= last) -+ d2 = s2 - last; -+ else if (s2 + back_max >= last) -+ d2 = (last - s2) * bfqd->bfq_back_penalty; -+ else -+ wrap |= BFQ_RQ2_WRAP; -+ -+ /* Found required data */ -+ -+ /* -+ * By doing switch() on the bit mask "wrap" we avoid having to -+ * check two variables for all permutations: --> faster! 
-+ */ -+ switch (wrap) { -+ case 0: /* common case for CFQ: rq1 and rq2 not wrapped */ -+ if (d1 < d2) -+ return rq1; -+ else if (d2 < d1) -+ return rq2; -+ -+ if (s1 >= s2) -+ return rq1; -+ else -+ return rq2; -+ -+ case BFQ_RQ2_WRAP: -+ return rq1; -+ case BFQ_RQ1_WRAP: -+ return rq2; -+ case (BFQ_RQ1_WRAP|BFQ_RQ2_WRAP): /* both rqs wrapped */ -+ default: -+ /* -+ * Since both rqs are wrapped, -+ * start with the one that's further behind head -+ * (--> only *one* back seek required), -+ * since back seek takes more time than forward. -+ */ -+ if (s1 <= s2) -+ return rq1; -+ else -+ return rq2; -+ } -+} -+ -+/* -+ * Tell whether there are active queues or groups with differentiated weights. -+ */ -+static bool bfq_differentiated_weights(struct bfq_data *bfqd) -+{ -+ /* -+ * For weights to differ, at least one of the trees must contain -+ * at least two nodes. -+ */ -+ return (!RB_EMPTY_ROOT(&bfqd->queue_weights_tree) && -+ (bfqd->queue_weights_tree.rb_node->rb_left || -+ bfqd->queue_weights_tree.rb_node->rb_right) -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ ) || -+ (!RB_EMPTY_ROOT(&bfqd->group_weights_tree) && -+ (bfqd->group_weights_tree.rb_node->rb_left || -+ bfqd->group_weights_tree.rb_node->rb_right) -+#endif -+ ); -+} -+ -+/* -+ * The following function returns true if every queue must receive the -+ * same share of the throughput (this condition is used when deciding -+ * whether idling may be disabled, see the comments in the function -+ * bfq_bfqq_may_idle()). -+ * -+ * Such a scenario occurs when: -+ * 1) all active queues have the same weight, -+ * 2) all active groups at the same level in the groups tree have the same -+ * weight, -+ * 3) all active groups at the same level in the groups tree have the same -+ * number of children. -+ * -+ * Unfortunately, keeping the necessary state for evaluating exactly the -+ * above symmetry conditions would be quite complex and time-consuming. 
-+ * Therefore this function evaluates, instead, the following stronger -+ * sub-conditions, for which it is much easier to maintain the needed -+ * state: -+ * 1) all active queues have the same weight, -+ * 2) all active groups have the same weight, -+ * 3) all active groups have at most one active child each. -+ * In particular, the last two conditions are always true if hierarchical -+ * support and the cgroups interface are not enabled, thus no state needs -+ * to be maintained in this case. -+ */ -+static bool bfq_symmetric_scenario(struct bfq_data *bfqd) -+{ -+ return -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ !bfqd->active_numerous_groups && -+#endif -+ !bfq_differentiated_weights(bfqd); -+} -+ -+/* -+ * If the weight-counter tree passed as input contains no counter for -+ * the weight of the input entity, then add that counter; otherwise just -+ * increment the existing counter. -+ * -+ * Note that weight-counter trees contain few nodes in mostly symmetric -+ * scenarios. For example, if all queues have the same weight, then the -+ * weight-counter tree for the queues may contain at most one node. -+ * This holds even if low_latency is on, because weight-raised queues -+ * are not inserted in the tree. -+ * In most scenarios, the rate at which nodes are created/destroyed -+ * should be low too. -+ */ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ struct rb_node **new = &(root->rb_node), *parent = NULL; -+ -+ /* -+ * Do not insert if the entity is already associated with a -+ * counter, which happens if: -+ * 1) the entity is associated with a queue, -+ * 2) a request arrival has caused the queue to become both -+ * non-weight-raised, and hence change its weight, and -+ * backlogged; in this respect, each of the two events -+ * causes an invocation of this function, -+ * 3) this is the invocation of this function caused by the -+ * second event. 
This second invocation is actually useless, -+ * and we handle this fact by exiting immediately. More -+ * efficient or clearer solutions might possibly be adopted. -+ */ -+ if (entity->weight_counter) -+ return; -+ -+ while (*new) { -+ struct bfq_weight_counter *__counter = container_of(*new, -+ struct bfq_weight_counter, -+ weights_node); -+ parent = *new; -+ -+ if (entity->weight == __counter->weight) { -+ entity->weight_counter = __counter; -+ goto inc_counter; -+ } -+ if (entity->weight < __counter->weight) -+ new = &((*new)->rb_left); -+ else -+ new = &((*new)->rb_right); -+ } -+ -+ entity->weight_counter = kzalloc(sizeof(struct bfq_weight_counter), -+ GFP_ATOMIC); -+ entity->weight_counter->weight = entity->weight; -+ rb_link_node(&entity->weight_counter->weights_node, parent, new); -+ rb_insert_color(&entity->weight_counter->weights_node, root); -+ -+inc_counter: -+ entity->weight_counter->num_active++; -+} -+ -+/* -+ * Decrement the weight counter associated with the entity, and, if the -+ * counter reaches 0, remove the counter from the tree. -+ * See the comments to the function bfq_weights_tree_add() for considerations -+ * about overhead. 
-+ */ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root) -+{ -+ if (!entity->weight_counter) -+ return; -+ -+ BUG_ON(RB_EMPTY_ROOT(root)); -+ BUG_ON(entity->weight_counter->weight != entity->weight); -+ -+ BUG_ON(!entity->weight_counter->num_active); -+ entity->weight_counter->num_active--; -+ if (entity->weight_counter->num_active > 0) -+ goto reset_entity_pointer; -+ -+ rb_erase(&entity->weight_counter->weights_node, root); -+ kfree(entity->weight_counter); -+ -+reset_entity_pointer: -+ entity->weight_counter = NULL; -+} -+ -+static struct request *bfq_find_next_rq(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct rb_node *rbnext = rb_next(&last->rb_node); -+ struct rb_node *rbprev = rb_prev(&last->rb_node); -+ struct request *next = NULL, *prev = NULL; -+ -+ BUG_ON(RB_EMPTY_NODE(&last->rb_node)); -+ -+ if (rbprev) -+ prev = rb_entry_rq(rbprev); -+ -+ if (rbnext) -+ next = rb_entry_rq(rbnext); -+ else { -+ rbnext = rb_first(&bfqq->sort_list); -+ if (rbnext && rbnext != &last->rb_node) -+ next = rb_entry_rq(rbnext); -+ } -+ -+ return bfq_choose_req(bfqd, next, prev, blk_rq_pos(last)); -+} -+ -+/* see the definition of bfq_async_charge_factor for details */ -+static unsigned long bfq_serv_to_charge(struct request *rq, -+ struct bfq_queue *bfqq) -+{ -+ return blk_rq_sectors(rq) * -+ (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * -+ bfq_async_charge_factor)); -+} -+ -+/** -+ * bfq_updated_next_req - update the queue after a new next_rq selection. -+ * @bfqd: the device data the queue belongs to. -+ * @bfqq: the queue to update. -+ * -+ * If the first request of a queue changes we make sure that the queue -+ * has enough budget to serve at least its first request (if the -+ * request has grown). We do this because if the queue has not enough -+ * budget for its first request, it has to go through two dispatch -+ * rounds to actually get it dispatched. 
-+ */ -+static void bfq_updated_next_req(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct request *next_rq = bfqq->next_rq; -+ unsigned long new_budget; -+ -+ if (!next_rq) -+ return; -+ -+ if (bfqq == bfqd->in_service_queue) -+ /* -+ * In order not to break guarantees, budgets cannot be -+ * changed after an entity has been selected. -+ */ -+ return; -+ -+ BUG_ON(entity->tree != &st->active); -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ -+ new_budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ if (entity->budget != new_budget) { -+ entity->budget = new_budget; -+ bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", -+ new_budget); -+ bfq_activate_bfqq(bfqd, bfqq); -+ } -+} -+ -+static unsigned int bfq_wr_duration(struct bfq_data *bfqd) -+{ -+ u64 dur; -+ -+ if (bfqd->bfq_wr_max_time > 0) -+ return bfqd->bfq_wr_max_time; -+ -+ dur = bfqd->RT_prod; -+ do_div(dur, bfqd->peak_rate); -+ -+ return dur; -+} -+ -+/* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ -+static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *item; -+ struct hlist_node *n; -+ -+ hlist_for_each_entry_safe(item, n, &bfqd->burst_list, burst_list_node) -+ hlist_del_init(&item->burst_list_node); -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+ bfqd->burst_size = 1; -+} -+ -+/* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -+static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ /* Increment burst size to take into account also bfqq */ -+ bfqd->burst_size++; -+ -+ if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { -+ struct bfq_queue *pos, *bfqq_item; -+ struct hlist_node *n; -+ -+ /* -+ * Enough queues have been activated shortly after each -+ * other to consider 
this burst as large. -+ */ -+ bfqd->large_burst = true; -+ -+ /* -+ * We can now mark all queues in the burst list as -+ * belonging to a large burst. -+ */ -+ hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -+ burst_list_node) -+ bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * From now on, and until the current burst finishes, any -+ * new queue being activated shortly after the last queue -+ * was inserted in the burst can be immediately marked as -+ * belonging to a large burst. So the burst list is not -+ * needed any more. Remove it. -+ */ -+ hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, -+ burst_list_node) -+ hlist_del_init(&pos->burst_list_node); -+ } else /* burst not yet large: add bfqq to the burst list */ -+ hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); -+} -+ -+/* -+ * If many queues happen to become active shortly after each other, then, -+ * to help the processes associated to these queues get their job done as -+ * soon as possible, it is usually better to not grant either weight-raising -+ * or device idling to these queues. In this comment we describe, firstly, -+ * the reasons why this fact holds, and, secondly, the next function, which -+ * implements the main steps needed to properly mark these queues so that -+ * they can then be treated in a different way. -+ * -+ * As for the terminology, we say that a queue becomes active, i.e., -+ * switches from idle to backlogged, either when it is created (as a -+ * consequence of the arrival of an I/O request), or, if already existing, -+ * when a new request for the queue arrives while the queue is idle. -+ * Bursts of activations, i.e., activations of different queues occurring -+ * shortly after each other, are typically caused by services or applications -+ * that spawn or reactivate many parallel threads/processes. Examples are -+ * systemd during boot or git grep. 
-+ * -+ * These services or applications benefit mostly from a high throughput: -+ * the quicker the requests of the activated queues are cumulatively served, -+ * the sooner the target job of these queues gets completed. As a consequence, -+ * weight-raising any of these queues, which also implies idling the device -+ * for it, is almost always counterproductive: in most cases it just lowers -+ * throughput. -+ * -+ * On the other hand, a burst of activations may be also caused by the start -+ * of an application that does not consist in a lot of parallel I/O-bound -+ * threads. In fact, with a complex application, the burst may be just a -+ * consequence of the fact that several processes need to be executed to -+ * start-up the application. To start an application as quickly as possible, -+ * the best thing to do is to privilege the I/O related to the application -+ * with respect to all other I/O. Therefore, the best strategy to start as -+ * quickly as possible an application that causes a burst of activations is -+ * to weight-raise all the queues activated during the burst. This is the -+ * exact opposite of the best strategy for the other type of bursts. -+ * -+ * In the end, to take the best action for each of the two cases, the two -+ * types of bursts need to be distinguished. Fortunately, this seems -+ * relatively easy to do, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that bursts with a larger size -+ * than that threshold are apparently caused only by services or commands -+ * such as systemd or git grep. For brevity, hereafter we call just 'large' -+ * these bursts. BFQ *does not* weight-raise queues whose activations occur -+ * in a large burst. In addition, for each of these queues BFQ performs or -+ * does not perform idling depending on which choice boosts the throughput -+ * most. The exact choice depends on the device and request pattern at -+ * hand. 
-+ * -+ * Turning back to the next function, it implements all the steps needed -+ * to detect the occurrence of a large burst and to properly mark all the -+ * queues belonging to it (so that they can then be treated in a different -+ * way). This goal is achieved by maintaining a special "burst list" that -+ * holds, temporarily, the queues that belong to the burst in progress. The -+ * list is then used to mark these queues as belonging to a large burst if -+ * the burst does become large. The main steps are the following. -+ * -+ * . when the very first queue is activated, the queue is inserted into the -+ * list (as it could be the first queue in a possible burst) -+ * -+ * . if the current burst has not yet become large, and a queue Q that does -+ * not yet belong to the burst is activated shortly after the last time -+ * at which a new queue entered the burst list, then the function appends -+ * Q to the burst list -+ * -+ * . if, as a consequence of the previous step, the burst size reaches -+ * the large-burst threshold, then -+ * -+ * . all the queues in the burst list are marked as belonging to a -+ * large burst -+ * -+ * . the burst list is deleted; in fact, the burst list already served -+ * its purpose (keeping temporarily track of the queues in a burst, -+ * so as to be able to mark them as belonging to a large burst in the -+ * previous sub-step), and now is not needed any more -+ * -+ * . the device enters a large-burst mode -+ * -+ * . if a queue Q that does not belong to the burst is activated while -+ * the device is in large-burst mode and shortly after the last time -+ * at which a queue either entered the burst list or was marked as -+ * belonging to the current large burst, then Q is immediately marked -+ * as belonging to a large burst. -+ * -+ * . 
if a queue Q that does not belong to the burst is activated a while -+ * later, i.e., not shortly after, than the last time at which a queue -+ * either entered the burst list or was marked as belonging to the -+ * current large burst, then the current burst is deemed as finished and: -+ * -+ * . the large-burst mode is reset if set -+ * -+ * . the burst list is emptied -+ * -+ * . Q is inserted in the burst list, as Q may be the first queue -+ * in a possible new burst (then the burst list contains just Q -+ * after this step). -+ */ -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool idle_for_long_time) -+{ -+ /* -+ * If bfqq happened to be activated in a burst, but has been idle -+ * for at least as long as an interactive queue, then we assume -+ * that, in the overall I/O initiated in the burst, the I/O -+ * associated to bfqq is finished. So bfqq does not need to be -+ * treated as a queue belonging to a burst anymore. Accordingly, -+ * we reset bfqq's in_large_burst flag if set, and remove bfqq -+ * from the burst list if it's there. We do not decrement instead -+ * burst_size, because the fact that bfqq does not need to belong -+ * to the burst list any more does not invalidate the fact that -+ * bfqq may have been activated during the current burst. -+ */ -+ if (idle_for_long_time) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ /* -+ * If bfqq is already in the burst list or is part of a large -+ * burst, then there is nothing else to do. -+ */ -+ if (!hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq)) -+ return; -+ -+ /* -+ * If bfqq's activation happens late enough, then the current -+ * burst is finished, and related data structures must be reset. -+ * -+ * In this respect, consider the special case where bfqq is the very -+ * first queue being activated. In this case, last_ins_in_burst is -+ * not yet significant when we get here. 
But it is easy to verify -+ * that, whether or not the following condition is true, bfqq will -+ * end up being inserted into the burst list. In particular the -+ * list will happen to contain only bfqq. And this is exactly what -+ * has to happen, as bfqq may be the first queue in a possible -+ * burst. -+ */ -+ if (time_is_before_jiffies(bfqd->last_ins_in_burst + -+ bfqd->bfq_burst_interval)) { -+ bfqd->large_burst = false; -+ bfq_reset_burst_list(bfqd, bfqq); -+ return; -+ } -+ -+ /* -+ * If we get here, then bfqq is being activated shortly after the -+ * last queue. So, if the current burst is also large, we can mark -+ * bfqq as belonging to this large burst immediately. -+ */ -+ if (bfqd->large_burst) { -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ return; -+ } -+ -+ /* -+ * If we get here, then a large-burst state has not yet been -+ * reached, but bfqq is being activated shortly after the last -+ * queue. Then we add bfqq to the burst. -+ */ -+ bfq_add_to_burst(bfqd, bfqq); -+} -+ -+static void bfq_add_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *next_rq, *prev; -+ unsigned long old_wr_coeff = bfqq->wr_coeff; -+ bool interactive = false; -+ -+ bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); -+ bfqq->queued[rq_is_sync(rq)]++; -+ bfqd->queued++; -+ -+ elv_rb_add(&bfqq->sort_list, rq); -+ -+ /* -+ * Check if this request is a better next-serve candidate. 
-+ */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ -+ if (!bfq_bfqq_busy(bfqq)) { -+ bool soft_rt, in_burst, -+ idle_for_long_time = time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, -+ rq->cmd_flags); -+#endif -+ if (bfq_bfqq_sync(bfqq)) { -+ bool already_in_burst = -+ !hlist_unhashed(&bfqq->burst_list_node) || -+ bfq_bfqq_in_large_burst(bfqq); -+ bfq_handle_burst(bfqd, bfqq, idle_for_long_time); -+ /* -+ * If bfqq was not already in the current burst, -+ * then, at this point, bfqq either has been -+ * added to the current burst or has caused the -+ * current burst to terminate. In particular, in -+ * the second case, bfqq has become the first -+ * queue in a possible new burst. -+ * In both cases last_ins_in_burst needs to be -+ * moved forward. -+ */ -+ if (!already_in_burst) -+ bfqd->last_ins_in_burst = jiffies; -+ } -+ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ interactive = !in_burst && idle_for_long_time; -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (time_before(jiffies, -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle)) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ } -+ -+ if (!bfqd->low_latency) -+ goto add_bfqq_busy; -+ -+ /* -+ * If the queue: -+ * - is not being boosted, -+ * - has been idle for enough time, -+ * - is not a sync queue or is linked to a bfq_io_cq (it is -+ * shared "for its nature" or it is not shared and its -+ * requests have not 
been redirected to a shared queue) -+ * start a weight-raising period. -+ */ -+ if (old_wr_coeff == 1 && (interactive || soft_rt) && -+ (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ if (interactive) -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ else -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ else if (in_burst || -+ (bfqq->wr_cur_max_time == -+ bfqd->bfq_wr_rt_max_time && -+ !soft_rt)) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (time_before( -+ bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time, -+ jiffies + -+ bfqd->bfq_wr_rt_max_time) && -+ soft_rt) { -+ /* -+ * -+ * The remaining weight-raising time is lower -+ * than bfqd->bfq_wr_rt_max_time, which means -+ * that the application is enjoying weight -+ * raising either because deemed soft-rt in -+ * the near past, or because deemed interactive -+ * a long ago. -+ * In both cases, resetting now the current -+ * remaining weight-raising time for the -+ * application to the weight-raising duration -+ * for soft rt applications would not cause any -+ * latency increase for the application (as the -+ * new duration would be higher than the -+ * remaining time). -+ * -+ * In addition, the application is now meeting -+ * the requirements for being deemed soft rt. -+ * In the end we can correctly and safely -+ * (re)charge the weight-raising duration for -+ * the application with the weight-raising -+ * duration for soft rt applications. 
-+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. -+ */ -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ } -+ if (old_wr_coeff != bfqq->wr_coeff) -+ entity->prio_changed = 1; -+add_bfqq_busy: -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ } else { -+ if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && -+ time_is_before_jiffies( -+ bfqq->last_wr_start_finish + -+ bfqd->bfq_wr_min_inter_arr_async)) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ -+ bfqd->wr_busy_queues++; -+ entity->prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "non-idle wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ if (prev != bfqq->next_rq) -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (bfqd->low_latency && -+ (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) -+ bfqq->last_wr_start_finish = jiffies; -+} -+ -+static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, -+ struct bio *bio) -+{ 
-+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return NULL; -+ -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ if (bfqq) -+ return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); -+ -+ return NULL; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ bfqd->rq_in_driver++; -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", -+ (unsigned long long) bfqd->last_position); -+} -+ -+static void bfq_deactivate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ -+ BUG_ON(bfqd->rq_in_driver == 0); -+ bfqd->rq_in_driver--; -+} -+ -+static void bfq_remove_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ const int sync = rq_is_sync(rq); -+ -+ if (bfqq->next_rq == rq) { -+ bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); -+ bfq_updated_next_req(bfqd, bfqq); -+ } -+ -+ if (rq->queuelist.prev != &rq->queuelist) -+ list_del_init(&rq->queuelist); -+ BUG_ON(bfqq->queued[sync] == 0); -+ bfqq->queued[sync]--; -+ bfqd->queued--; -+ elv_rb_del(&bfqq->sort_list, rq); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) -+ bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ /* -+ * Remove queue from request-position tree as it is empty. 
-+ */ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ } -+ -+ if (rq->cmd_flags & REQ_META) { -+ BUG_ON(bfqq->meta_pending == 0); -+ bfqq->meta_pending--; -+ } -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); -+#endif -+} -+ -+static int bfq_merge(struct request_queue *q, struct request **req, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct request *__rq; -+ -+ __rq = bfq_find_rq_fmerge(bfqd, bio); -+ if (__rq && elv_rq_merge_ok(__rq, bio)) { -+ *req = __rq; -+ return ELEVATOR_FRONT_MERGE; -+ } -+ -+ return ELEVATOR_NO_MERGE; -+} -+ -+static void bfq_merged_request(struct request_queue *q, struct request *req, -+ int type) -+{ -+ if (type == ELEVATOR_FRONT_MERGE && -+ rb_prev(&req->rb_node) && -+ blk_rq_pos(req) < -+ blk_rq_pos(container_of(rb_prev(&req->rb_node), -+ struct request, rb_node))) { -+ struct bfq_queue *bfqq = RQ_BFQQ(req); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ struct request *prev, *next_rq; -+ -+ /* Reposition request in its sort_list */ -+ elv_rb_del(&bfqq->sort_list, req); -+ elv_rb_add(&bfqq->sort_list, req); -+ /* Choose next request to be served for bfqq */ -+ prev = bfqq->next_rq; -+ next_rq = bfq_choose_req(bfqd, bfqq->next_rq, req, -+ bfqd->last_position); -+ BUG_ON(!next_rq); -+ bfqq->next_rq = next_rq; -+ } -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static void bfq_bio_merged(struct request_queue *q, struct request *req, -+ struct bio *bio) -+{ -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -+} -+#endif -+ -+static void bfq_merged_requests(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *next_bfqq = RQ_BFQQ(next); -+ -+ /* -+ * If next and rq belong to the same bfq_queue and next is older -+ * than rq, then reposition rq in the fifo (by substituting next -+ * with rq). 
Otherwise, if next and rq belong to different -+ * bfq_queues, never reposition rq: in fact, we would have to -+ * reposition it with respect to next's position in its own fifo, -+ * which would most certainly be too expensive with respect to -+ * the benefits. -+ */ -+ if (bfqq == next_bfqq && -+ !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -+ time_before(next->fifo_time, rq->fifo_time)) { -+ list_del_init(&rq->queuelist); -+ list_replace_init(&next->queuelist, &rq->queuelist); -+ rq->fifo_time = next->fifo_time; -+ } -+ -+ if (bfqq->next_rq == next) -+ bfqq->next_rq = rq; -+ -+ bfq_remove_request(next); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); -+#endif -+} -+ -+/* Must be called with bfqq != NULL */ -+static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) -+{ -+ BUG_ON(!bfqq); -+ if (bfq_bfqq_busy(bfqq)) -+ bfqq->bfqd->wr_busy_queues--; -+ bfqq->wr_coeff = 1; -+ bfqq->wr_cur_max_time = 0; -+ /* Trigger a weight change on the next activation of the queue */ -+ bfqq->entity.prio_changed = 1; -+} -+ -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ if (bfqg->async_bfqq[i][j]) -+ bfq_bfqq_end_wr(bfqg->async_bfqq[i][j]); -+ if (bfqg->async_idle_bfqq) -+ bfq_bfqq_end_wr(bfqg->async_idle_bfqq); -+} -+ -+static void bfq_end_wr(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) -+ bfq_bfqq_end_wr(bfqq); -+ bfq_end_wr_async(bfqd); -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+} -+ -+static int bfq_allow_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic; -+ -+ /* -+ * Disallow 
merge of a sync bio into an async request. -+ */ -+ if (bfq_bio_sync(bio) && !rq_is_sync(rq)) -+ return 0; -+ -+ /* -+ * Lookup the bfqq that this bio will be queued with. Allow -+ * merge only if rq is queued there. -+ * Queue lock is held here. -+ */ -+ bic = bfq_bic_lookup(bfqd, current->io_context); -+ if (!bic) -+ return 0; -+ -+ return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); -+} -+ -+static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ if (bfqq) { -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); -+#endif -+ bfq_mark_bfqq_must_alloc(bfqq); -+ bfq_mark_bfqq_budget_new(bfqq); -+ bfq_clear_bfqq_fifo_expire(bfqq); -+ -+ bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_in_service_queue, cur-budget = %d", -+ bfqq->entity.budget); -+ } -+ -+ bfqd->in_service_queue = bfqq; -+} -+ -+/* -+ * Get and set a new queue for service. -+ */ -+static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfq_get_next_queue(bfqd); -+ -+ __bfq_set_in_service_queue(bfqd, bfqq); -+ return bfqq; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_arm_slice_timer(struct bfq_data *bfqd) -+{ -+ struct 
bfq_queue *bfqq = bfqd->in_service_queue; -+ struct bfq_io_cq *bic; -+ unsigned long sl; -+ -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Processes have exited, don't wait. */ -+ bic = bfqd->in_service_bic; -+ if (!bic || atomic_read(&bic->icq.ioc->active_ref) == 0) -+ return; -+ -+ bfq_mark_bfqq_wait_request(bfqq); -+ -+ /* -+ * We don't want to idle for seeks, but we do want to allow -+ * fair distribution of slice time for a process doing back-to-back -+ * seeks. So allow a little bit of time for him to submit a new rq. -+ * -+ * To prevent processes with (partly) seeky workloads from -+ * being too ill-treated, grant them a small fraction of the -+ * assigned budget before reducing the waiting time to -+ * BFQ_MIN_TT. This happened to help reduce latency. -+ */ -+ sl = bfqd->bfq_slice_idle; -+ /* -+ * Unless the queue is being weight-raised or the scenario is -+ * asymmetric, grant only minimum idle time if the queue either -+ * has been seeky for long enough or has already proved to be -+ * constantly seeky. -+ */ -+ if (bfq_sample_valid(bfqq->seek_samples) && -+ ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > -+ bfq_max_budget(bfqq->bfqd) / 8) || -+ bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && -+ bfq_symmetric_scenario(bfqd)) -+ sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); -+ else if (bfqq->wr_coeff > 1) -+ sl = sl * 3; -+ bfqd->last_idling_start = ktime_get(); -+ mod_timer(&bfqd->idle_slice_timer, jiffies + sl); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); -+#endif -+ bfq_log(bfqd, "arm idle: %u/%u ms", -+ jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the disk -+ * throughput (always guaranteed with a time slice scheme as in CFQ). 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq = bfqd->in_service_queue; -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfq_clear_bfqq_budget_new(bfqq); -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * -+ timeout_coeff)); -+} -+ -+/* -+ * Move request from internal lists to the request queue dispatch list. -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ bfqd->sync_flight++; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), -+ rq->cmd_flags); -+#endif -+} -+ -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. 
-+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq) -+{ -+ struct request *rq = NULL; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ if (list_empty(&bfqq->fifo)) -+ return NULL; -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (time_before(jiffies, rq->fifo_time)) -+ return NULL; -+ -+ return rq; -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ __bfq_bfqd_reset_in_service(bfqd); -+ -+ if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * Overloading budget_timeout field to store the time -+ * at which the queue remains with no backlog; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ } else -+ bfq_activate_bfqq(bfqd, bfqq); -+} -+ -+/** -+ * __bfq_bfqq_recalc_budget - try to adapt the budget to the @bfqq behavior. -+ * @bfqd: device data. -+ * @bfqq: queue to update. -+ * @reason: reason for expiration. -+ * -+ * Handle the feedback on @bfqq budget at queue expiration. -+ * See the body for detailed comments. 
-+ */ -+static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ enum bfqq_expiration reason) -+{ -+ struct request *next_rq; -+ int budget, min_budget; -+ -+ budget = bfqq->max_budget; -+ min_budget = bfq_min_budget(bfqd); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", -+ bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last max_budg %d, min budg %d", -+ budget, bfq_min_budget(bfqd)); -+ bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", -+ bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); -+ -+ if (bfq_bfqq_sync(bfqq)) { -+ switch (reason) { -+ /* -+ * Caveat: in all the following cases we trade latency -+ * for throughput. -+ */ -+ case BFQ_BFQQ_TOO_IDLE: -+ /* -+ * This is the only case where we may reduce -+ * the budget: if there is no request of the -+ * process still waiting for completion, then -+ * we assume (tentatively) that the timer has -+ * expired because the batch of requests of -+ * the process could have been served with a -+ * smaller budget. Hence, betting that -+ * process will behave in the same way when it -+ * becomes backlogged again, we reduce its -+ * next budget. As long as we guess right, -+ * this budget cut reduces the latency -+ * experienced by the process. -+ * -+ * However, if there are still outstanding -+ * requests, then the process may have not yet -+ * issued its next request just because it is -+ * still waiting for the completion of some of -+ * the still outstanding ones. So in this -+ * subcase we do not reduce its budget, on the -+ * contrary we increase it to possibly boost -+ * the throughput, as discussed in the -+ * comments to the BUDGET_TIMEOUT case. 
-+ */ -+ if (bfqq->dispatched > 0) /* still outstanding reqs */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ else { -+ if (budget > 5 * min_budget) -+ budget -= 4 * min_budget; -+ else -+ budget = min_budget; -+ } -+ break; -+ case BFQ_BFQQ_BUDGET_TIMEOUT: -+ /* -+ * We double the budget here because: 1) it -+ * gives the chance to boost the throughput if -+ * this is not a seeky process (which may have -+ * bumped into this timeout because of, e.g., -+ * ZBR), 2) together with charge_full_budget -+ * it helps give seeky processes higher -+ * timestamps, and hence be served less -+ * frequently. -+ */ -+ budget = min(budget * 2, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_BUDGET_EXHAUSTED: -+ /* -+ * The process still has backlog, and did not -+ * let either the budget timeout or the disk -+ * idling timeout expire. Hence it is not -+ * seeky, has a short thinktime and may be -+ * happy with a higher budget too. So -+ * definitely increase the budget of this good -+ * candidate to boost the disk throughput. -+ */ -+ budget = min(budget * 4, bfqd->bfq_max_budget); -+ break; -+ case BFQ_BFQQ_NO_MORE_REQUESTS: -+ /* -+ * Leave the budget unchanged. -+ */ -+ default: -+ return; -+ } -+ } else -+ /* -+ * Async queues get always the maximum possible budget -+ * (their ability to dispatch is limited by -+ * @bfqd->bfq_max_budget_async_rq). -+ */ -+ budget = bfqd->bfq_max_budget; -+ -+ bfqq->max_budget = budget; -+ -+ if (bfqd->budgets_assigned >= bfq_stats_min_budgets && -+ !bfqd->bfq_user_max_budget) -+ bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); -+ -+ /* -+ * Make sure that we have enough budget for the next request. -+ * Since the finish time of the bfqq must be kept in sync with -+ * the budget, be sure to call __bfq_bfqq_expire() after the -+ * update. 
-+ */ -+ next_rq = bfqq->next_rq; -+ if (next_rq) -+ bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(next_rq, bfqq)); -+ else -+ bfqq->entity.budget = bfqq->max_budget; -+ -+ bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", -+ next_rq ? blk_rq_sectors(next_rq) : 0, -+ bfqq->entity.budget); -+} -+ -+static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) -+{ -+ unsigned long max_budget; -+ -+ /* -+ * The max_budget calculated when autotuning is equal to the -+ * amount of sectors transfered in timeout_sync at the -+ * estimated peak rate. -+ */ -+ max_budget = (unsigned long)(peak_rate * 1000 * -+ timeout >> BFQ_RATE_SHIFT); -+ -+ return max_budget; -+} -+ -+/* -+ * In addition to updating the peak rate, checks whether the process -+ * is "slow", and returns 1 if so. This slow flag is used, in addition -+ * to the budget timeout, to reduce the amount of service provided to -+ * seeky processes, and hence reduce their chances to lower the -+ * throughput. See the code for more details. -+ */ -+static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason) -+{ -+ u64 bw, usecs, expected, timeout; -+ ktime_t delta; -+ int update = 0; -+ -+ if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) -+ return false; -+ -+ if (compensate) -+ delta = bfqd->last_idling_start; -+ else -+ delta = ktime_get(); -+ delta = ktime_sub(delta, bfqd->last_budget_start); -+ usecs = ktime_to_us(delta); -+ -+ /* Don't trust short/unrealistic values. */ -+ if (usecs < 100 || usecs >= LONG_MAX) -+ return false; -+ -+ /* -+ * Calculate the bandwidth for the last slice. We use a 64 bit -+ * value to store the peak rate, in sectors per usec in fixed -+ * point math. We do so to have enough precision in the estimate -+ * and to avoid overflows. 
-+ */ -+ bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; -+ do_div(bw, (unsigned long)usecs); -+ -+ timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -+ -+ /* -+ * Use only long (> 20ms) intervals to filter out spikes for -+ * the peak rate estimation. -+ */ -+ if (usecs > 20000) { -+ if (bw > bfqd->peak_rate || -+ (!BFQQ_SEEKY(bfqq) && -+ reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { -+ bfq_log(bfqd, "measured bw =%llu", bw); -+ /* -+ * To smooth oscillations use a low-pass filter with -+ * alpha=7/8, i.e., -+ * new_rate = (7/8) * old_rate + (1/8) * bw -+ */ -+ do_div(bw, 8); -+ if (bw == 0) -+ return 0; -+ bfqd->peak_rate *= 7; -+ do_div(bfqd->peak_rate, 8); -+ bfqd->peak_rate += bw; -+ update = 1; -+ bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); -+ } -+ -+ update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; -+ -+ if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) -+ bfqd->peak_rate_samples++; -+ -+ if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && -+ update) { -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd->peak_rate, -+ timeout); -+ bfq_log(bfqd, "new max_budget=%d", -+ bfqd->bfq_max_budget); -+ } -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } -+ } -+ } -+ -+ /* -+ * If the process has been served for a too short time -+ * interval to let its possible sequential accesses prevail on -+ * the initial seek time needed to move the disk head on the -+ * first sector it requested, then give the process a chance -+ * and for the moment return false. 
-+ */ -+ if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) -+ return false; -+ -+ /* -+ * A process is considered ``slow'' (i.e., seeky, so that we -+ * cannot treat it fairly in the service domain, as it would -+ * slow down too much the other processes) if, when a slice -+ * ends for whatever reason, it has received service at a -+ * rate that would not be high enough to complete the budget -+ * before the budget timeout expiration. -+ */ -+ expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; -+ -+ /* -+ * Caveat: processes doing IO in the slower disk zones will -+ * tend to be slow(er) even if not seeky. And the estimated -+ * peak rate will actually be an average over the disk -+ * surface. Hence, to not be too harsh with unlucky processes, -+ * we keep a budget/3 margin of safety before declaring a -+ * process slow. -+ */ -+ return expected > (4 * bfqq->entity.budget) / 3; -+} -+ -+/* -+ * To be deemed as soft real-time, an application must meet two -+ * requirements. First, the application must not require an average -+ * bandwidth higher than the approximate bandwidth required to playback or -+ * record a compressed high-definition video. -+ * The next function is invoked on the completion of the last request of a -+ * batch, to compute the next-start time instant, soft_rt_next_start, such -+ * that, if the next request of the application does not arrive before -+ * soft_rt_next_start, then the above requirement on the bandwidth is met. -+ * -+ * The second requirement is that the request pattern of the application is -+ * isochronous, i.e., that, after issuing a request or a batch of requests, -+ * the application stops issuing new requests until all its pending requests -+ * have been completed. After that, the application may issue a new batch, -+ * and so on. 
-+ * For this reason the next function is invoked to compute -+ * soft_rt_next_start only for applications that meet this requirement, -+ * whereas soft_rt_next_start is set to infinity for applications that do -+ * not. -+ * -+ * Unfortunately, even a greedy application may happen to behave in an -+ * isochronous way if the CPU load is high. In fact, the application may -+ * stop issuing requests while the CPUs are busy serving other processes, -+ * then restart, then stop again for a while, and so on. In addition, if -+ * the disk achieves a low enough throughput with the request pattern -+ * issued by the application (e.g., because the request pattern is random -+ * and/or the device is slow), then the application may meet the above -+ * bandwidth requirement too. To prevent such a greedy application to be -+ * deemed as soft real-time, a further rule is used in the computation of -+ * soft_rt_next_start: soft_rt_next_start must be higher than the current -+ * time plus the maximum time for which the arrival of a request is waited -+ * for when a sync queue becomes idle, namely bfqd->bfq_slice_idle. -+ * This filters out greedy applications, as the latter issue instead their -+ * next request as soon as possible after the last one has been completed -+ * (in contrast, when a batch of requests is completed, a soft real-time -+ * application spends some time processing data). -+ * -+ * Unfortunately, the last filter may easily generate false positives if -+ * only bfqd->bfq_slice_idle is used as a reference time interval and one -+ * or both the following cases occur: -+ * 1) HZ is so low that the duration of a jiffy is comparable to or higher -+ * than bfqd->bfq_slice_idle. This happens, e.g., on slow devices with -+ * HZ=100. -+ * 2) jiffies, instead of increasing at a constant rate, may stop increasing -+ * for a while, then suddenly 'jump' by several units to recover the lost -+ * increments. This seems to happen, e.g., inside virtual machines. 
-+ * To address this issue, we do not use as a reference time interval just -+ * bfqd->bfq_slice_idle, but bfqd->bfq_slice_idle plus a few jiffies. In -+ * particular we add the minimum number of jiffies for which the filter -+ * seems to be quite precise also in embedded systems and KVM/QEMU virtual -+ * machines. -+ */ -+static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return max(bfqq->last_idle_bklogged + -+ HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies + bfqq->bfqd->bfq_slice_idle + 4); -+} -+ -+/* -+ * Return the largest-possible time instant such that, for as long as possible, -+ * the current time will be lower than this time instant according to the macro -+ * time_is_before_jiffies(). -+ */ -+static unsigned long bfq_infinity_from_now(unsigned long now) -+{ -+ return now + ULONG_MAX / 2; -+} -+ -+/** -+ * bfq_bfqq_expire - expire a queue. -+ * @bfqd: device owning the queue. -+ * @bfqq: the queue to expire. -+ * @compensate: if true, compensate for the time spent idling. -+ * @reason: the reason causing the expiration. -+ * -+ * -+ * If the process associated to the queue is slow (i.e., seeky), or in -+ * case of budget timeout, or, finally, if it is async, we -+ * artificially charge it an entire budget (independently of the -+ * actual service it received). As a consequence, the queue will get -+ * higher timestamps than the correct ones upon reactivation, and -+ * hence it will be rescheduled as if it had received more service -+ * than what it actually received. In the end, this class of processes -+ * will receive less service in proportion to how slowly they consume -+ * their budgets (and hence how seriously they tend to lower the -+ * throughput). -+ * -+ * In contrast, when a queue expires because it has been idling for -+ * too much or because it exhausted its budget, we do not touch the -+ * amount of service it has received. 
Hence when the queue will be -+ * reactivated and its timestamps updated, the latter will be in sync -+ * with the actual service received by the queue until expiration. -+ * -+ * Charging a full budget to the first type of queues and the exact -+ * service to the others has the effect of using the WF2Q+ policy to -+ * schedule the former on a timeslice basis, without violating the -+ * service domain guarantees of the latter. -+ */ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason) -+{ -+ bool slow; -+ -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ -+ /* -+ * Update disk peak rate for autotuning and check whether the -+ * process is slow (see bfq_update_peak_rate). -+ */ -+ slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); -+ -+ /* -+ * As above explained, 'punish' slow (i.e., seeky), timed-out -+ * and async queues, to favor sequential sync workloads. -+ * -+ * Processes doing I/O in the slower disk zones will tend to be -+ * slow(er) even if not seeky. Hence, since the estimated peak -+ * rate is actually an average over the disk surface, these -+ * processes may timeout just for bad luck. To avoid punishing -+ * them we do not charge a full budget to a process that -+ * succeeded in consuming at least 2/3 of its budget. 
-+ */ -+ if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) -+ bfq_bfqq_charge_full_budget(bfqq); -+ -+ bfqq->service_from_backlogged += bfqq->entity.service; -+ -+ if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ !bfq_bfqq_constantly_seeky(bfqq)) { -+ bfq_mark_bfqq_constantly_seeky(bfqq); -+ if (!blk_queue_nonrot(bfqd->queue)) -+ bfqd->const_seeky_busy_in_flight_queues++; -+ } -+ -+ if (reason == BFQ_BFQQ_TOO_IDLE && -+ bfqq->entity.service <= 2 * bfqq->entity.budget / 10) -+ bfq_clear_bfqq_IO_bound(bfqq); -+ -+ if (bfqd->low_latency && bfqq->wr_coeff == 1) -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) { -+ /* -+ * If we get here, and there are no outstanding requests, -+ * then the request pattern is isochronous (see the comments -+ * to the function bfq_bfqq_softrt_next_start()). Hence we -+ * can compute soft_rt_next_start. If, instead, the queue -+ * still has outstanding requests, then we have to wait -+ * for the completion of all the outstanding requests to -+ * discover whether the request pattern is actually -+ * isochronous. -+ */ -+ if (bfqq->dispatched == 0) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ else { -+ /* -+ * The application is still waiting for the -+ * completion of one or more requests: -+ * prevent it from possibly being incorrectly -+ * deemed as soft real-time by setting its -+ * soft_rt_next_start to infinity. In fact, -+ * without this assignment, the application -+ * would be incorrectly deemed as soft -+ * real-time if: -+ * 1) it issued a new request before the -+ * completion of all its in-flight -+ * requests, and -+ * 2) at that time, its soft_rt_next_start -+ * happened to be in the past. 
-+ */ -+ bfqq->soft_rt_next_start = -+ bfq_infinity_from_now(jiffies); -+ /* -+ * Schedule an update of soft_rt_next_start to when -+ * the task may be discovered to be isochronous. -+ */ -+ bfq_mark_bfqq_softrt_update(bfqq); -+ } -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, -+ slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * Increase, decrease or leave budget unchanged according to -+ * reason. -+ */ -+ __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ __bfq_bfqq_expire(bfqd, bfqq); -+} -+ -+/* -+ * Budget timeout is not implemented through a dedicated timer, but -+ * just checked on request arrivals and completions, as well as on -+ * idle timer expirations. -+ */ -+static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_budget_new(bfqq) || -+ time_before(jiffies, bfqq->budget_timeout)) -+ return false; -+ return true; -+} -+ -+/* -+ * If we expire a queue that is waiting for the arrival of a new -+ * request, we may prevent the fictitious timestamp back-shifting that -+ * allows the guarantees of the queue to be preserved (see [1] for -+ * this tricky aspect). Hence we return true only if this condition -+ * does not hold, or if the queue is slow enough to deserve only to be -+ * kicked off for preserving a high throughput. -+*/ -+static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "may_budget_timeout: wait_request %d left %d timeout %d", -+ bfq_bfqq_wait_request(bfqq), -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3, -+ bfq_bfqq_budget_timeout(bfqq)); -+ -+ return (!bfq_bfqq_wait_request(bfqq) || -+ bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3) -+ && -+ bfq_bfqq_budget_timeout(bfqq); -+} -+ -+/* -+ * For a queue that becomes empty, device idling is allowed only if -+ * this function returns true for that queue. 
As a consequence, since -+ * device idling plays a critical role for both throughput boosting -+ * and service guarantees, the return value of this function plays a -+ * critical role as well. -+ * -+ * In a nutshell, this function returns true only if idling is -+ * beneficial for throughput or, even if detrimental for throughput, -+ * idling is however necessary to preserve service guarantees (low -+ * latency, desired throughput distribution, ...). In particular, on -+ * NCQ-capable devices, this function tries to return false, so as to -+ * help keep the drives' internal queues full, whenever this helps the -+ * device boost the throughput without causing any service-guarantee -+ * issue. -+ * -+ * In more detail, the return value of this function is obtained by, -+ * first, computing a number of boolean variables that take into -+ * account throughput and service-guarantee issues, and, then, -+ * combining these variables in a logical expression. Most of the -+ * issues taken into account are not trivial. We discuss these issues -+ * while introducing the variables. -+ */ -+static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool idling_boosts_thr, idling_boosts_thr_without_issues, -+ all_queues_seeky, on_hdd_and_not_all_queues_seeky, -+ idling_needed_for_service_guarantees, -+ asymmetric_scenario; -+ -+ /* -+ * The next variable takes into account the cases where idling -+ * boosts the throughput. -+ * -+ * The value of the variable is computed considering, first, that -+ * idling is virtually always beneficial for the throughput if: -+ * (a) the device is not NCQ-capable, or -+ * (b) regardless of the presence of NCQ, the device is rotational -+ * and the request pattern for bfqq is I/O-bound and sequential. 
-+ * -+ * Secondly, and in contrast to the above item (b), idling an -+ * NCQ-capable flash-based device would not boost the -+ * throughput even with sequential I/O; rather it would lower -+ * the throughput in proportion to how fast the device -+ * is. Accordingly, the next variable is true if any of the -+ * above conditions (a) and (b) is true, and, in particular, -+ * happens to be false if bfqd is an NCQ-capable flash-based -+ * device. -+ */ -+ idling_boosts_thr = !bfqd->hw_tag || -+ (!blk_queue_nonrot(bfqd->queue) && bfq_bfqq_IO_bound(bfqq) && -+ bfq_bfqq_idle_window(bfqq)); -+ -+ /* -+ * The value of the next variable, -+ * idling_boosts_thr_without_issues, is equal to that of -+ * idling_boosts_thr, unless a special case holds. In this -+ * special case, described below, idling may cause problems to -+ * weight-raised queues. -+ * -+ * When the request pool is saturated (e.g., in the presence -+ * of write hogs), if the processes associated with -+ * non-weight-raised queues ask for requests at a lower rate, -+ * then processes associated with weight-raised queues have a -+ * higher probability to get a request from the pool -+ * immediately (or at least soon) when they need one. Thus -+ * they have a higher probability to actually get a fraction -+ * of the device throughput proportional to their high -+ * weight. This is especially true with NCQ-capable drives, -+ * which enqueue several requests in advance, and further -+ * reorder internally-queued requests. -+ * -+ * For this reason, we force to false the value of -+ * idling_boosts_thr_without_issues if there are weight-raised -+ * busy queues. In this case, and if bfqq is not weight-raised, -+ * this guarantees that the device is not idled for bfqq (if, -+ * instead, bfqq is weight-raised, then idling will be -+ * guaranteed by another variable, see below). 
Combined with -+ * the timestamping rules of BFQ (see [1] for details), this -+ * behavior causes bfqq, and hence any sync non-weight-raised -+ * queue, to get a lower number of requests served, and thus -+ * to ask for a lower number of requests from the request -+ * pool, before the busy weight-raised queues get served -+ * again. This often mitigates starvation problems in the -+ * presence of heavy write workloads and NCQ, thereby -+ * guaranteeing a higher application and system responsiveness -+ * in these hostile scenarios. -+ */ -+ idling_boosts_thr_without_issues = idling_boosts_thr && -+ bfqd->wr_busy_queues == 0; -+ -+ /* -+ * There are then two cases where idling must be performed not -+ * for throughput concerns, but to preserve service -+ * guarantees. In the description of these cases, we say, for -+ * short, that a queue is sequential/random if the process -+ * associated to the queue issues sequential/random requests -+ * (in the second case the queue may be tagged as seeky or -+ * even constantly_seeky). -+ * -+ * To introduce the first case, we note that, since -+ * bfq_bfqq_idle_window(bfqq) is false if the device is -+ * NCQ-capable and bfqq is random (see -+ * bfq_update_idle_window()), then, from the above two -+ * assignments it follows that -+ * idling_boosts_thr_without_issues is false if the device is -+ * NCQ-capable and bfqq is random. Therefore, for this case, -+ * device idling would never be allowed if we used just -+ * idling_boosts_thr_without_issues to decide whether to allow -+ * it. And, beneficially, this would imply that throughput -+ * would always be boosted also with random I/O on NCQ-capable -+ * HDDs. -+ * -+ * But we must be careful on this point, to avoid an unfair -+ * treatment for bfqq. In fact, because of the same above -+ * assignments, idling_boosts_thr_without_issues is, on the -+ * other hand, true if 1) the device is an HDD and bfqq is -+ * sequential, and 2) there are no busy weight-raised -+ * queues. 
As a consequence, if we used just -+ * idling_boosts_thr_without_issues to decide whether to idle -+ * the device, then with an HDD we might easily bump into a -+ * scenario where queues that are sequential and I/O-bound -+ * would enjoy idling, whereas random queues would not. The -+ * latter might then get a low share of the device throughput, -+ * simply because the former would get many requests served -+ * after being set as in service, while the latter would not. -+ * -+ * To address this issue, we start by setting to true a -+ * sentinel variable, on_hdd_and_not_all_queues_seeky, if the -+ * device is rotational and not all queues with pending or -+ * in-flight requests are constantly seeky (i.e., there are -+ * active sequential queues, and bfqq might then be mistreated -+ * if it does not enjoy idling because it is random). -+ */ -+ all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && -+ bfqd->busy_in_flight_queues == -+ bfqd->const_seeky_busy_in_flight_queues; -+ -+ on_hdd_and_not_all_queues_seeky = -+ !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; -+ -+ /* -+ * To introduce the second case where idling needs to be -+ * performed to preserve service guarantees, we can note that -+ * allowing the drive to enqueue more than one request at a -+ * time, and hence delegating de facto final scheduling -+ * decisions to the drive's internal scheduler, causes loss of -+ * control on the actual request service order. In particular, -+ * the critical situation is when requests from different -+ * processes happens to be present, at the same time, in the -+ * internal queue(s) of the drive. In such a situation, the -+ * drive, by deciding the service order of the -+ * internally-queued requests, does determine also the actual -+ * throughput distribution among these processes. But the -+ * drive typically has no notion or concern about per-process -+ * throughput distribution, and makes its decisions only on a -+ * per-request basis. 
Therefore, the service distribution -+ * enforced by the drive's internal scheduler is likely to -+ * coincide with the desired device-throughput distribution -+ * only in a completely symmetric scenario where: -+ * (i) each of these processes must get the same throughput as -+ * the others; -+ * (ii) all these processes have the same I/O pattern -+ * (either sequential or random). -+ * In fact, in such a scenario, the drive will tend to treat -+ * the requests of each of these processes in about the same -+ * way as the requests of the others, and thus to provide -+ * each of these processes with about the same throughput -+ * (which is exactly the desired throughput distribution). In -+ * contrast, in any asymmetric scenario, device idling is -+ * certainly needed to guarantee that bfqq receives its -+ * assigned fraction of the device throughput (see [1] for -+ * details). -+ * -+ * We address this issue by controlling, actually, only the -+ * symmetry sub-condition (i), i.e., provided that -+ * sub-condition (i) holds, idling is not performed, -+ * regardless of whether sub-condition (ii) holds. In other -+ * words, only if sub-condition (i) holds, then idling is -+ * allowed, and the device tends to be prevented from queueing -+ * many requests, possibly of several processes. The reason -+ * for not controlling also sub-condition (ii) is that, first, -+ * in the case of an HDD, the asymmetry in terms of types of -+ * I/O patterns is already taken in to account in the above -+ * sentinel variable -+ * on_hdd_and_not_all_queues_seeky. 
Secondly, in the case of a -+ * flash-based device, we prefer however to privilege -+ * throughput (and idling lowers throughput for this type of -+ * devices), for the following reasons: -+ * 1) differently from HDDs, the service time of random -+ * requests is not orders of magnitudes lower than the service -+ * time of sequential requests; thus, even if processes doing -+ * sequential I/O get a preferential treatment with respect to -+ * others doing random I/O, the consequences are not as -+ * dramatic as with HDDs; -+ * 2) if a process doing random I/O does need strong -+ * throughput guarantees, it is hopefully already being -+ * weight-raised, or the user is likely to have assigned it a -+ * higher weight than the other processes (and thus -+ * sub-condition (i) is likely to be false, which triggers -+ * idling). -+ * -+ * According to the above considerations, the next variable is -+ * true (only) if sub-condition (i) holds. To compute the -+ * value of this variable, we not only use the return value of -+ * the function bfq_symmetric_scenario(), but also check -+ * whether bfqq is being weight-raised, because -+ * bfq_symmetric_scenario() does not take into account also -+ * weight-raised queues (see comments to -+ * bfq_weights_tree_add()). -+ * -+ * As a side note, it is worth considering that the above -+ * device-idling countermeasures may however fail in the -+ * following unlucky scenario: if idling is (correctly) -+ * disabled in a time period during which all symmetry -+ * sub-conditions hold, and hence the device is allowed to -+ * enqueue many requests, but at some later point in time some -+ * sub-condition stops to hold, then it may become impossible -+ * to let requests be served in the desired order until all -+ * the requests already queued in the device have been served. 
-+ */ -+ asymmetric_scenario = bfqq->wr_coeff > 1 || -+ !bfq_symmetric_scenario(bfqd); -+ -+ /* -+ * Finally, there is a case where maximizing throughput is the -+ * best choice even if it may cause unfairness toward -+ * bfqq. Such a case is when bfqq became active in a burst of -+ * queue activations. Queues that became active during a large -+ * burst benefit only from throughput, as discussed in the -+ * comments to bfq_handle_burst. Thus, if bfqq became active -+ * in a burst and not idling the device maximizes throughput, -+ * then the device must no be idled, because not idling the -+ * device provides bfqq and all other queues in the burst with -+ * maximum benefit. Combining this and the two cases above, we -+ * can now establish when idling is actually needed to -+ * preserve service guarantees. -+ */ -+ idling_needed_for_service_guarantees = -+ (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && -+ !bfq_bfqq_in_large_burst(bfqq); -+ -+ /* -+ * We have now all the components we need to compute the return -+ * value of the function, which is true only if both the following -+ * conditions hold: -+ * 1) bfqq is sync, because idling make sense only for sync queues; -+ * 2) idling either boosts the throughput (without issues), or -+ * is necessary to preserve service guarantees. -+ */ -+ return bfq_bfqq_sync(bfqq) && -+ (idling_boosts_thr_without_issues || -+ idling_needed_for_service_guarantees); -+} -+ -+/* -+ * If the in-service queue is empty but the function bfq_bfqq_may_idle -+ * returns true, then: -+ * 1) the queue must remain in service and cannot be expired, and -+ * 2) the device must be idled to wait for the possible arrival of a new -+ * request for the queue. -+ * See the comments to the function bfq_bfqq_may_idle for the reasons -+ * why performing device idling is the best choice to boost the throughput -+ * and preserve service guarantees when bfq_bfqq_may_idle itself -+ * returns true. 
-+ */ -+static bool bfq_bfqq_must_idle(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ return RB_EMPTY_ROOT(&bfqq->sort_list) && bfqd->bfq_slice_idle != 0 && -+ bfq_bfqq_may_idle(bfqq); -+} -+ -+/* -+ * Select a queue for service. If we have a current queue in service, -+ * check whether to continue servicing it, or retrieve and set a new one. -+ */ -+static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq; -+ struct request *next_rq; -+ enum bfqq_expiration reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ -+ bfqq = bfqd->in_service_queue; -+ if (!bfqq) -+ goto new_queue; -+ -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); -+ -+ if (bfq_may_expire_for_budg_timeout(bfqq) && -+ !timer_pending(&bfqd->idle_slice_timer) && -+ !bfq_bfqq_must_idle(bfqq)) -+ goto expire; -+ -+ next_rq = bfqq->next_rq; -+ /* -+ * If bfqq has requests queued and it has enough budget left to -+ * serve them, keep the queue, otherwise expire it. -+ */ -+ if (next_rq) { -+ if (bfq_serv_to_charge(next_rq, bfqq) > -+ bfq_bfqq_budget_left(bfqq)) { -+ reason = BFQ_BFQQ_BUDGET_EXHAUSTED; -+ goto expire; -+ } else { -+ /* -+ * The idle timer may be pending because we may -+ * not disable disk idling even when a new request -+ * arrives. -+ */ -+ if (timer_pending(&bfqd->idle_slice_timer)) { -+ /* -+ * If we get here: 1) at least a new request -+ * has arrived but we have not disabled the -+ * timer because the request was too small, -+ * 2) then the block layer has unplugged -+ * the device, causing the dispatch to be -+ * invoked. -+ * -+ * Since the device is unplugged, now the -+ * requests are probably large enough to -+ * provide a reasonable throughput. -+ * So we disable idling. -+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ del_timer(&bfqd->idle_slice_timer); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+#endif -+ } -+ goto keep_queue; -+ } -+ } -+ -+ /* -+ * No requests pending. 
However, if the in-service queue is idling -+ * for a new request, or has requests waiting for a completion and -+ * may idle after their completion, then keep it anyway. -+ */ -+ if (timer_pending(&bfqd->idle_slice_timer) || -+ (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { -+ bfqq = NULL; -+ goto keep_queue; -+ } -+ -+ reason = BFQ_BFQQ_NO_MORE_REQUESTS; -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, reason); -+new_queue: -+ bfqq = bfq_set_in_service_queue(bfqd); -+ bfq_log(bfqd, "select_queue: new queue %d returned", -+ bfqq ? bfqq->pid : 0); -+keep_queue: -+ return bfqq; -+} -+ -+static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ -+ BUG_ON(bfqq != bfqd->in_service_queue && entity->weight != -+ entity->orig_weight * bfqq->wr_coeff); -+ if (entity->prio_changed) -+ bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); -+ -+ /* -+ * If the queue was activated in a burst, or -+ * too much time has elapsed from the beginning -+ * of this weight-raising period, then end weight -+ * raising. 
-+ */ -+ if (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ bfqq->last_wr_start_finish = jiffies; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_bfqq_end_wr(bfqq); -+ } -+ } -+ /* Update weight both if it must be raised and if it must be lowered */ -+ if ((entity->weight > entity->orig_weight) != (bfqq->wr_coeff > 1)) -+ __bfq_entity_update_weight_prio( -+ bfq_entity_service_tree(entity), -+ entity); -+} -+ -+/* -+ * Dispatch one request from bfqq, moving it to the request queue -+ * dispatch list. -+ */ -+static int bfq_dispatch_request(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ struct request *rq; -+ unsigned long service_to_charge; -+ -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ /* Follow expired path, else get first next available. */ -+ rq = bfq_check_fifo(bfqq); -+ if (!rq) -+ rq = bfqq->next_rq; -+ service_to_charge = bfq_serv_to_charge(rq, bfqq); -+ -+ if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * This may happen if the next rq is chosen in fifo order -+ * instead of sector order. The budget is properly -+ * dimensioned to be always sufficient to serve the next -+ * request only if it is chosen in sector order. The reason -+ * is that it would be quite inefficient and little useful -+ * to always make sure that the budget is large enough to -+ * serve even the possible next rq in fifo order. -+ * In fact, requests are seldom served in fifo order. -+ * -+ * Expire the queue for budget exhaustion, and make sure -+ * that the next act_budget is enough to serve the next -+ * request, even if it comes from the fifo expired path. 
-+ */ -+ bfqq->next_rq = rq; -+ /* -+ * Since this dispatch is failed, make sure that -+ * a new one will be performed -+ */ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ goto expire; -+ } -+ -+ /* Finally, insert request into driver dispatch list. */ -+ bfq_bfqq_served(bfqq, service_to_charge); -+ bfq_dispatch_insert(bfqd->queue, rq); -+ -+ bfq_update_wr_data(bfqd, bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "dispatched %u sec req (%llu), budg left %d", -+ blk_rq_sectors(rq), -+ (unsigned long long) blk_rq_pos(rq), -+ bfq_bfqq_budget_left(bfqq)); -+ -+ dispatched++; -+ -+ if (!bfqd->in_service_bic) { -+ atomic_long_inc(&RQ_BIC(rq)->icq.ioc->refcount); -+ bfqd->in_service_bic = RQ_BIC(rq); -+ } -+ -+ if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && -+ dispatched >= bfqd->bfq_max_budget_async_rq) || -+ bfq_class_idle(bfqq))) -+ goto expire; -+ -+ return dispatched; -+ -+expire: -+ bfq_bfqq_expire(bfqd, bfqq, false, BFQ_BFQQ_BUDGET_EXHAUSTED); -+ return dispatched; -+} -+ -+static int __bfq_forced_dispatch_bfqq(struct bfq_queue *bfqq) -+{ -+ int dispatched = 0; -+ -+ while (bfqq->next_rq) { -+ bfq_dispatch_insert(bfqq->bfqd->queue, bfqq->next_rq); -+ dispatched++; -+ } -+ -+ BUG_ON(!list_empty(&bfqq->fifo)); -+ return dispatched; -+} -+ -+/* -+ * Drain our current requests. -+ * Used for barriers and when switching io schedulers on-the-fly. -+ */ -+static int bfq_forced_dispatch(struct bfq_data *bfqd) -+{ -+ struct bfq_queue *bfqq, *n; -+ struct bfq_service_tree *st; -+ int dispatched = 0; -+ -+ bfqq = bfqd->in_service_queue; -+ if (bfqq) -+ __bfq_bfqq_expire(bfqd, bfqq); -+ -+ /* -+ * Loop through classes, and be careful to leave the scheduler -+ * in a consistent state, as feedback mechanisms and vtime -+ * updates cannot be disabled during the process. 
-+ */ -+ list_for_each_entry_safe(bfqq, n, &bfqd->active_list, bfqq_list) { -+ st = bfq_entity_service_tree(&bfqq->entity); -+ -+ dispatched += __bfq_forced_dispatch_bfqq(bfqq); -+ bfqq->max_budget = bfq_max_budget(bfqd); -+ -+ bfq_forget_idle(st); -+ } -+ -+ BUG_ON(bfqd->busy_queues != 0); -+ -+ return dispatched; -+} -+ -+static int bfq_dispatch_requests(struct request_queue *q, int force) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq; -+ int max_dispatch; -+ -+ bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ if (bfqd->busy_queues == 0) -+ return 0; -+ -+ if (unlikely(force)) -+ return bfq_forced_dispatch(bfqd); -+ -+ bfqq = bfq_select_queue(bfqd); -+ if (!bfqq) -+ return 0; -+ -+ if (bfq_class_idle(bfqq)) -+ max_dispatch = 1; -+ -+ if (!bfq_bfqq_sync(bfqq)) -+ max_dispatch = bfqd->bfq_max_budget_async_rq; -+ -+ if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { -+ if (bfqd->busy_queues > 1) -+ return 0; -+ if (bfqq->dispatched >= 4 * max_dispatch) -+ return 0; -+ } -+ -+ if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) -+ return 0; -+ -+ bfq_clear_bfqq_wait_request(bfqq); -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ -+ if (!bfq_dispatch_request(bfqd, bfqq)) -+ return 0; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", -+ bfq_bfqq_sync(bfqq) ? "sync" : "async"); -+ -+ return 1; -+} -+ -+/* -+ * Task holds one reference to the queue, dropped when task exits. Each rq -+ * in-flight on this queue also holds a reference, dropped when rq is freed. -+ * -+ * Queue lock must be held here. 
-+ */ -+static void bfq_put_queue(struct bfq_queue *bfqq) -+{ -+ struct bfq_data *bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_group *bfqg = bfqq_group(bfqq); -+#endif -+ -+ BUG_ON(atomic_read(&bfqq->ref) <= 0); -+ -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ if (!atomic_dec_and_test(&bfqq->ref)) -+ return; -+ -+ BUG_ON(rb_first(&bfqq->sort_list)); -+ BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); -+ BUG_ON(bfqq->entity.tree); -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqd->in_service_queue == bfqq); -+ -+ if (bfq_bfqq_sync(bfqq)) -+ /* -+ * The fact that this queue is being destroyed does not -+ * invalidate the fact that this queue may have been -+ * activated during the current burst. As a consequence, -+ * although the queue does not exist anymore, and hence -+ * needs to be removed from the burst list if there, -+ * the burst size has not to be decremented. -+ */ -+ hlist_del_init(&bfqq->burst_list_node); -+ -+ bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); -+ -+ kmem_cache_free(bfq_pool, bfqq); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_put(bfqg); -+#endif -+} -+ -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ if (bfqq == bfqd->in_service_queue) { -+ __bfq_bfqq_expire(bfqd, bfqq); -+ bfq_schedule_dispatch(bfqd); -+ } -+ -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ -+ bfq_put_queue(bfqq); -+} -+ -+static void bfq_init_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ -+ bic->ttime.last_end_request = jiffies; -+} -+ -+static void bfq_exit_icq(struct io_cq *icq) -+{ -+ struct bfq_io_cq *bic = icq_to_bic(icq); -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ -+ if (bic->bfqq[BLK_RW_ASYNC]) { -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); -+ bic->bfqq[BLK_RW_ASYNC] = NULL; -+ } -+ -+ if (bic->bfqq[BLK_RW_SYNC]) { -+ bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); -+ 
bic->bfqq[BLK_RW_SYNC] = NULL; -+ } -+} -+ -+/* -+ * Update the entity prio values; note that the new values will not -+ * be used until the next (re)activation. -+ */ -+static void -+bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -+{ -+ struct task_struct *tsk = current; -+ int ioprio_class; -+ -+ ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ switch (ioprio_class) { -+ default: -+ dev_err(bfqq->bfqd->queue->backing_dev_info.dev, -+ "bfq: bad prio class %d\n", ioprio_class); -+ case IOPRIO_CLASS_NONE: -+ /* -+ * No prio set, inherit CPU scheduling settings. -+ */ -+ bfqq->new_ioprio = task_nice_ioprio(tsk); -+ bfqq->new_ioprio_class = task_nice_ioclass(tsk); -+ break; -+ case IOPRIO_CLASS_RT: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_RT; -+ break; -+ case IOPRIO_CLASS_BE: -+ bfqq->new_ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ bfqq->new_ioprio_class = IOPRIO_CLASS_BE; -+ break; -+ case IOPRIO_CLASS_IDLE: -+ bfqq->new_ioprio_class = IOPRIO_CLASS_IDLE; -+ bfqq->new_ioprio = 7; -+ bfq_clear_bfqq_idle_window(bfqq); -+ break; -+ } -+ -+ if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", -+ bfqq->new_ioprio); -+ BUG(); -+ } -+ -+ bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); -+ bfqq->entity.prio_changed = 1; -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) -+{ -+ struct bfq_data *bfqd; -+ struct bfq_queue *bfqq, *new_bfqq; -+ unsigned long uninitialized_var(flags); -+ int ioprio = bic->icq.ioc->ioprio; -+ -+ bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), -+ &flags); -+ /* -+ * This condition may trigger on a newly created bic, be sure to -+ * drop the lock before returning. 
-+ */ -+ if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -+ goto out; -+ -+ bic->ioprio = ioprio; -+ -+ bfqq = bic->bfqq[BLK_RW_ASYNC]; -+ if (bfqq) { -+ new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, -+ GFP_ATOMIC); -+ if (new_bfqq) { -+ bic->bfqq[BLK_RW_ASYNC] = new_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ } -+ } -+ -+ bfqq = bic->bfqq[BLK_RW_SYNC]; -+ if (bfqq) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+out: -+ bfq_put_bfqd_unlock(bfqd, &flags); -+} -+ -+static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic, pid_t pid, int is_sync) -+{ -+ RB_CLEAR_NODE(&bfqq->entity.rb_node); -+ INIT_LIST_HEAD(&bfqq->fifo); -+ INIT_HLIST_NODE(&bfqq->burst_list_node); -+ -+ atomic_set(&bfqq->ref, 0); -+ bfqq->bfqd = bfqd; -+ -+ if (bic) -+ bfq_set_next_ioprio_data(bfqq, bic); -+ -+ if (is_sync) { -+ if (!bfq_class_idle(bfqq)) -+ bfq_mark_bfqq_idle_window(bfqq); -+ bfq_mark_bfqq_sync(bfqq); -+ } else -+ bfq_clear_bfqq_sync(bfqq); -+ bfq_mark_bfqq_IO_bound(bfqq); -+ -+ /* Tentative initial value to trade off between thr and lat */ -+ bfqq->max_budget = (2 * bfq_max_budget(bfqd)) / 3; -+ bfqq->pid = pid; -+ -+ bfqq->wr_coeff = 1; -+ bfqq->last_wr_start_finish = 0; -+ /* -+ * Set to the value for which bfqq will not be deemed as -+ * soft rt when it becomes backlogged. 
-+ */ -+ bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); -+} -+ -+static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, -+ struct bio *bio, int is_sync, -+ struct bfq_io_cq *bic, -+ gfp_t gfp_mask) -+{ -+ struct bfq_group *bfqg; -+ struct bfq_queue *bfqq, *new_bfqq = NULL; -+ struct blkcg *blkcg; -+ -+retry: -+ rcu_read_lock(); -+ -+ blkcg = bio_blkcg(bio); -+ bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ /* bic always exists here */ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ -+ /* -+ * Always try a new alloc if we fall back to the OOM bfqq -+ * originally, since it should just be a temporary situation. -+ */ -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ bfqq = NULL; -+ if (new_bfqq) { -+ bfqq = new_bfqq; -+ new_bfqq = NULL; -+ } else if (gfpflags_allow_blocking(gfp_mask)) { -+ rcu_read_unlock(); -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ new_bfqq = kmem_cache_alloc_node(bfq_pool, -+ gfp_mask | __GFP_ZERO, -+ bfqd->queue->node); -+ spin_lock_irq(bfqd->queue->queue_lock); -+ if (new_bfqq) -+ goto retry; -+ } else { -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ gfp_mask | __GFP_ZERO, -+ bfqd->queue->node); -+ } -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ } -+ } -+ -+ if (new_bfqq) -+ kmem_cache_free(bfq_pool, new_bfqq); -+ -+ rcu_read_unlock(); -+ -+ return bfqq; -+} -+ -+static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -+ struct bfq_group *bfqg, -+ int ioprio_class, int ioprio) -+{ -+ switch (ioprio_class) { -+ case IOPRIO_CLASS_RT: -+ return &bfqg->async_bfqq[0][ioprio]; -+ case IOPRIO_CLASS_NONE: -+ ioprio = IOPRIO_NORM; -+ /* fall through */ -+ case IOPRIO_CLASS_BE: -+ return &bfqg->async_bfqq[1][ioprio]; -+ case IOPRIO_CLASS_IDLE: -+ return &bfqg->async_idle_bfqq; -+ default: -+ BUG(); -+ } -+} -+ -+static struct 
bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, int is_sync, -+ struct bfq_io_cq *bic, gfp_t gfp_mask) -+{ -+ const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); -+ const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); -+ struct bfq_queue **async_bfqq = NULL; -+ struct bfq_queue *bfqq = NULL; -+ -+ if (!is_sync) { -+ struct blkcg *blkcg; -+ struct bfq_group *bfqg; -+ -+ rcu_read_lock(); -+ blkcg = bio_blkcg(bio); -+ rcu_read_unlock(); -+ bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, -+ ioprio); -+ bfqq = *async_bfqq; -+ } -+ -+ if (!bfqq) -+ bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ -+ /* -+ * Pin the queue now that it's allocated, scheduler exit will -+ * prune it. -+ */ -+ if (!is_sync && !(*async_bfqq)) { -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ *async_bfqq = bfqq; -+ } -+ -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ return bfqq; -+} -+ -+static void bfq_update_io_thinktime(struct bfq_data *bfqd, -+ struct bfq_io_cq *bic) -+{ -+ unsigned long elapsed = jiffies - bic->ttime.last_end_request; -+ unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); -+ -+ bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; -+ bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / -+ bic->ttime.ttime_samples; -+} -+ -+static void bfq_update_io_seektime(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ sector_t sdist; -+ u64 total; -+ -+ if (bfqq->last_request_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - bfqq->last_request_pos; -+ else -+ sdist = bfqq->last_request_pos - blk_rq_pos(rq); -+ -+ /* -+ * Don't allow the seek distance to get too large from the -+ * odd fragment, pagein, 
etc. -+ */ -+ if (bfqq->seek_samples == 0) /* first request, not really a seek */ -+ sdist = 0; -+ else if (bfqq->seek_samples <= 60) /* second & third seek */ -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); -+ else -+ sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); -+ -+ bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; -+ bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; -+ total = bfqq->seek_total + (bfqq->seek_samples/2); -+ do_div(total, bfqq->seek_samples); -+ bfqq->seek_mean = (sector_t)total; -+ -+ bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, -+ (u64)bfqq->seek_mean); -+} -+ -+/* -+ * Disable idle window if the process thinks too long or seeks so much that -+ * it doesn't matter. -+ */ -+static void bfq_update_idle_window(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) -+{ -+ int enable_idle; -+ -+ /* Don't idle for async or idle io prio class. */ -+ if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) -+ return; -+ -+ enable_idle = bfq_bfqq_idle_window(bfqq); -+ -+ if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -+ bfqd->bfq_slice_idle == 0 || -+ (bfqd->hw_tag && BFQQ_SEEKY(bfqq) && -+ bfqq->wr_coeff == 1)) -+ enable_idle = 0; -+ else if (bfq_sample_valid(bic->ttime.ttime_samples)) { -+ if (bic->ttime.ttime_mean > bfqd->bfq_slice_idle && -+ bfqq->wr_coeff == 1) -+ enable_idle = 0; -+ else -+ enable_idle = 1; -+ } -+ bfq_log_bfqq(bfqd, bfqq, "update_idle_window: enable_idle %d", -+ enable_idle); -+ -+ if (enable_idle) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+} -+ -+/* -+ * Called when a new fs request (rq) is added to bfqq. Check if there's -+ * something we should do about it. 
-+ */ -+static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) -+{ -+ struct bfq_io_cq *bic = RQ_BIC(rq); -+ -+ if (rq->cmd_flags & REQ_META) -+ bfqq->meta_pending++; -+ -+ bfq_update_io_thinktime(bfqd, bic); -+ bfq_update_io_seektime(bfqd, bfqq, rq); -+ if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { -+ bfq_clear_bfqq_constantly_seeky(bfqq); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); -+ bfqd->const_seeky_busy_in_flight_queues--; -+ } -+ } -+ if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || -+ !BFQQ_SEEKY(bfqq)) -+ bfq_update_idle_window(bfqd, bfqq, bic); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -+ bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), -+ (unsigned long long) bfqq->seek_mean); -+ -+ bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ -+ if (bfqq == bfqd->in_service_queue && bfq_bfqq_wait_request(bfqq)) { -+ bool small_req = bfqq->queued[rq_is_sync(rq)] == 1 && -+ blk_rq_sectors(rq) < 32; -+ bool budget_timeout = bfq_bfqq_budget_timeout(bfqq); -+ -+ /* -+ * There is just this request queued: if the request -+ * is small and the queue is not to be expired, then -+ * just exit. -+ * -+ * In this way, if the disk is being idled to wait for -+ * a new request from the in-service queue, we avoid -+ * unplugging the device and committing the disk to serve -+ * just a small request. On the contrary, we wait for -+ * the block layer to decide when to unplug the device: -+ * hopefully, new requests will be merged to this one -+ * quickly, then the device will be unplugged and -+ * larger requests will be dispatched. -+ */ -+ if (small_req && !budget_timeout) -+ return; -+ -+ /* -+ * A large enough request arrived, or the queue is to -+ * be expired: in both cases disk idling is to be -+ * stopped, so clear wait_request flag and reset -+ * timer. 
-+ */ -+ bfq_clear_bfqq_wait_request(bfqq); -+ del_timer(&bfqd->idle_slice_timer); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_idle_time(bfqq_group(bfqq)); -+#endif -+ -+ /* -+ * The queue is not empty, because a new request just -+ * arrived. Hence we can safely expire the queue, in -+ * case of budget timeout, without risking that the -+ * timestamps of the queue are not updated correctly. -+ * See [1] for more details. -+ */ -+ if (budget_timeout) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ -+ /* -+ * Let the request rip immediately, or let a new queue be -+ * selected if bfqq has just been expired. -+ */ -+ __blk_run_queue(bfqd->queue); -+ } -+} -+ -+static void bfq_insert_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ assert_spin_locked(bfqd->queue->queue_lock); -+ -+ bfq_add_request(rq); -+ -+ rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ list_add_tail(&rq->queuelist, &bfqq->fifo); -+ -+ bfq_rq_enqueued(bfqd, bfqq, rq); -+} -+ -+static void bfq_update_hw_tag(struct bfq_data *bfqd) -+{ -+ bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); -+ -+ if (bfqd->hw_tag == 1) -+ return; -+ -+ /* -+ * This sample is valid if the number of outstanding requests -+ * is large enough to allow a queueing behavior. Note that the -+ * sum is not exact, as it's not taking into account deactivated -+ * requests. 
-+ */ -+ if (bfqd->rq_in_driver + bfqd->queued < BFQ_HW_QUEUE_THRESHOLD) -+ return; -+ -+ if (bfqd->hw_tag_samples++ < BFQ_HW_QUEUE_SAMPLES) -+ return; -+ -+ bfqd->hw_tag = bfqd->max_rq_in_driver > BFQ_HW_QUEUE_THRESHOLD; -+ bfqd->max_rq_in_driver = 0; -+ bfqd->hw_tag_samples = 0; -+} -+ -+static void bfq_completed_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_data *bfqd = bfqq->bfqd; -+ bool sync = bfq_bfqq_sync(bfqq); -+ -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", -+ blk_rq_sectors(rq), sync); -+ -+ bfq_update_hw_tag(bfqd); -+ -+ BUG_ON(!bfqd->rq_in_driver); -+ BUG_ON(!bfqq->dispatched); -+ bfqd->rq_in_driver--; -+ bfqq->dispatched--; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_completion(bfqq_group(bfqq), -+ rq_start_time_ns(rq), -+ rq_io_start_time_ns(rq), rq->cmd_flags); -+#endif -+ -+ if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ BUG_ON(!bfqd->busy_in_flight_queues); -+ bfqd->busy_in_flight_queues--; -+ if (bfq_bfqq_constantly_seeky(bfqq)) { -+ BUG_ON(!bfqd-> -+ const_seeky_busy_in_flight_queues); -+ bfqd->const_seeky_busy_in_flight_queues--; -+ } -+ } -+ } -+ -+ if (sync) { -+ bfqd->sync_flight--; -+ RQ_BIC(rq)->ttime.last_end_request = jiffies; -+ } -+ -+ /* -+ * If we are waiting to discover whether the request pattern of the -+ * task associated with the queue is actually isochronous, and -+ * both requisites for this condition to hold are satisfied, then -+ * compute soft_rt_next_start (see the comments to the function -+ * bfq_bfqq_softrt_next_start()). 
-+ */ -+ if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && -+ RB_EMPTY_ROOT(&bfqq->sort_list)) -+ bfqq->soft_rt_next_start = -+ bfq_bfqq_softrt_next_start(bfqd, bfqq); -+ -+ /* -+ * If this is the in-service queue, check if it needs to be expired, -+ * or if we want to idle in case it has no pending requests. -+ */ -+ if (bfqd->in_service_queue == bfqq) { -+ if (bfq_bfqq_budget_new(bfqq)) -+ bfq_set_budget_timeout(bfqd); -+ -+ if (bfq_bfqq_must_idle(bfqq)) { -+ bfq_arm_slice_timer(bfqd); -+ goto out; -+ } else if (bfq_may_expire_for_budg_timeout(bfqq)) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_BUDGET_TIMEOUT); -+ else if (RB_EMPTY_ROOT(&bfqq->sort_list) && -+ (bfqq->dispatched == 0 || -+ !bfq_bfqq_may_idle(bfqq))) -+ bfq_bfqq_expire(bfqd, bfqq, false, -+ BFQ_BFQQ_NO_MORE_REQUESTS); -+ } -+ -+ if (!bfqd->rq_in_driver) -+ bfq_schedule_dispatch(bfqd); -+ -+out: -+ return; -+} -+ -+static int __bfq_may_queue(struct bfq_queue *bfqq) -+{ -+ if (bfq_bfqq_wait_request(bfqq) && bfq_bfqq_must_alloc(bfqq)) { -+ bfq_clear_bfqq_must_alloc(bfqq); -+ return ELV_MQUEUE_MUST; -+ } -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+static int bfq_may_queue(struct request_queue *q, int rw) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct task_struct *tsk = current; -+ struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq; -+ -+ /* -+ * Don't force setup of a queue from here, as a call to may_queue -+ * does not necessarily imply that a request actually will be -+ * queued. So just lookup a possibly existing queue, or return -+ * 'may queue' if that fails. -+ */ -+ bic = bfq_bic_lookup(bfqd, tsk->io_context); -+ if (!bic) -+ return ELV_MQUEUE_MAY; -+ -+ bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); -+ if (bfqq) -+ return __bfq_may_queue(bfqq); -+ -+ return ELV_MQUEUE_MAY; -+} -+ -+/* -+ * Queue lock held here. 
-+ */ -+static void bfq_put_request(struct request *rq) -+{ -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ -+ if (bfqq) { -+ const int rw = rq_data_dir(rq); -+ -+ BUG_ON(!bfqq->allocated[rw]); -+ bfqq->allocated[rw]--; -+ -+ rq->elv.priv[0] = NULL; -+ rq->elv.priv[1] = NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/* -+ * Allocate bfq data structures associated with this request. -+ */ -+static int bfq_set_request(struct request_queue *q, struct request *rq, -+ struct bio *bio, gfp_t gfp_mask) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ struct bfq_io_cq *bic = icq_to_bic(rq->elv.icq); -+ const int rw = rq_data_dir(rq); -+ const int is_sync = rq_is_sync(rq); -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ -+ might_sleep_if(gfpflags_allow_blocking(gfp_mask)); -+ -+ bfq_check_ioprio_change(bic, bio); -+ -+ spin_lock_irqsave(q->queue_lock, flags); -+ -+ if (!bic) -+ goto queue_fail; -+ -+ bfq_bic_update_cgroup(bic, bio); -+ -+ bfqq = bic_to_bfqq(bic, is_sync); -+ if (!bfqq || bfqq == &bfqd->oom_bfqq) { -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ bic_set_bfqq(bic, bfqq, is_sync); -+ if (is_sync) { -+ if (bfqd->large_burst) -+ bfq_mark_bfqq_in_large_burst(bfqq); -+ else -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ } -+ -+ bfqq->allocated[rw]++; -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, -+ atomic_read(&bfqq->ref)); -+ -+ rq->elv.priv[0] = bic; -+ rq->elv.priv[1] = bfqq; -+ -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 0; -+ -+queue_fail: -+ bfq_schedule_dispatch(bfqd); -+ spin_unlock_irqrestore(q->queue_lock, flags); -+ -+ return 1; -+} -+ -+static void bfq_kick_queue(struct work_struct *work) -+{ -+ struct bfq_data *bfqd = -+ container_of(work, struct bfq_data, unplug_work); -+ struct request_queue *q = bfqd->queue; -+ -+ spin_lock_irq(q->queue_lock); -+ __blk_run_queue(q); -+ 
spin_unlock_irq(q->queue_lock); -+} -+ -+/* -+ * Handler of the expiration of the timer running if the in-service queue -+ * is idling inside its time slice. -+ */ -+static void bfq_idle_slice_timer(unsigned long data) -+{ -+ struct bfq_data *bfqd = (struct bfq_data *)data; -+ struct bfq_queue *bfqq; -+ unsigned long flags; -+ enum bfqq_expiration reason; -+ -+ spin_lock_irqsave(bfqd->queue->queue_lock, flags); -+ -+ bfqq = bfqd->in_service_queue; -+ /* -+ * Theoretical race here: the in-service queue can be NULL or -+ * different from the queue that was idling if the timer handler -+ * spins on the queue_lock and a new request arrives for the -+ * current queue and there is a full dispatch cycle that changes -+ * the in-service queue. This can hardly happen, but in the worst -+ * case we just expire a queue too early. -+ */ -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ if (bfq_bfqq_budget_timeout(bfqq)) -+ /* -+ * Also here the queue can be safely expired -+ * for budget timeout without wasting -+ * guarantees -+ */ -+ reason = BFQ_BFQQ_BUDGET_TIMEOUT; -+ else if (bfqq->queued[0] == 0 && bfqq->queued[1] == 0) -+ /* -+ * The queue may not be empty upon timer expiration, -+ * because we may not disable the timer when the -+ * first request of the in-service queue arrives -+ * during disk idling. 
-+ */ -+ reason = BFQ_BFQQ_TOO_IDLE; -+ else -+ goto schedule_dispatch; -+ -+ bfq_bfqq_expire(bfqd, bfqq, true, reason); -+ } -+ -+schedule_dispatch: -+ bfq_schedule_dispatch(bfqd); -+ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+} -+ -+static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) -+{ -+ del_timer_sync(&bfqd->idle_slice_timer); -+ cancel_work_sync(&bfqd->unplug_work); -+} -+ -+static void __bfq_put_async_bfqq(struct bfq_data *bfqd, -+ struct bfq_queue **bfqq_ptr) -+{ -+ struct bfq_group *root_group = bfqd->root_group; -+ struct bfq_queue *bfqq = *bfqq_ptr; -+ -+ bfq_log(bfqd, "put_async_bfqq: %p", bfqq); -+ if (bfqq) { -+ bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); -+ bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ *bfqq_ptr = NULL; -+ } -+} -+ -+/* -+ * Release all the bfqg references to its async queues. If we are -+ * deallocating the group these queues may still contain requests, so -+ * we reparent them to the root cgroup (i.e., the only one that will -+ * exist for sure until all the requests on a device are gone). 
-+ */ -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg) -+{ -+ int i, j; -+ -+ for (i = 0; i < 2; i++) -+ for (j = 0; j < IOPRIO_BE_NR; j++) -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_bfqq[i][j]); -+ -+ __bfq_put_async_bfqq(bfqd, &bfqg->async_idle_bfqq); -+} -+ -+static void bfq_exit_queue(struct elevator_queue *e) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ struct request_queue *q = bfqd->queue; -+ struct bfq_queue *bfqq, *n; -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ spin_lock_irq(q->queue_lock); -+ -+ BUG_ON(bfqd->in_service_queue); -+ list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -+ bfq_deactivate_bfqq(bfqd, bfqq, 0); -+ -+ spin_unlock_irq(q->queue_lock); -+ -+ bfq_shutdown_timer_wq(bfqd); -+ -+ synchronize_rcu(); -+ -+ BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ blkcg_deactivate_policy(q, &blkcg_policy_bfq); -+#else -+ kfree(bfqd->root_group); -+#endif -+ -+ kfree(bfqd); -+} -+ -+static void bfq_init_root_group(struct bfq_group *root_group, -+ struct bfq_data *bfqd) -+{ -+ int i; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ root_group->entity.parent = NULL; -+ root_group->my_entity = NULL; -+ root_group->bfqd = bfqd; -+#endif -+ for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) -+ root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+} -+ -+static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -+{ -+ struct bfq_data *bfqd; -+ struct elevator_queue *eq; -+ -+ eq = elevator_alloc(q, e); -+ if (!eq) -+ return -ENOMEM; -+ -+ bfqd = kzalloc_node(sizeof(*bfqd), GFP_KERNEL, q->node); -+ if (!bfqd) { -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+ } -+ eq->elevator_data = bfqd; -+ -+ /* -+ * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. -+ * Grab a permanent reference to it, so that the normal code flow -+ * will not attempt to free it. 
-+ */ -+ bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -+ atomic_inc(&bfqd->oom_bfqq.ref); -+ bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; -+ bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; -+ bfqd->oom_bfqq.entity.new_weight = -+ bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ /* -+ * Trigger weight initialization, according to ioprio, at the -+ * oom_bfqq's first activation. The oom_bfqq's ioprio and ioprio -+ * class won't be changed any more. -+ */ -+ bfqd->oom_bfqq.entity.prio_changed = 1; -+ -+ bfqd->queue = q; -+ -+ spin_lock_irq(q->queue_lock); -+ q->elevator = eq; -+ spin_unlock_irq(q->queue_lock); -+ -+ bfqd->root_group = bfq_create_group_hierarchy(bfqd, q->node); -+ if (!bfqd->root_group) -+ goto out_free; -+ bfq_init_root_group(bfqd->root_group, bfqd); -+ bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqd->active_numerous_groups = 0; -+#endif -+ -+ init_timer(&bfqd->idle_slice_timer); -+ bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -+ bfqd->idle_slice_timer.data = (unsigned long)bfqd; -+ -+ bfqd->queue_weights_tree = RB_ROOT; -+ bfqd->group_weights_tree = RB_ROOT; -+ -+ INIT_WORK(&bfqd->unplug_work, bfq_kick_queue); -+ -+ INIT_LIST_HEAD(&bfqd->active_list); -+ INIT_LIST_HEAD(&bfqd->idle_list); -+ INIT_HLIST_HEAD(&bfqd->burst_list); -+ -+ bfqd->hw_tag = -1; -+ -+ bfqd->bfq_max_budget = bfq_default_max_budget; -+ -+ bfqd->bfq_fifo_expire[0] = bfq_fifo_expire[0]; -+ bfqd->bfq_fifo_expire[1] = bfq_fifo_expire[1]; -+ bfqd->bfq_back_max = bfq_back_max; -+ bfqd->bfq_back_penalty = bfq_back_penalty; -+ bfqd->bfq_slice_idle = bfq_slice_idle; -+ bfqd->bfq_class_idle_last_service = 0; -+ bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; -+ bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; -+ bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; -+ -+ bfqd->bfq_requests_within_timer = 120; -+ -+ bfqd->bfq_large_burst_thresh = 11; -+ bfqd->bfq_burst_interval = 
msecs_to_jiffies(500); -+ -+ bfqd->low_latency = true; -+ -+ bfqd->bfq_wr_coeff = 20; -+ bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); -+ bfqd->bfq_wr_max_time = 0; -+ bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -+ bfqd->bfq_wr_min_inter_arr_async = msecs_to_jiffies(500); -+ bfqd->bfq_wr_max_softrt_rate = 7000; /* -+ * Approximate rate required -+ * to playback or record a -+ * high-definition compressed -+ * video. -+ */ -+ bfqd->wr_busy_queues = 0; -+ bfqd->busy_in_flight_queues = 0; -+ bfqd->const_seeky_busy_in_flight_queues = 0; -+ -+ /* -+ * Begin by assuming, optimistically, that the device peak rate is -+ * equal to the highest reference rate. -+ */ -+ bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * -+ T_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ -+ return 0; -+ -+out_free: -+ kfree(bfqd); -+ kobject_put(&eq->kobj); -+ return -ENOMEM; -+} -+ -+static void bfq_slab_kill(void) -+{ -+ kmem_cache_destroy(bfq_pool); -+} -+ -+static int __init bfq_slab_setup(void) -+{ -+ bfq_pool = KMEM_CACHE(bfq_queue, 0); -+ if (!bfq_pool) -+ return -ENOMEM; -+ return 0; -+} -+ -+static ssize_t bfq_var_show(unsigned int var, char *page) -+{ -+ return sprintf(page, "%d\n", var); -+} -+ -+static ssize_t bfq_var_store(unsigned long *var, const char *page, -+ size_t count) -+{ -+ unsigned long new_val; -+ int ret = kstrtoul(page, 10, &new_val); -+ -+ if (ret == 0) -+ *var = new_val; -+ -+ return count; -+} -+ -+static ssize_t bfq_wr_max_time_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ -+ return sprintf(page, "%d\n", bfqd->bfq_wr_max_time > 0 ? 
-+ jiffies_to_msecs(bfqd->bfq_wr_max_time) : -+ jiffies_to_msecs(bfq_wr_duration(bfqd))); -+} -+ -+static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) -+{ -+ struct bfq_queue *bfqq; -+ struct bfq_data *bfqd = e->elevator_data; -+ ssize_t num_char = 0; -+ -+ num_char += sprintf(page + num_char, "Tot reqs queued %d\n\n", -+ bfqd->queued); -+ -+ spin_lock_irq(bfqd->queue->queue_lock); -+ -+ num_char += sprintf(page + num_char, "Active:\n"); -+ list_for_each_entry(bfqq, &bfqd->active_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, nr_queued %d %d, ", -+ bfqq->pid, -+ bfqq->entity.weight, -+ bfqq->queued[0], -+ bfqq->queued[1]); -+ num_char += sprintf(page + num_char, -+ "dur %d/%u\n", -+ jiffies_to_msecs( -+ jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ num_char += sprintf(page + num_char, "Idle:\n"); -+ list_for_each_entry(bfqq, &bfqd->idle_list, bfqq_list) { -+ num_char += sprintf(page + num_char, -+ "pid%d: weight %hu, dur %d/%u\n", -+ bfqq->pid, -+ bfqq->entity.weight, -+ jiffies_to_msecs(jiffies - -+ bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ spin_unlock_irq(bfqd->queue->queue_lock); -+ -+ return num_char; -+} -+ -+#define SHOW_FUNCTION(__FUNC, __VAR, __CONV) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned int __data = __VAR; \ -+ if (__CONV) \ -+ __data = jiffies_to_msecs(__data); \ -+ return bfq_var_show(__data, (page)); \ -+} -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -+SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); -+SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -+SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); 
-+SHOW_FUNCTION(bfq_max_budget_async_rq_show, -+ bfqd->bfq_max_budget_async_rq, 0); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); -+SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -+SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); -+SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); -+SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -+SHOW_FUNCTION(bfq_wr_min_idle_time_show, bfqd->bfq_wr_min_idle_time, 1); -+SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, -+ 1); -+SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); -+#undef SHOW_FUNCTION -+ -+#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ -+static ssize_t \ -+__FUNC(struct elevator_queue *e, const char *page, size_t count) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ if (__CONV) \ -+ *(__PTR) = msecs_to_jiffies(__data); \ -+ else \ -+ *(__PTR) = __data; \ -+ return ret; \ -+} -+STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_back_seek_max_store, &bfqd->bfq_back_max, 0, INT_MAX, 0); -+STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, -+ INT_MAX, 0); -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, -+ 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); -+STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); 
-+STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -+ 1); -+STORE_FUNCTION(bfq_wr_min_idle_time_store, &bfqd->bfq_wr_min_idle_time, 0, -+ INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_min_inter_arr_async_store, -+ &bfqd->bfq_wr_min_inter_arr_async, 0, INT_MAX, 1); -+STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, -+ INT_MAX, 0); -+#undef STORE_FUNCTION -+ -+/* do nothing for the moment */ -+static ssize_t bfq_weights_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ return count; -+} -+ -+static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) -+{ -+ u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -+ -+ if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) -+ return bfq_calc_max_budget(bfqd->peak_rate, timeout); -+ else -+ return bfq_default_max_budget; -+} -+ -+static ssize_t bfq_max_budget_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data == 0) -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ else { -+ if (__data > INT_MAX) -+ __data = INT_MAX; -+ bfqd->bfq_max_budget = __data; -+ } -+ -+ bfqd->bfq_user_max_budget = __data; -+ -+ return ret; -+} -+ -+static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data < 1) -+ __data = 1; -+ else if (__data > INT_MAX) -+ __data = INT_MAX; -+ -+ bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); -+ if (bfqd->bfq_user_max_budget == 0) -+ bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_low_latency_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct 
bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (__data == 0 && bfqd->low_latency != 0) -+ bfq_end_wr(bfqd); -+ bfqd->low_latency = __data; -+ -+ return ret; -+} -+ -+#define BFQ_ATTR(name) \ -+ __ATTR(name, S_IRUGO|S_IWUSR, bfq_##name##_show, bfq_##name##_store) -+ -+static struct elv_fs_entry bfq_attrs[] = { -+ BFQ_ATTR(fifo_expire_sync), -+ BFQ_ATTR(fifo_expire_async), -+ BFQ_ATTR(back_seek_max), -+ BFQ_ATTR(back_seek_penalty), -+ BFQ_ATTR(slice_idle), -+ BFQ_ATTR(max_budget), -+ BFQ_ATTR(max_budget_async_rq), -+ BFQ_ATTR(timeout_sync), -+ BFQ_ATTR(timeout_async), -+ BFQ_ATTR(low_latency), -+ BFQ_ATTR(wr_coeff), -+ BFQ_ATTR(wr_max_time), -+ BFQ_ATTR(wr_rt_max_time), -+ BFQ_ATTR(wr_min_idle_time), -+ BFQ_ATTR(wr_min_inter_arr_async), -+ BFQ_ATTR(wr_max_softrt_rate), -+ BFQ_ATTR(weights), -+ __ATTR_NULL -+}; -+ -+static struct elevator_type iosched_bfq = { -+ .ops = { -+ .elevator_merge_fn = bfq_merge, -+ .elevator_merged_fn = bfq_merged_request, -+ .elevator_merge_req_fn = bfq_merged_requests, -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ .elevator_bio_merged_fn = bfq_bio_merged, -+#endif -+ .elevator_allow_merge_fn = bfq_allow_merge, -+ .elevator_dispatch_fn = bfq_dispatch_requests, -+ .elevator_add_req_fn = bfq_insert_request, -+ .elevator_activate_req_fn = bfq_activate_request, -+ .elevator_deactivate_req_fn = bfq_deactivate_request, -+ .elevator_completed_req_fn = bfq_completed_request, -+ .elevator_former_req_fn = elv_rb_former_request, -+ .elevator_latter_req_fn = elv_rb_latter_request, -+ .elevator_init_icq_fn = bfq_init_icq, -+ .elevator_exit_icq_fn = bfq_exit_icq, -+ .elevator_set_req_fn = bfq_set_request, -+ .elevator_put_req_fn = bfq_put_request, -+ .elevator_may_queue_fn = bfq_may_queue, -+ .elevator_init_fn = bfq_init_queue, -+ .elevator_exit_fn = bfq_exit_queue, -+ }, -+ .icq_size = sizeof(struct bfq_io_cq), -+ .icq_align = 
__alignof__(struct bfq_io_cq), -+ .elevator_attrs = bfq_attrs, -+ .elevator_name = "bfq", -+ .elevator_owner = THIS_MODULE, -+}; -+ -+static int __init bfq_init(void) -+{ -+ int ret; -+ -+ /* -+ * Can be 0 on HZ < 1000 setups. -+ */ -+ if (bfq_slice_idle == 0) -+ bfq_slice_idle = 1; -+ -+ if (bfq_timeout_async == 0) -+ bfq_timeout_async = 1; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ ret = blkcg_policy_register(&blkcg_policy_bfq); -+ if (ret) -+ return ret; -+#endif -+ -+ ret = -ENOMEM; -+ if (bfq_slab_setup()) -+ goto err_pol_unreg; -+ -+ /* -+ * Times to load large popular applications for the typical systems -+ * installed on the reference devices (see the comments before the -+ * definitions of the two arrays). -+ */ -+ T_slow[0] = msecs_to_jiffies(2600); -+ T_slow[1] = msecs_to_jiffies(1000); -+ T_fast[0] = msecs_to_jiffies(5500); -+ T_fast[1] = msecs_to_jiffies(2000); -+ -+ /* -+ * Thresholds that determine the switch between speed classes (see -+ * the comments before the definition of the array). -+ */ -+ device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; -+ device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; -+ -+ ret = elv_register(&iosched_bfq); -+ if (ret) -+ goto err_pol_unreg; -+ -+ pr_info("BFQ I/O-scheduler: v7r11"); -+ -+ return 0; -+ -+err_pol_unreg: -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ return ret; -+} -+ -+static void __exit bfq_exit(void) -+{ -+ elv_unregister(&iosched_bfq); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ blkcg_policy_unregister(&blkcg_policy_bfq); -+#endif -+ bfq_slab_kill(); -+} -+ -+module_init(bfq_init); -+module_exit(bfq_exit); -+ -+MODULE_AUTHOR("Arianna Avanzini, Fabio Checconi, Paolo Valente"); -+MODULE_LICENSE("GPL"); -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -new file mode 100644 -index 0000000..a5ed694 ---- /dev/null -+++ b/block/bfq-sched.c -@@ -0,0 +1,1199 @@ -+/* -+ * BFQ: Hierarchical B-WF2Q+ scheduler. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+#define for_each_entity(entity) \ -+ for (; entity ; entity = entity->parent) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (; entity && ({ parent = entity->parent; 1; }); entity = parent) -+ -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ int extract, -+ struct bfq_data *bfqd); -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+static void bfq_update_budget(struct bfq_entity *next_in_service) -+{ -+ struct bfq_entity *bfqg_entity; -+ struct bfq_group *bfqg; -+ struct bfq_sched_data *group_sd; -+ -+ BUG_ON(!next_in_service); -+ -+ group_sd = next_in_service->sched_data; -+ -+ bfqg = container_of(group_sd, struct bfq_group, sched_data); -+ /* -+ * bfq_group's my_entity field is not NULL only if the group -+ * is not the root group. We must not touch the root entity -+ * as it must never become an in-service entity. -+ */ -+ bfqg_entity = bfqg->my_entity; -+ if (bfqg_entity) -+ bfqg_entity->budget = next_in_service->budget; -+} -+ -+static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+{ -+ struct bfq_entity *next_in_service; -+ -+ if (sd->in_service_entity) -+ /* will update/requeue at the end of service */ -+ return 0; -+ -+ /* -+ * NOTE: this can be improved in many ways, such as returning -+ * 1 (and thus propagating upwards the update) only when the -+ * budget changes, or caching the bfqq that will be scheduled -+ * next from this subtree. By now we worry more about -+ * correctness than about performance... 
-+ */ -+ next_in_service = bfq_lookup_next_entity(sd, 0, NULL); -+ sd->next_in_service = next_in_service; -+ -+ if (next_in_service) -+ bfq_update_budget(next_in_service); -+ -+ return 1; -+} -+ -+static void bfq_check_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *entity) -+{ -+ BUG_ON(sd->next_in_service != entity); -+} -+#else -+#define for_each_entity(entity) \ -+ for (; entity ; entity = NULL) -+ -+#define for_each_entity_safe(entity, parent) \ -+ for (parent = NULL; entity ; entity = parent) -+ -+static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+{ -+ return 0; -+} -+ -+static void bfq_check_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *entity) -+{ -+} -+ -+static void bfq_update_budget(struct bfq_entity *next_in_service) -+{ -+} -+#endif -+ -+/* -+ * Shift for timestamp calculations. This actually limits the maximum -+ * service allowed in one timestamp delta (small shift values increase it), -+ * the maximum total weight that can be used for the queues in the system -+ * (big shift values increase it), and the period of virtual time -+ * wraparounds. -+ */ -+#define WFQ_SERVICE_SHIFT 22 -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = NULL; -+ -+ BUG_ON(!entity); -+ -+ if (!entity->my_sched_data) -+ bfqq = container_of(entity, struct bfq_queue, entity); -+ -+ return bfqq; -+} -+ -+ -+/** -+ * bfq_delta - map service into the virtual time domain. -+ * @service: amount of service. -+ * @weight: scale factor (weight of an entity or weight sum). 
-+ */ -+static u64 bfq_delta(unsigned long service, unsigned long weight) -+{ -+ u64 d = (u64)service << WFQ_SERVICE_SHIFT; -+ -+ do_div(d, weight); -+ return d; -+} -+ -+/** -+ * bfq_calc_finish - assign the finish time to an entity. -+ * @entity: the entity to act upon. -+ * @service: the service to be charged to the entity. -+ */ -+static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ BUG_ON(entity->weight == 0); -+ -+ entity->finish = entity->start + -+ bfq_delta(service, entity->weight); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_finish: start %llu, finish %llu, delta %llu", -+ entity->start, entity->finish, -+ bfq_delta(service, entity->weight)); -+ } -+} -+ -+/** -+ * bfq_entity_of - get an entity from a node. -+ * @node: the node field of the entity. -+ * -+ * Convert a node pointer to the relative entity. This is used only -+ * to simplify the logic of some functions and not as the generic -+ * conversion mechanism because, e.g., in the tree walking functions, -+ * the check for a %NULL value would be redundant. -+ */ -+static struct bfq_entity *bfq_entity_of(struct rb_node *node) -+{ -+ struct bfq_entity *entity = NULL; -+ -+ if (node) -+ entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ return entity; -+} -+ -+/** -+ * bfq_extract - remove an entity from a tree. -+ * @root: the tree root. -+ * @entity: the entity to remove. -+ */ -+static void bfq_extract(struct rb_root *root, struct bfq_entity *entity) -+{ -+ BUG_ON(entity->tree != root); -+ -+ entity->tree = NULL; -+ rb_erase(&entity->rb_node, root); -+} -+ -+/** -+ * bfq_idle_extract - extract an entity from the idle tree. -+ * @st: the service tree of the owning @entity. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_idle_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *next; -+ -+ BUG_ON(entity->tree != &st->idle); -+ -+ if (entity == st->first_idle) { -+ next = rb_next(&entity->rb_node); -+ st->first_idle = bfq_entity_of(next); -+ } -+ -+ if (entity == st->last_idle) { -+ next = rb_prev(&entity->rb_node); -+ st->last_idle = bfq_entity_of(next); -+ } -+ -+ bfq_extract(&st->idle, entity); -+ -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+} -+ -+/** -+ * bfq_insert - generic tree insertion. -+ * @root: tree root. -+ * @entity: entity to insert. -+ * -+ * This is used for the idle and the active tree, since they are both -+ * ordered by finish time. -+ */ -+static void bfq_insert(struct rb_root *root, struct bfq_entity *entity) -+{ -+ struct bfq_entity *entry; -+ struct rb_node **node = &root->rb_node; -+ struct rb_node *parent = NULL; -+ -+ BUG_ON(entity->tree); -+ -+ while (*node) { -+ parent = *node; -+ entry = rb_entry(parent, struct bfq_entity, rb_node); -+ -+ if (bfq_gt(entry->finish, entity->finish)) -+ node = &parent->rb_left; -+ else -+ node = &parent->rb_right; -+ } -+ -+ rb_link_node(&entity->rb_node, parent, node); -+ rb_insert_color(&entity->rb_node, root); -+ -+ entity->tree = root; -+} -+ -+/** -+ * bfq_update_min - update the min_start field of a entity. -+ * @entity: the entity to update. -+ * @node: one of its children. -+ * -+ * This function is called when @entity may store an invalid value for -+ * min_start due to updates to the active tree. The function assumes -+ * that the subtree rooted at @node (which may be its left or its right -+ * child) has a valid min_start value. 
-+ */ -+static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) -+{ -+ struct bfq_entity *child; -+ -+ if (node) { -+ child = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entity->min_start, child->min_start)) -+ entity->min_start = child->min_start; -+ } -+} -+ -+/** -+ * bfq_update_active_node - recalculate min_start. -+ * @node: the node to update. -+ * -+ * @node may have changed position or one of its children may have moved, -+ * this function updates its min_start value. The left and right subtrees -+ * are assumed to hold a correct min_start value. -+ */ -+static void bfq_update_active_node(struct rb_node *node) -+{ -+ struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ -+ entity->min_start = entity->start; -+ bfq_update_min(entity, node->rb_right); -+ bfq_update_min(entity, node->rb_left); -+} -+ -+/** -+ * bfq_update_active_tree - update min_start for the whole active tree. -+ * @node: the starting node. -+ * -+ * @node must be the deepest modified node after an update. This function -+ * updates its min_start using the values held by its children, assuming -+ * that they did not change, and then updates all the nodes that may have -+ * changed in the path to the root. The only nodes that may have changed -+ * are the ones in the path or their siblings. 
-+ */ -+static void bfq_update_active_tree(struct rb_node *node) -+{ -+ struct rb_node *parent; -+ -+up: -+ bfq_update_active_node(node); -+ -+ parent = rb_parent(node); -+ if (!parent) -+ return; -+ -+ if (node == parent->rb_left && parent->rb_right) -+ bfq_update_active_node(parent->rb_right); -+ else if (parent->rb_left) -+ bfq_update_active_node(parent->rb_left); -+ -+ node = parent; -+ goto up; -+} -+ -+static void bfq_weights_tree_add(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+static void bfq_weights_tree_remove(struct bfq_data *bfqd, -+ struct bfq_entity *entity, -+ struct rb_root *root); -+ -+ -+/** -+ * bfq_active_insert - insert an entity in the active tree of its -+ * group/device. -+ * @st: the service tree of the entity. -+ * @entity: the entity being inserted. -+ * -+ * The active tree is ordered by finish time, but an extra key is kept -+ * per each node, containing the minimum value for the start times of -+ * its children (and the node itself), so it's possible to search for -+ * the eligible node with the lowest finish time in logarithmic time. 
-+ */ -+static void bfq_active_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node = &entity->rb_node; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ bfq_insert(&st->active, entity); -+ -+ if (node->rb_left) -+ node = node->rb_left; -+ else if (node->rb_right) -+ node = node->rb_right; -+ -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->active_list); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_add(bfqd, entity, &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ bfqg->active_entities++; -+ if (bfqg->active_entities == 2) -+ bfqd->active_numerous_groups++; -+ } -+#endif -+} -+ -+/** -+ * bfq_ioprio_to_weight - calc a weight from an ioprio. -+ * @ioprio: the ioprio value to convert. -+ */ -+static unsigned short bfq_ioprio_to_weight(int ioprio) -+{ -+ BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; -+} -+ -+/** -+ * bfq_weight_to_ioprio - calc an ioprio from a weight. -+ * @weight: the weight value to convert. -+ * -+ * To preserve as much as possible the old only-ioprio user interface, -+ * 0 is used as an escape ioprio value for weights (numerically) equal or -+ * larger than IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF. -+ */ -+static unsigned short bfq_weight_to_ioprio(int weight) -+{ -+ BUG_ON(weight < BFQ_MIN_WEIGHT || weight > BFQ_MAX_WEIGHT); -+ return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight < 0 ? 
-+ 0 : IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - weight; -+} -+ -+static void bfq_get_entity(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ if (bfqq) { -+ atomic_inc(&bfqq->ref); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ } -+} -+ -+/** -+ * bfq_find_deepest - find the deepest node that an extraction can modify. -+ * @node: the node being removed. -+ * -+ * Do the first step of an extraction in an rb tree, looking for the -+ * node that will replace @node, and returning the deepest node that -+ * the following modifications to the tree can touch. If @node is the -+ * last node in the tree return %NULL. -+ */ -+static struct rb_node *bfq_find_deepest(struct rb_node *node) -+{ -+ struct rb_node *deepest; -+ -+ if (!node->rb_right && !node->rb_left) -+ deepest = rb_parent(node); -+ else if (!node->rb_right) -+ deepest = node->rb_left; -+ else if (!node->rb_left) -+ deepest = node->rb_right; -+ else { -+ deepest = rb_next(node); -+ if (deepest->rb_right) -+ deepest = deepest->rb_right; -+ else if (rb_parent(deepest) != node) -+ deepest = rb_parent(deepest); -+ } -+ -+ return deepest; -+} -+ -+/** -+ * bfq_active_extract - remove an entity from the active tree. -+ * @st: the service_tree containing the tree. -+ * @entity: the entity being removed. 
-+ */ -+static void bfq_active_extract(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct rb_node *node; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd = NULL; -+ struct bfq_group *bfqg = NULL; -+ struct bfq_data *bfqd = NULL; -+#endif -+ -+ node = bfq_find_deepest(&entity->rb_node); -+ bfq_extract(&st->active, entity); -+ -+ if (node) -+ bfq_update_active_tree(node); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ sd = entity->sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+#endif -+ if (bfqq) -+ list_del(&bfqq->bfqq_list); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { /* bfq_group */ -+ BUG_ON(!bfqd); -+ bfq_weights_tree_remove(bfqd, entity, -+ &bfqd->group_weights_tree); -+ } -+ if (bfqg != bfqd->root_group) { -+ BUG_ON(!bfqg); -+ BUG_ON(!bfqd); -+ BUG_ON(!bfqg->active_entities); -+ bfqg->active_entities--; -+ if (bfqg->active_entities == 1) { -+ BUG_ON(!bfqd->active_numerous_groups); -+ bfqd->active_numerous_groups--; -+ } -+ } -+#endif -+} -+ -+/** -+ * bfq_idle_insert - insert an entity into the idle tree. -+ * @st: the service tree containing the tree. -+ * @entity: the entity to insert. -+ */ -+static void bfq_idle_insert(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (!first_idle || bfq_gt(first_idle->finish, entity->finish)) -+ st->first_idle = entity; -+ if (!last_idle || bfq_gt(entity->finish, last_idle->finish)) -+ st->last_idle = entity; -+ -+ bfq_insert(&st->idle, entity); -+ -+ if (bfqq) -+ list_add(&bfqq->bfqq_list, &bfqq->bfqd->idle_list); -+} -+ -+/** -+ * bfq_forget_entity - remove an entity from the wfq trees. -+ * @st: the service tree. -+ * @entity: the entity being removed. 
-+ * -+ * Update the device status and forget everything about @entity, putting -+ * the device reference to it, if it is a queue. Entities belonging to -+ * groups are not refcounted. -+ */ -+static void bfq_forget_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd; -+ -+ BUG_ON(!entity->on_st); -+ -+ entity->on_st = 0; -+ st->wsum -= entity->weight; -+ if (bfqq) { -+ sd = entity->sched_data; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "forget_entity: %p %d", -+ bfqq, atomic_read(&bfqq->ref)); -+ bfq_put_queue(bfqq); -+ } -+} -+ -+/** -+ * bfq_put_idle_entity - release the idle tree ref of an entity. -+ * @st: service tree for the entity. -+ * @entity: the entity being released. -+ */ -+static void bfq_put_idle_entity(struct bfq_service_tree *st, -+ struct bfq_entity *entity) -+{ -+ bfq_idle_extract(st, entity); -+ bfq_forget_entity(st, entity); -+} -+ -+/** -+ * bfq_forget_idle - update the idle tree if necessary. -+ * @st: the service tree to act upon. -+ * -+ * To preserve the global O(log N) complexity we only remove one entry here; -+ * as the idle tree will not grow indefinitely this can be done safely. -+ */ -+static void bfq_forget_idle(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *first_idle = st->first_idle; -+ struct bfq_entity *last_idle = st->last_idle; -+ -+ if (RB_EMPTY_ROOT(&st->active) && last_idle && -+ !bfq_gt(last_idle->finish, st->vtime)) { -+ /* -+ * Forget the whole idle tree, increasing the vtime past -+ * the last finish time of idle entities. 
-+ */ -+ st->vtime = last_idle->finish; -+ } -+ -+ if (first_idle && !bfq_gt(first_idle->finish, st->vtime)) -+ bfq_put_idle_entity(st, first_idle); -+} -+ -+static struct bfq_service_tree * -+__bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, -+ struct bfq_entity *entity) -+{ -+ struct bfq_service_tree *new_st = old_st; -+ -+ if (entity->prio_changed) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned short prev_weight, new_weight; -+ struct bfq_data *bfqd = NULL; -+ struct rb_root *root; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ struct bfq_sched_data *sd; -+ struct bfq_group *bfqg; -+#endif -+ -+ if (bfqq) -+ bfqd = bfqq->bfqd; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ sd = entity->my_sched_data; -+ bfqg = container_of(sd, struct bfq_group, sched_data); -+ BUG_ON(!bfqg); -+ bfqd = (struct bfq_data *)bfqg->bfqd; -+ BUG_ON(!bfqd); -+ } -+#endif -+ -+ BUG_ON(old_st->wsum < entity->weight); -+ old_st->wsum -= entity->weight; -+ -+ if (entity->new_weight != entity->orig_weight) { -+ if (entity->new_weight < BFQ_MIN_WEIGHT || -+ entity->new_weight > BFQ_MAX_WEIGHT) { -+ pr_crit("update_weight_prio: new_weight %d\n", -+ entity->new_weight); -+ BUG(); -+ } -+ entity->orig_weight = entity->new_weight; -+ if (bfqq) -+ bfqq->ioprio = -+ bfq_weight_to_ioprio(entity->orig_weight); -+ } -+ -+ if (bfqq) -+ bfqq->ioprio_class = bfqq->new_ioprio_class; -+ entity->prio_changed = 0; -+ -+ /* -+ * NOTE: here we may be changing the weight too early, -+ * this will cause unfairness. The correct approach -+ * would have required additional complexity to defer -+ * weight changes to the proper time instants (i.e., -+ * when entity->finish <= old_st->vtime). -+ */ -+ new_st = bfq_entity_service_tree(entity); -+ -+ prev_weight = entity->weight; -+ new_weight = entity->orig_weight * -+ (bfqq ? 
bfqq->wr_coeff : 1); -+ /* -+ * If the weight of the entity changes, remove the entity -+ * from its old weight counter (if there is a counter -+ * associated with the entity), and add it to the counter -+ * associated with its new weight. -+ */ -+ if (prev_weight != new_weight) { -+ root = bfqq ? &bfqd->queue_weights_tree : -+ &bfqd->group_weights_tree; -+ bfq_weights_tree_remove(bfqd, entity, root); -+ } -+ entity->weight = new_weight; -+ /* -+ * Add the entity to its weights tree only if it is -+ * not associated with a weight-raised queue. -+ */ -+ if (prev_weight != new_weight && -+ (bfqq ? bfqq->wr_coeff == 1 : 1)) -+ /* If we get here, root has been initialized. */ -+ bfq_weights_tree_add(bfqd, entity, root); -+ -+ new_st->wsum += entity->weight; -+ -+ if (new_st != old_st) -+ entity->start = new_st->vtime; -+ } -+ -+ return new_st; -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); -+#endif -+ -+/** -+ * bfq_bfqq_served - update the scheduler status after selection for -+ * service. -+ * @bfqq: the queue being served. -+ * @served: bytes to transfer. -+ * -+ * NOTE: this can be optimized, as the timestamps of upper level entities -+ * are synchronized every time a new bfqq is selected for service. By now, -+ * we keep it to better check consistency. -+ */ -+static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st; -+ -+ for_each_entity(entity) { -+ st = bfq_entity_service_tree(entity); -+ -+ entity->service += served; -+ BUG_ON(entity->service > entity->budget); -+ BUG_ON(st->wsum == 0); -+ -+ st->vtime += bfq_delta(served, st->wsum); -+ bfq_forget_idle(st); -+ } -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); -+#endif -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); -+} -+ -+/** -+ * bfq_bfqq_charge_full_budget - set the service to the entity budget. 
-+ * @bfqq: the queue that needs a service update. -+ * -+ * When it's not possible to be fair in the service domain, because -+ * a queue is not consuming its budget fast enough (the meaning of -+ * fast depends on the timeout parameter), we charge it a full -+ * budget. In this way we should obtain a sort of time-domain -+ * fairness among all the seeky/slow queues. -+ */ -+static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); -+ -+ bfq_bfqq_served(bfqq, entity->budget - entity->service); -+} -+ -+/** -+ * __bfq_activate_entity - activate an entity. -+ * @entity: the entity being activated. -+ * -+ * Called whenever an entity is activated, i.e., it is not active and one -+ * of its children receives a new request, or has to be reactivated due to -+ * budget exhaustion. It uses the current budget of the entity (and the -+ * service received if @entity is active) of the queue to calculate its -+ * timestamps. -+ */ -+static void __bfq_activate_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (entity == sd->in_service_entity) { -+ BUG_ON(entity->tree); -+ /* -+ * If we are requeueing the current entity we have -+ * to take care of not charging to it service it has -+ * not received. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ sd->in_service_entity = NULL; -+ } else if (entity->tree == &st->active) { -+ /* -+ * Requeueing an entity due to a change of some -+ * next_in_service entity below it. We reuse the -+ * old start time. -+ */ -+ bfq_active_extract(st, entity); -+ } else if (entity->tree == &st->idle) { -+ /* -+ * Must be on the idle tree, bfq_idle_extract() will -+ * check for that. -+ */ -+ bfq_idle_extract(st, entity); -+ entity->start = bfq_gt(st->vtime, entity->finish) ? 
-+ st->vtime : entity->finish; -+ } else { -+ /* -+ * The finish time of the entity may be invalid, and -+ * it is in the past for sure, otherwise the queue -+ * would have been on the idle tree. -+ */ -+ entity->start = st->vtime; -+ st->wsum += entity->weight; -+ bfq_get_entity(entity); -+ -+ BUG_ON(entity->on_st); -+ entity->on_st = 1; -+ } -+ -+ st = __bfq_entity_update_weight_prio(st, entity); -+ bfq_calc_finish(entity, entity->budget); -+ bfq_active_insert(st, entity); -+} -+ -+/** -+ * bfq_activate_entity - activate an entity and its ancestors if necessary. -+ * @entity: the entity to activate. -+ * -+ * Activate @entity and all the entities on the path from it to the root. -+ */ -+static void bfq_activate_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd; -+ -+ for_each_entity(entity) { -+ __bfq_activate_entity(entity); -+ -+ sd = entity->sched_data; -+ if (!bfq_update_next_in_service(sd)) -+ /* -+ * No need to propagate the activation to the -+ * upper entities, as they will be updated when -+ * the in-service entity is rescheduled. -+ */ -+ break; -+ } -+} -+ -+/** -+ * __bfq_deactivate_entity - deactivate an entity from its service tree. -+ * @entity: the entity to deactivate. -+ * @requeue: if false, the entity will not be put into the idle tree. -+ * -+ * Deactivate an entity, independently from its previous state. If the -+ * entity was not on a service tree just return, otherwise if it is on -+ * any scheduler tree, extract it from that tree, and if necessary -+ * and if the caller did not specify @requeue, put it on the idle tree. -+ * -+ * Return %1 if the caller should update the entity hierarchy, i.e., -+ * if the entity was in service or if it was the next_in_service for -+ * its sched_data; return %0 otherwise. 
-+ */ -+static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st; -+ int was_in_service; -+ int ret = 0; -+ -+ if (sd == NULL || !entity->on_st) /* never activated, or inactive */ -+ return 0; -+ -+ st = bfq_entity_service_tree(entity); -+ was_in_service = entity == sd->in_service_entity; -+ -+ BUG_ON(was_in_service && entity->tree); -+ -+ if (was_in_service) { -+ bfq_calc_finish(entity, entity->service); -+ sd->in_service_entity = NULL; -+ } else if (entity->tree == &st->active) -+ bfq_active_extract(st, entity); -+ else if (entity->tree == &st->idle) -+ bfq_idle_extract(st, entity); -+ else if (entity->tree) -+ BUG(); -+ -+ if (was_in_service || sd->next_in_service == entity) -+ ret = bfq_update_next_in_service(sd); -+ -+ if (!requeue || !bfq_gt(entity->finish, st->vtime)) -+ bfq_forget_entity(st, entity); -+ else -+ bfq_idle_insert(st, entity); -+ -+ BUG_ON(sd->in_service_entity == entity); -+ BUG_ON(sd->next_in_service == entity); -+ -+ return ret; -+} -+ -+/** -+ * bfq_deactivate_entity - deactivate an entity. -+ * @entity: the entity to deactivate. -+ * @requeue: true if the entity can be put on the idle tree -+ */ -+static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+{ -+ struct bfq_sched_data *sd; -+ struct bfq_entity *parent; -+ -+ for_each_entity_safe(entity, parent) { -+ sd = entity->sched_data; -+ -+ if (!__bfq_deactivate_entity(entity, requeue)) -+ /* -+ * The parent entity is still backlogged, and -+ * we don't need to update it as it is still -+ * in service. -+ */ -+ break; -+ -+ if (sd->next_in_service) -+ /* -+ * The parent entity is still backlogged and -+ * the budgets on the path towards the root -+ * need to be updated. -+ */ -+ goto update; -+ -+ /* -+ * If we reach there the parent is no more backlogged and -+ * we want to propagate the dequeue upwards. 
-+ */ -+ requeue = 1; -+ } -+ -+ return; -+ -+update: -+ entity = parent; -+ for_each_entity(entity) { -+ __bfq_activate_entity(entity); -+ -+ sd = entity->sched_data; -+ if (!bfq_update_next_in_service(sd)) -+ break; -+ } -+} -+ -+/** -+ * bfq_update_vtime - update vtime if necessary. -+ * @st: the service tree to act upon. -+ * -+ * If necessary update the service tree vtime to have at least one -+ * eligible entity, skipping to its start time. Assumes that the -+ * active tree of the device is not empty. -+ * -+ * NOTE: this hierarchical implementation updates vtimes quite often, -+ * we may end up with reactivated processes getting timestamps after a -+ * vtime skip done because we needed a ->first_active entity on some -+ * intermediate node. -+ */ -+static void bfq_update_vtime(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entry; -+ struct rb_node *node = st->active.rb_node; -+ -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+ if (bfq_gt(entry->min_start, st->vtime)) { -+ st->vtime = entry->min_start; -+ bfq_forget_idle(st); -+ } -+} -+ -+/** -+ * bfq_first_active_entity - find the eligible entity with -+ * the smallest finish time -+ * @st: the service tree to select from. -+ * -+ * This function searches the first schedulable entity, starting from the -+ * root of the tree and going on the left every time on this side there is -+ * a subtree with at least one eligible (start >= vtime) entity. The path on -+ * the right is followed only if a) the left subtree contains no eligible -+ * entities and b) no eligible entity has been found yet. 
-+ */ -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -+{ -+ struct bfq_entity *entry, *first = NULL; -+ struct rb_node *node = st->active.rb_node; -+ -+ while (node) { -+ entry = rb_entry(node, struct bfq_entity, rb_node); -+left: -+ if (!bfq_gt(entry->start, st->vtime)) -+ first = entry; -+ -+ BUG_ON(bfq_gt(entry->min_start, st->vtime)); -+ -+ if (node->rb_left) { -+ entry = rb_entry(node->rb_left, -+ struct bfq_entity, rb_node); -+ if (!bfq_gt(entry->min_start, st->vtime)) { -+ node = node->rb_left; -+ goto left; -+ } -+ } -+ if (first) -+ break; -+ node = node->rb_right; -+ } -+ -+ BUG_ON(!first && !RB_EMPTY_ROOT(&st->active)); -+ return first; -+} -+ -+/** -+ * __bfq_lookup_next_entity - return the first eligible entity in @st. -+ * @st: the service tree. -+ * -+ * Update the virtual time in @st and return the first eligible entity -+ * it contains. -+ */ -+static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, -+ bool force) -+{ -+ struct bfq_entity *entity, *new_next_in_service = NULL; -+ -+ if (RB_EMPTY_ROOT(&st->active)) -+ return NULL; -+ -+ bfq_update_vtime(st); -+ entity = bfq_first_active_entity(st); -+ BUG_ON(bfq_gt(entity->start, st->vtime)); -+ -+ /* -+ * If the chosen entity does not match with the sched_data's -+ * next_in_service and we are forcedly serving the IDLE priority -+ * class tree, bubble up budget update. -+ */ -+ if (unlikely(force && entity != entity->sched_data->next_in_service)) { -+ new_next_in_service = entity; -+ for_each_entity(new_next_in_service) -+ bfq_update_budget(new_next_in_service); -+ } -+ -+ return entity; -+} -+ -+/** -+ * bfq_lookup_next_entity - return the first eligible entity in @sd. -+ * @sd: the sched_data. -+ * @extract: if true the returned entity will be also extracted from @sd. 
-+ * -+ * NOTE: since we cache the next_in_service entity at each level of the -+ * hierarchy, the complexity of the lookup can be decreased with -+ * absolutely no effort just returning the cached next_in_service value; -+ * we prefer to do full lookups to test the consistency of * the data -+ * structures. -+ */ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -+ int extract, -+ struct bfq_data *bfqd) -+{ -+ struct bfq_service_tree *st = sd->service_tree; -+ struct bfq_entity *entity; -+ int i = 0; -+ -+ BUG_ON(sd->in_service_entity); -+ -+ if (bfqd && -+ jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { -+ entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, -+ true); -+ if (entity) { -+ i = BFQ_IOPRIO_CLASSES - 1; -+ bfqd->bfq_class_idle_last_service = jiffies; -+ sd->next_in_service = entity; -+ } -+ } -+ for (; i < BFQ_IOPRIO_CLASSES; i++) { -+ entity = __bfq_lookup_next_entity(st + i, false); -+ if (entity) { -+ if (extract) { -+ bfq_check_next_in_service(sd, entity); -+ bfq_active_extract(st + i, entity); -+ sd->in_service_entity = entity; -+ sd->next_in_service = NULL; -+ } -+ break; -+ } -+ } -+ -+ return entity; -+} -+ -+/* -+ * Get next queue for service. 
-+ */ -+static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) -+{ -+ struct bfq_entity *entity = NULL; -+ struct bfq_sched_data *sd; -+ struct bfq_queue *bfqq; -+ -+ BUG_ON(bfqd->in_service_queue); -+ -+ if (bfqd->busy_queues == 0) -+ return NULL; -+ -+ sd = &bfqd->root_group->sched_data; -+ for (; sd ; sd = entity->my_sched_data) { -+ entity = bfq_lookup_next_entity(sd, 1, bfqd); -+ BUG_ON(!entity); -+ entity->service = 0; -+ } -+ -+ bfqq = bfq_entity_to_bfqq(entity); -+ BUG_ON(!bfqq); -+ -+ return bfqq; -+} -+ -+static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) -+{ -+ if (bfqd->in_service_bic) { -+ put_io_context(bfqd->in_service_bic->icq.ioc); -+ bfqd->in_service_bic = NULL; -+ } -+ -+ bfqd->in_service_queue = NULL; -+ del_timer(&bfqd->idle_slice_timer); -+} -+ -+static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ int requeue) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfqq == bfqd->in_service_queue) -+ __bfq_bfqd_reset_in_service(bfqd); -+ -+ bfq_deactivate_entity(entity, requeue); -+} -+ -+static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_entity(entity); -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); -+#endif -+ -+/* -+ * Called when the bfqq no longer has requests pending, remove it from -+ * the service tree. 
-+ */ -+static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ int requeue) -+{ -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ bfq_log_bfqq(bfqd, bfqq, "del from busy"); -+ -+ bfq_clear_bfqq_busy(bfqq); -+ -+ BUG_ON(bfqd->busy_queues == 0); -+ bfqd->busy_queues--; -+ -+ if (!bfqq->dispatched) { -+ bfq_weights_tree_remove(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ BUG_ON(!bfqd->busy_in_flight_queues); -+ bfqd->busy_in_flight_queues--; -+ if (bfq_bfqq_constantly_seeky(bfqq)) { -+ BUG_ON(!bfqd-> -+ const_seeky_busy_in_flight_queues); -+ bfqd->const_seeky_busy_in_flight_queues--; -+ } -+ } -+ } -+ if (bfqq->wr_coeff > 1) -+ bfqd->wr_busy_queues--; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ bfqg_stats_update_dequeue(bfqq_group(bfqq)); -+#endif -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, requeue); -+} -+ -+/* -+ * Called when an inactive queue receives a new request. -+ */ -+static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ BUG_ON(bfq_bfqq_busy(bfqq)); -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ -+ bfq_log_bfqq(bfqd, bfqq, "add to busy"); -+ -+ bfq_activate_bfqq(bfqd, bfqq); -+ -+ bfq_mark_bfqq_busy(bfqq); -+ bfqd->busy_queues++; -+ -+ if (!bfqq->dispatched) { -+ if (bfqq->wr_coeff == 1) -+ bfq_weights_tree_add(bfqd, &bfqq->entity, -+ &bfqd->queue_weights_tree); -+ if (!blk_queue_nonrot(bfqd->queue)) { -+ bfqd->busy_in_flight_queues++; -+ if (bfq_bfqq_constantly_seeky(bfqq)) -+ bfqd->const_seeky_busy_in_flight_queues++; -+ } -+ } -+ if (bfqq->wr_coeff > 1) -+ bfqd->wr_busy_queues++; -+} -diff --git a/block/bfq.h b/block/bfq.h -new file mode 100644 -index 0000000..2bf54ae ---- /dev/null -+++ b/block/bfq.h -@@ -0,0 +1,801 @@ -+/* -+ * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. 
-+ * -+ * Based on ideas and code from CFQ: -+ * Copyright (C) 2003 Jens Axboe -+ * -+ * Copyright (C) 2008 Fabio Checconi -+ * Paolo Valente -+ * -+ * Copyright (C) 2010 Paolo Valente -+ */ -+ -+#ifndef _BFQ_H -+#define _BFQ_H -+ -+#include -+#include -+#include -+#include -+#include -+ -+#define BFQ_IOPRIO_CLASSES 3 -+#define BFQ_CL_IDLE_TIMEOUT (HZ/5) -+ -+#define BFQ_MIN_WEIGHT 1 -+#define BFQ_MAX_WEIGHT 1000 -+#define BFQ_WEIGHT_CONVERSION_COEFF 10 -+ -+#define BFQ_DEFAULT_QUEUE_IOPRIO 4 -+ -+#define BFQ_DEFAULT_GRP_WEIGHT 10 -+#define BFQ_DEFAULT_GRP_IOPRIO 0 -+#define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE -+ -+struct bfq_entity; -+ -+/** -+ * struct bfq_service_tree - per ioprio_class service tree. -+ * @active: tree for active entities (i.e., those backlogged). -+ * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). -+ * @first_idle: idle entity with minimum F_i. -+ * @last_idle: idle entity with maximum F_i. -+ * @vtime: scheduler virtual time. -+ * @wsum: scheduler weight sum; active and idle entities contribute to it. -+ * -+ * Each service tree represents a B-WF2Q+ scheduler on its own. Each -+ * ioprio_class has its own independent scheduler, and so its own -+ * bfq_service_tree. All the fields are protected by the queue lock -+ * of the containing bfqd. -+ */ -+struct bfq_service_tree { -+ struct rb_root active; -+ struct rb_root idle; -+ -+ struct bfq_entity *first_idle; -+ struct bfq_entity *last_idle; -+ -+ u64 vtime; -+ unsigned long wsum; -+}; -+ -+/** -+ * struct bfq_sched_data - multi-class scheduler. -+ * @in_service_entity: entity in service. -+ * @next_in_service: head-of-the-line entity in the scheduler. -+ * @service_tree: array of service trees, one per ioprio_class. -+ * -+ * bfq_sched_data is the basic scheduler queue. It supports three -+ * ioprio_classes, and can be used either as a toplevel queue or as -+ * an intermediate queue on a hierarchical setup. 
-+ * @next_in_service points to the active entity of the sched_data -+ * service trees that will be scheduled next. -+ * -+ * The supported ioprio_classes are the same as in CFQ, in descending -+ * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -+ * Requests from higher priority queues are served before all the -+ * requests from lower priority queues; among requests of the same -+ * queue requests are served according to B-WF2Q+. -+ * All the fields are protected by the queue lock of the containing bfqd. -+ */ -+struct bfq_sched_data { -+ struct bfq_entity *in_service_entity; -+ struct bfq_entity *next_in_service; -+ struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+}; -+ -+/** -+ * struct bfq_weight_counter - counter of the number of all active entities -+ * with a given weight. -+ * @weight: weight of the entities that this counter refers to. -+ * @num_active: number of active entities with this weight. -+ * @weights_node: weights tree member (see bfq_data's @queue_weights_tree -+ * and @group_weights_tree). -+ */ -+struct bfq_weight_counter { -+ short int weight; -+ unsigned int num_active; -+ struct rb_node weights_node; -+}; -+ -+/** -+ * struct bfq_entity - schedulable entity. -+ * @rb_node: service_tree member. -+ * @weight_counter: pointer to the weight counter associated with this entity. -+ * @on_st: flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree). -+ * @finish: B-WF2Q+ finish timestamp (aka F_i). -+ * @start: B-WF2Q+ start timestamp (aka S_i). -+ * @tree: tree the entity is enqueued into; %NULL if not on a tree. -+ * @min_start: minimum start time of the (active) subtree rooted at -+ * this entity; used for O(log N) lookups into active trees. -+ * @service: service received during the last round of service. -+ * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. -+ * @weight: weight of the queue -+ * @parent: parent entity, for hierarchical scheduling. 
-+ * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the -+ * associated scheduler queue, %NULL on leaf nodes. -+ * @sched_data: the scheduler queue this entity belongs to. -+ * @ioprio: the ioprio in use. -+ * @new_weight: when a weight change is requested, the new weight value. -+ * @orig_weight: original weight, used to implement weight boosting -+ * @prio_changed: flag, true when the user requested a weight, ioprio or -+ * ioprio_class change. -+ * -+ * A bfq_entity is used to represent either a bfq_queue (leaf node in the -+ * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -+ * entity belongs to the sched_data of the parent group in the cgroup -+ * hierarchy. Non-leaf entities have also their own sched_data, stored -+ * in @my_sched_data. -+ * -+ * Each entity stores independently its priority values; this would -+ * allow different weights on different devices, but this -+ * functionality is not exported to userspace by now. Priorities and -+ * weights are updated lazily, first storing the new values into the -+ * new_* fields, then setting the @prio_changed flag. As soon as -+ * there is a transition in the entity state that allows the priority -+ * update to take place the effective and the requested priority -+ * values are synchronized. -+ * -+ * Unless cgroups are used, the weight value is calculated from the -+ * ioprio to export the same interface as CFQ. When dealing with -+ * ``well-behaved'' queues (i.e., queues that do not spend too much -+ * time to consume their budget and have true sequential behavior, and -+ * when there are no external factors breaking anticipation) the -+ * relative weights at each level of the cgroups hierarchy should be -+ * guaranteed. All the fields are protected by the queue lock of the -+ * containing bfqd. 
-+ */ -+struct bfq_entity { -+ struct rb_node rb_node; -+ struct bfq_weight_counter *weight_counter; -+ -+ int on_st; -+ -+ u64 finish; -+ u64 start; -+ -+ struct rb_root *tree; -+ -+ u64 min_start; -+ -+ int service, budget; -+ unsigned short weight, new_weight; -+ unsigned short orig_weight; -+ -+ struct bfq_entity *parent; -+ -+ struct bfq_sched_data *my_sched_data; -+ struct bfq_sched_data *sched_data; -+ -+ int prio_changed; -+}; -+ -+struct bfq_group; -+ -+/** -+ * struct bfq_queue - leaf schedulable entity. -+ * @ref: reference counter. -+ * @bfqd: parent bfq_data. -+ * @new_ioprio: when an ioprio change is requested, the new ioprio value. -+ * @ioprio_class: the ioprio_class in use. -+ * @new_ioprio_class: when an ioprio_class change is requested, the new -+ * ioprio_class value. -+ * @new_bfqq: shared bfq_queue if queue is cooperating with -+ * one or more other queues. -+ * @sort_list: sorted list of pending requests. -+ * @next_rq: if fifo isn't expired, next request to serve. -+ * @queued: nr of requests queued in @sort_list. -+ * @allocated: currently allocated requests. -+ * @meta_pending: pending metadata requests. -+ * @fifo: fifo list of requests in sort_list. -+ * @entity: entity representing this queue in the scheduler. -+ * @max_budget: maximum budget allowed from the feedback mechanism. -+ * @budget_timeout: budget expiration (in jiffies). -+ * @dispatched: number of requests on the dispatch list or inside driver. -+ * @flags: status flags. -+ * @bfqq_list: node for active/idle bfqq list inside our bfqd. -+ * @burst_list_node: node for the device's burst list. 
-+ * @seek_samples: number of seeks sampled -+ * @seek_total: sum of the distances of the seeks sampled -+ * @seek_mean: mean seek distance -+ * @last_request_pos: position of the last request enqueued -+ * @requests_within_timer: number of consecutive pairs of request completion -+ * and arrival, such that the queue becomes idle -+ * after the completion, but the next request arrives -+ * within an idle time slice; used only if the queue's -+ * IO_bound has been cleared. -+ * @pid: pid of the process owning the queue, used for logging purposes. -+ * @last_wr_start_finish: start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period -+ * @wr_cur_max_time: current max raising time for this queue -+ * @soft_rt_next_start: minimum time instant such that, only if a new -+ * request is enqueued after this time instant in an -+ * idle @bfq_queue with no outstanding requests, then -+ * the task associated with the queue it is deemed as -+ * soft real-time (see the comments to the function -+ * bfq_bfqq_softrt_next_start()) -+ * @last_idle_bklogged: time of the last transition of the @bfq_queue from -+ * idle to backlogged -+ * @service_from_backlogged: cumulative service received from the @bfq_queue -+ * since the last transition from idle to -+ * backlogged -+ * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the -+ * queue is shared -+ * -+ * A bfq_queue is a leaf request queue; it can be associated with an -+ * io_context or more, if it is async or shared between cooperating -+ * processes. @cgroup holds a reference to the cgroup, to be sure that it -+ * does not disappear while a bfqq still references it (mostly to avoid -+ * races between request issuing and task migration followed by cgroup -+ * destruction). -+ * All the fields are protected by the queue lock of the containing bfqd. 
-+ */ -+struct bfq_queue { -+ atomic_t ref; -+ struct bfq_data *bfqd; -+ -+ unsigned short ioprio, new_ioprio; -+ unsigned short ioprio_class, new_ioprio_class; -+ -+ /* fields for cooperating queues handling */ -+ struct bfq_queue *new_bfqq; -+ struct rb_node pos_node; -+ struct rb_root *pos_root; -+ -+ struct rb_root sort_list; -+ struct request *next_rq; -+ int queued[2]; -+ int allocated[2]; -+ int meta_pending; -+ struct list_head fifo; -+ -+ struct bfq_entity entity; -+ -+ int max_budget; -+ unsigned long budget_timeout; -+ -+ int dispatched; -+ -+ unsigned int flags; -+ -+ struct list_head bfqq_list; -+ -+ struct hlist_node burst_list_node; -+ -+ unsigned int seek_samples; -+ u64 seek_total; -+ sector_t seek_mean; -+ sector_t last_request_pos; -+ -+ unsigned int requests_within_timer; -+ -+ pid_t pid; -+ struct bfq_io_cq *bic; -+ -+ /* weight-raising fields */ -+ unsigned long wr_cur_max_time; -+ unsigned long soft_rt_next_start; -+ unsigned long last_wr_start_finish; -+ unsigned int wr_coeff; -+ unsigned long last_idle_bklogged; -+ unsigned long service_from_backlogged; -+}; -+ -+/** -+ * struct bfq_ttime - per process thinktime stats. -+ * @ttime_total: total process thinktime -+ * @ttime_samples: number of thinktime samples -+ * @ttime_mean: average process thinktime -+ */ -+struct bfq_ttime { -+ unsigned long last_end_request; -+ -+ unsigned long ttime_total; -+ unsigned long ttime_samples; -+ unsigned long ttime_mean; -+}; -+ -+/** -+ * struct bfq_io_cq - per (request_queue, io_context) structure. -+ * @icq: associated io_cq structure -+ * @bfqq: array of two process queues, the sync and the async -+ * @ttime: associated @bfq_ttime struct -+ * @ioprio: per (request_queue, blkcg) ioprio. -+ * @blkcg_id: id of the blkcg the related io_cq belongs to. 
-+ */ -+struct bfq_io_cq { -+ struct io_cq icq; /* must be the first member */ -+ struct bfq_queue *bfqq[2]; -+ struct bfq_ttime ttime; -+ int ioprio; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ uint64_t blkcg_id; /* the current blkcg ID */ -+#endif -+}; -+ -+enum bfq_device_speed { -+ BFQ_BFQD_FAST, -+ BFQ_BFQD_SLOW, -+}; -+ -+/** -+ * struct bfq_data - per device data structure. -+ * @queue: request queue for the managed device. -+ * @root_group: root bfq_group for the device. -+ * @active_numerous_groups: number of bfq_groups containing more than one -+ * active @bfq_entity. -+ * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues -+ * have the same weight. The tree contains one counter -+ * for each distinct weight associated to some active -+ * and not weight-raised @bfq_queue (see the comments to -+ * the functions bfq_weights_tree_[add|remove] for -+ * further details). -+ * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted -+ * by weight. Used to keep track of whether all -+ * @bfq_groups have the same weight. The tree contains -+ * one counter for each distinct weight associated to -+ * some active @bfq_group (see the comments to the -+ * functions bfq_weights_tree_[add|remove] for further -+ * details). -+ * @busy_queues: number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). -+ * @busy_in_flight_queues: number of @bfq_queues containing pending or -+ * in-flight requests, plus the @bfq_queue in -+ * service, even if idle but waiting for the -+ * possible arrival of its next sync request. This -+ * field is updated only if the device is rotational, -+ * but used only if the device is also NCQ-capable. 
-+ * The reason why the field is updated also for non- -+ * NCQ-capable rotational devices is related to the -+ * fact that the value of @hw_tag may be set also -+ * later than when busy_in_flight_queues may need to -+ * be incremented for the first time(s). Taking also -+ * this possibility into account, to avoid unbalanced -+ * increments/decrements, would imply more overhead -+ * than just updating busy_in_flight_queues -+ * regardless of the value of @hw_tag. -+ * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues -+ * (that is, seeky queues that expired -+ * for budget timeout at least once) -+ * containing pending or in-flight -+ * requests, including the in-service -+ * @bfq_queue if constantly seeky. This -+ * field is updated only if the device -+ * is rotational, but used only if the -+ * device is also NCQ-capable (see the -+ * comments to @busy_in_flight_queues). -+ * @wr_busy_queues: number of weight-raised busy @bfq_queues. -+ * @queued: number of queued requests. -+ * @rq_in_driver: number of requests dispatched and waiting for completion. -+ * @sync_flight: number of sync requests in the driver. -+ * @max_rq_in_driver: max number of reqs in driver in the last -+ * @hw_tag_samples completed requests. -+ * @hw_tag_samples: nr of samples used to calculate hw_tag. -+ * @hw_tag: flag set to one if the driver is showing a queueing behavior. -+ * @budgets_assigned: number of budgets assigned. -+ * @idle_slice_timer: timer set when idling for the next sequential request -+ * from the queue in service. -+ * @unplug_work: delayed work to restart dispatching on the request queue. -+ * @in_service_queue: bfq_queue in service. -+ * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. -+ * @last_position: on-disk position of the last served request. -+ * @last_budget_start: beginning of the last budget. -+ * @last_idling_start: beginning of the last idle slice. -+ * @peak_rate: peak transfer rate observed for a budget. 
-+ * @peak_rate_samples: number of samples used to calculate @peak_rate. -+ * @bfq_max_budget: maximum budget allotted to a bfq_queue before -+ * rescheduling. -+ * @active_list: list of all the bfq_queues active on the device. -+ * @idle_list: list of all the bfq_queues idle on the device. -+ * @bfq_fifo_expire: timeout for async/sync requests; when it expires -+ * requests are served in fifo order. -+ * @bfq_back_penalty: weight of backward seeks wrt forward ones. -+ * @bfq_back_max: maximum allowed backward seek. -+ * @bfq_slice_idle: maximum idling time. -+ * @bfq_user_max_budget: user-configured max budget value -+ * (0 for auto-tuning). -+ * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to -+ * async queues. -+ * @bfq_timeout: timeout for bfq_queues to consume their budget; used to -+ * to prevent seeky queues to impose long latencies to well -+ * behaved ones (this also implies that seeky queues cannot -+ * receive guarantees in the service domain; after a timeout -+ * they are charged for the whole allocated budget, to try -+ * to preserve a behavior reasonably fair among them, but -+ * without service-domain guarantees). -+ * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is -+ * no more granted any weight-raising. -+ * @bfq_failed_cooperations: number of consecutive failed cooperation -+ * chances after which weight-raising is restored -+ * to a queue subject to more than bfq_coop_thresh -+ * queue merges. -+ * @bfq_requests_within_timer: number of consecutive requests that must be -+ * issued within the idle time slice to set -+ * again idling to a queue which was marked as -+ * non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). 
-+ * @last_ins_in_burst: last time at which a queue entered the current -+ * burst of queues being activated shortly after -+ * each other; for more details about this and the -+ * following parameters related to a burst of -+ * activations, see the comments to the function -+ * @bfq_handle_burst. -+ * @bfq_burst_interval: reference time interval used to decide whether a -+ * queue has been activated shortly after -+ * @last_ins_in_burst. -+ * @burst_size: number of queues in the current burst of queue activations. -+ * @bfq_large_burst_thresh: maximum burst size above which the current -+ * queue-activation burst is deemed as 'large'. -+ * @large_burst: true if a large queue-activation burst is in progress. -+ * @burst_list: head of the burst list (as for the above fields, more details -+ * in the comments to the function bfq_handle_burst). -+ * @low_latency: if set to true, low-latency heuristics are enabled. -+ * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised -+ * queue is multiplied. -+ * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). -+ * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. -+ * @bfq_wr_min_idle_time: minimum idle period after which weight-raising -+ * may be reactivated for a queue (in jiffies). -+ * @bfq_wr_min_inter_arr_async: minimum period between request arrivals -+ * after which weight-raising may be -+ * reactivated for an already busy queue -+ * (in jiffies). -+ * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, -+ * sectors per seconds. -+ * @RT_prod: cached value of the product R*T used for computing the maximum -+ * duration of the weight raising automatically. -+ * @device_speed: device-speed class for the low-latency heuristic. -+ * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. -+ * -+ * All the fields are protected by the @queue lock. 
-+ */ -+struct bfq_data { -+ struct request_queue *queue; -+ -+ struct bfq_group *root_group; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ int active_numerous_groups; -+#endif -+ -+ struct rb_root queue_weights_tree; -+ struct rb_root group_weights_tree; -+ -+ int busy_queues; -+ int busy_in_flight_queues; -+ int const_seeky_busy_in_flight_queues; -+ int wr_busy_queues; -+ int queued; -+ int rq_in_driver; -+ int sync_flight; -+ -+ int max_rq_in_driver; -+ int hw_tag_samples; -+ int hw_tag; -+ -+ int budgets_assigned; -+ -+ struct timer_list idle_slice_timer; -+ struct work_struct unplug_work; -+ -+ struct bfq_queue *in_service_queue; -+ struct bfq_io_cq *in_service_bic; -+ -+ sector_t last_position; -+ -+ ktime_t last_budget_start; -+ ktime_t last_idling_start; -+ int peak_rate_samples; -+ u64 peak_rate; -+ int bfq_max_budget; -+ -+ struct list_head active_list; -+ struct list_head idle_list; -+ -+ unsigned int bfq_fifo_expire[2]; -+ unsigned int bfq_back_penalty; -+ unsigned int bfq_back_max; -+ unsigned int bfq_slice_idle; -+ u64 bfq_class_idle_last_service; -+ -+ int bfq_user_max_budget; -+ int bfq_max_budget_async_rq; -+ unsigned int bfq_timeout[2]; -+ -+ unsigned int bfq_coop_thresh; -+ unsigned int bfq_failed_cooperations; -+ unsigned int bfq_requests_within_timer; -+ -+ unsigned long last_ins_in_burst; -+ unsigned long bfq_burst_interval; -+ int burst_size; -+ unsigned long bfq_large_burst_thresh; -+ bool large_burst; -+ struct hlist_head burst_list; -+ -+ bool low_latency; -+ -+ /* parameters of the low_latency heuristics */ -+ unsigned int bfq_wr_coeff; -+ unsigned int bfq_wr_max_time; -+ unsigned int bfq_wr_rt_max_time; -+ unsigned int bfq_wr_min_idle_time; -+ unsigned long bfq_wr_min_inter_arr_async; -+ unsigned int bfq_wr_max_softrt_rate; -+ u64 RT_prod; -+ enum bfq_device_speed device_speed; -+ -+ struct bfq_queue oom_bfqq; -+}; -+ -+enum bfqq_state_flags { -+ BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_wait_request, /* 
waiting for a request */ -+ BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ -+ BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ -+ BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ -+ BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -+ BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ -+ BFQ_BFQQ_FLAG_IO_bound, /* -+ * bfqq has timed-out at least once -+ * having consumed at most 2/10 of -+ * its budget -+ */ -+ BFQ_BFQQ_FLAG_in_large_burst, /* -+ * bfqq activated in a large burst, -+ * see comments to bfq_handle_burst. -+ */ -+ BFQ_BFQQ_FLAG_constantly_seeky, /* -+ * bfqq has proved to be slow and -+ * seeky until budget timeout -+ */ -+ BFQ_BFQQ_FLAG_softrt_update, /* -+ * may need softrt-next-start -+ * update -+ */ -+}; -+ -+#define BFQ_BFQQ_FNS(name) \ -+static void bfq_mark_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags |= (1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static void bfq_clear_bfqq_##name(struct bfq_queue *bfqq) \ -+{ \ -+ (bfqq)->flags &= ~(1 << BFQ_BFQQ_FLAG_##name); \ -+} \ -+static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ -+{ \ -+ return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ -+} -+ -+BFQ_BFQQ_FNS(busy); -+BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(must_alloc); -+BFQ_BFQQ_FNS(fifo_expire); -+BFQ_BFQQ_FNS(idle_window); -+BFQ_BFQQ_FNS(sync); -+BFQ_BFQQ_FNS(budget_new); -+BFQ_BFQQ_FNS(IO_bound); -+BFQ_BFQQ_FNS(in_large_burst); -+BFQ_BFQQ_FNS(constantly_seeky); -+BFQ_BFQQ_FNS(softrt_update); -+#undef BFQ_BFQQ_FNS -+ -+/* Logging facilities. */ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+ -+/* Expiration reasons. 
*/ -+enum bfqq_expiration { -+ BFQ_BFQQ_TOO_IDLE = 0, /* -+ * queue has been idling for -+ * too long -+ */ -+ BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ -+ BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ -+ BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+}; -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ -+struct bfqg_stats { -+ /* total bytes transferred */ -+ struct blkg_rwstat service_bytes; -+ /* total IOs serviced, post merge */ -+ struct blkg_rwstat serviced; -+ /* number of ios merged */ -+ struct blkg_rwstat merged; -+ /* total time spent on device in ns, may not be accurate w/ queueing */ -+ struct blkg_rwstat service_time; -+ /* total time spent waiting in scheduler queue in ns */ -+ struct blkg_rwstat wait_time; -+ /* number of IOs queued up */ -+ struct blkg_rwstat queued; -+ /* total sectors transferred */ -+ struct blkg_stat sectors; -+ /* total disk time and nr sectors dispatched by this group */ -+ struct blkg_stat time; -+ /* time not charged to this cgroup */ -+ struct blkg_stat unaccounted_time; -+ /* sum of number of ios queued across all samples */ -+ struct blkg_stat avg_queue_size_sum; -+ /* count of samples taken for average */ -+ struct blkg_stat avg_queue_size_samples; -+ /* how many times this group has been removed from service tree */ -+ struct blkg_stat dequeue; -+ /* total time spent waiting for it to be assigned a timeslice. */ -+ struct blkg_stat group_wait_time; -+ /* time spent idling for this blkcg_gq */ -+ struct blkg_stat idle_time; -+ /* total time with empty current active q with other requests queued */ -+ struct blkg_stat empty_time; -+ /* fields after this shouldn't be cleared on stat reset */ -+ uint64_t start_group_wait_time; -+ uint64_t start_idle_time; -+ uint64_t start_empty_time; -+ uint16_t flags; -+}; -+ -+/* -+ * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 
-+ * -+ * @ps: @blkcg_policy_storage that this structure inherits -+ * @weight: weight of the bfq_group -+ */ -+struct bfq_group_data { -+ /* must be the first member */ -+ struct blkcg_policy_data pd; -+ -+ unsigned short weight; -+}; -+ -+/** -+ * struct bfq_group - per (device, cgroup) data structure. -+ * @entity: schedulable entity to insert into the parent group sched_data. -+ * @sched_data: own sched_data, to contain child entities (they may be -+ * both bfq_queues and bfq_groups). -+ * @bfqd: the bfq_data for the device this group acts upon. -+ * @async_bfqq: array of async queues for all the tasks belonging to -+ * the group, one queue per ioprio value per ioprio_class, -+ * except for the idle class that has only one queue. -+ * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). -+ * @my_entity: pointer to @entity, %NULL for the toplevel group; used -+ * to avoid too many special cases during group creation/ -+ * migration. -+ * @active_entities: number of active entities belonging to the group; -+ * unused for the root group. Used to know whether there -+ * are groups with more than one active @bfq_entity -+ * (see the comments to the function -+ * bfq_bfqq_must_not_expire()). -+ * -+ * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup -+ * there is a set of bfq_groups, each one collecting the lower-level -+ * entities belonging to the group that are acting on the same device. -+ * -+ * Locking works as follows: -+ * o @bfqd is protected by the queue lock, RCU is used to access it -+ * from the readers. -+ * o All the other fields are protected by the @bfqd queue lock. 
-+ */ -+struct bfq_group { -+ /* must be the first member */ -+ struct blkg_policy_data pd; -+ -+ struct bfq_entity entity; -+ struct bfq_sched_data sched_data; -+ -+ void *bfqd; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+ -+ struct bfq_entity *my_entity; -+ -+ int active_entities; -+ -+ struct bfqg_stats stats; -+ struct bfqg_stats dead_stats; /* stats pushed from dead children */ -+}; -+ -+#else -+struct bfq_group { -+ struct bfq_sched_data sched_data; -+ -+ struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; -+ struct bfq_queue *async_idle_bfqq; -+}; -+#endif -+ -+static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); -+ -+static struct bfq_service_tree * -+bfq_entity_service_tree(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sched_data = entity->sched_data; -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS; -+ -+ BUG_ON(idx >= BFQ_IOPRIO_CLASSES); -+ BUG_ON(sched_data == NULL); -+ -+ return sched_data->service_tree + idx; -+} -+ -+static struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync) -+{ -+ return bic->bfqq[is_sync]; -+} -+ -+static void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, -+ bool is_sync) -+{ -+ bic->bfqq[is_sync] = bfqq; -+} -+ -+static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) -+{ -+ return bic->icq.q->elevator->elevator_data; -+} -+ -+/** -+ * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. -+ * @ptr: a pointer to a bfqd. -+ * @flags: storage for the flags to be saved. -+ * -+ * This function allows bfqg->bfqd to be protected by the -+ * queue lock of the bfqd they reference; the pointer is dereferenced -+ * under RCU, so the storage for bfqd is assured to be safe as long -+ * as the RCU read side critical section does not end. 
After the -+ * bfqd->queue->queue_lock is taken the pointer is rechecked, to be -+ * sure that no other writer accessed it. If we raced with a writer, -+ * the function returns NULL, with the queue unlocked, otherwise it -+ * returns the dereferenced pointer, with the queue locked. -+ */ -+static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) -+{ -+ struct bfq_data *bfqd; -+ -+ rcu_read_lock(); -+ bfqd = rcu_dereference(*(struct bfq_data **)ptr); -+ -+ if (bfqd != NULL) { -+ spin_lock_irqsave(bfqd->queue->queue_lock, *flags); -+ if (ptr == NULL) -+ printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); -+ else if (*ptr == bfqd) -+ goto out; -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -+ } -+ -+ bfqd = NULL; -+out: -+ rcu_read_unlock(); -+ return bfqd; -+} -+ -+static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) -+{ -+ spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -+} -+ -+static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); -+static void bfq_put_queue(struct bfq_queue *bfqq); -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); -+static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -+ struct bio *bio, int is_sync, -+ struct bfq_io_cq *bic, gfp_t gfp_mask); -+static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -+ struct bfq_group *bfqg); -+static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); -+ -+#endif /* _BFQ_H */ --- -2.10.0 - diff --git a/patches/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch b/patches/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch deleted file mode 100644 index 28eeb1f..0000000 --- a/patches/0003-block-bfq-add-Early-Queue-Merge-EQM-to-BFQ-v7r11-for.patch +++ /dev/null @@ -1,1101 +0,0 @@ -From e4d9bed2dfdec562b23491e44602c89c4a2a5ea4 Mon Sep 17 00:00:00 2001 -From: Mauro 
Andreolini -Date: Sun, 6 Sep 2015 16:09:05 +0200 -Subject: [PATCH 3/4] block, bfq: add Early Queue Merge (EQM) to BFQ-v7r11 for - 4.10.0 - -A set of processes may happen to perform interleaved reads, i.e.,requests -whose union would give rise to a sequential read pattern. There are two -typical cases: in the first case, processes read fixed-size chunks of -data at a fixed distance from each other, while in the second case processes -may read variable-size chunks at variable distances. The latter case occurs -for example with QEMU, which splits the I/O generated by the guest into -multiple chunks, and lets these chunks be served by a pool of cooperating -processes, iteratively assigning the next chunk of I/O to the first -available process. CFQ uses actual queue merging for the first type of -rocesses, whereas it uses preemption to get a sequential read pattern out -of the read requests performed by the second type of processes. In the end -it uses two different mechanisms to achieve the same goal: boosting the -throughput with interleaved I/O. - -This patch introduces Early Queue Merge (EQM), a unified mechanism to get a -sequential read pattern with both types of processes. The main idea is -checking newly arrived requests against the next request of the active queue -both in case of actual request insert and in case of request merge. By doing -so, both the types of processes can be handled by just merging their queues. -EQM is then simpler and more compact than the pair of mechanisms used in -CFQ. - -Finally, EQM also preserves the typical low-latency properties of BFQ, by -properly restoring the weight-raising state of a queue when it gets back to -a non-merged state. 
- -Signed-off-by: Mauro Andreolini -Signed-off-by: Arianna Avanzini -Signed-off-by: Paolo Valente -Signed-off-by: Linus Walleij ---- - block/bfq-cgroup.c | 5 + - block/bfq-iosched.c | 685 +++++++++++++++++++++++++++++++++++++++++++++++++++- - block/bfq.h | 66 +++++ - 3 files changed, 743 insertions(+), 13 deletions(-) - -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -index 8b08a57..0367996 100644 ---- a/block/bfq-cgroup.c -+++ b/block/bfq-cgroup.c -@@ -440,6 +440,7 @@ static void bfq_pd_init(struct blkg_policy_data *pd) - */ - bfqg->bfqd = bfqd; - bfqg->active_entities = 0; -+ bfqg->rq_pos_tree = RB_ROOT; - } - - static void bfq_pd_free(struct blkg_policy_data *pd) -@@ -533,6 +534,9 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - return bfqg; - } - -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq); -+ - /** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. -@@ -580,6 +584,7 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqg_get(bfqg); - - if (busy) { -+ bfq_pos_tree_add_move(bfqd, bfqq); - if (resume) - bfq_activate_bfqq(bfqd, bfqq); - } -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index 85e2169..cf3e9b1 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -295,6 +295,72 @@ static struct request *bfq_choose_req(struct bfq_data *bfqd, - } - } - -+static struct bfq_queue * -+bfq_rq_pos_tree_lookup(struct bfq_data *bfqd, struct rb_root *root, -+ sector_t sector, struct rb_node **ret_parent, -+ struct rb_node ***rb_link) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *bfqq = NULL; -+ -+ parent = NULL; -+ p = &root->rb_node; -+ while (*p) { -+ struct rb_node **n; -+ -+ parent = *p; -+ bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ -+ /* -+ * Sort strictly based on sector. Smallest to the left, -+ * largest to the right. 
-+ */ -+ if (sector > blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_right; -+ else if (sector < blk_rq_pos(bfqq->next_rq)) -+ n = &(*p)->rb_left; -+ else -+ break; -+ p = n; -+ bfqq = NULL; -+ } -+ -+ *ret_parent = parent; -+ if (rb_link) -+ *rb_link = p; -+ -+ bfq_log(bfqd, "rq_pos_tree_lookup %llu: returning %d", -+ (unsigned long long) sector, -+ bfqq ? bfqq->pid : 0); -+ -+ return bfqq; -+} -+ -+static void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct rb_node **p, *parent; -+ struct bfq_queue *__bfqq; -+ -+ if (bfqq->pos_root) { -+ rb_erase(&bfqq->pos_node, bfqq->pos_root); -+ bfqq->pos_root = NULL; -+ } -+ -+ if (bfq_class_idle(bfqq)) -+ return; -+ if (!bfqq->next_rq) -+ return; -+ -+ bfqq->pos_root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, bfqq->pos_root, -+ blk_rq_pos(bfqq->next_rq), &parent, &p); -+ if (!__bfqq) { -+ rb_link_node(&bfqq->pos_node, parent, p); -+ rb_insert_color(&bfqq->pos_node, bfqq->pos_root); -+ } else -+ bfqq->pos_root = NULL; -+} -+ - /* - * Tell whether there are active queues or groups with differentiated weights. - */ -@@ -527,6 +593,57 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - return dur; - } - -+static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) -+{ -+ return bfqq->bic ? bfqq->bic->cooperations : 0; -+} -+ -+static void -+bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -+{ -+ if (bic->saved_idle_window) -+ bfq_mark_bfqq_idle_window(bfqq); -+ else -+ bfq_clear_bfqq_idle_window(bfqq); -+ if (bic->saved_IO_bound) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ else -+ bfq_clear_bfqq_IO_bound(bfqq); -+ /* Assuming that the flag in_large_burst is already correctly set */ -+ if (bic->wr_time_left && bfqq->bfqd->low_latency && -+ !bfq_bfqq_in_large_burst(bfqq) && -+ bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { -+ /* -+ * Start a weight raising period with the duration given by -+ * the raising_time_left snapshot. 
-+ */ -+ if (bfq_bfqq_busy(bfqq)) -+ bfqq->bfqd->wr_busy_queues++; -+ bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bic->wr_time_left; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->entity.prio_changed = 1; -+ } -+ /* -+ * Clear wr_time_left to prevent bfq_bfqq_save_state() from -+ * getting confused about the queue's need of a weight-raising -+ * period. -+ */ -+ bic->wr_time_left = 0; -+} -+ -+static int bfqq_process_refs(struct bfq_queue *bfqq) -+{ -+ int process_refs, io_refs; -+ -+ lockdep_assert_held(bfqq->bfqd->queue->queue_lock); -+ -+ io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -+ process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; -+ BUG_ON(process_refs < 0); -+ return process_refs; -+} -+ - /* Empty burst list and add just bfqq (see comments to bfq_handle_burst) */ - static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { -@@ -763,8 +880,14 @@ static void bfq_add_request(struct request *rq) - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; - -+ /* -+ * Adjust priority tree position, if next_rq changes. 
-+ */ -+ if (prev != bfqq->next_rq) -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ - if (!bfq_bfqq_busy(bfqq)) { -- bool soft_rt, in_burst, -+ bool soft_rt, coop_or_in_burst, - idle_for_long_time = time_is_before_jiffies( - bfqq->budget_timeout + - bfqd->bfq_wr_min_idle_time); -@@ -792,11 +915,12 @@ static void bfq_add_request(struct request *rq) - bfqd->last_ins_in_burst = jiffies; - } - -- in_burst = bfq_bfqq_in_large_burst(bfqq); -+ coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || -+ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; - soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -- !in_burst && -+ !coop_or_in_burst && - time_is_before_jiffies(bfqq->soft_rt_next_start); -- interactive = !in_burst && idle_for_long_time; -+ interactive = !coop_or_in_burst && idle_for_long_time; - entity->budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); - -@@ -815,6 +939,9 @@ static void bfq_add_request(struct request *rq) - if (!bfqd->low_latency) - goto add_bfqq_busy; - -+ if (bfq_bfqq_just_split(bfqq)) -+ goto set_prio_changed; -+ - /* - * If the queue: - * - is not being boosted, -@@ -839,7 +966,7 @@ static void bfq_add_request(struct request *rq) - } else if (old_wr_coeff > 1) { - if (interactive) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- else if (in_burst || -+ else if (coop_or_in_burst || - (bfqq->wr_cur_max_time == - bfqd->bfq_wr_rt_max_time && - !soft_rt)) { -@@ -904,6 +1031,7 @@ static void bfq_add_request(struct request *rq) - bfqd->bfq_wr_rt_max_time; - } - } -+set_prio_changed: - if (old_wr_coeff != bfqq->wr_coeff) - entity->prio_changed = 1; - add_bfqq_busy: -@@ -1046,6 +1174,15 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - bfqd->last_position); - BUG_ON(!next_rq); - bfqq->next_rq = next_rq; -+ /* -+ * If next_rq changes, update both the queue's budget to -+ * fit the new request and the queue's position in its -+ * rq_pos_tree. 
-+ */ -+ if (prev != bfqq->next_rq) { -+ bfq_updated_next_req(bfqd, bfqq); -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } - } - } - -@@ -1128,11 +1265,346 @@ static void bfq_end_wr(struct bfq_data *bfqd) - spin_unlock_irq(bfqd->queue->queue_lock); - } - -+static sector_t bfq_io_struct_pos(void *io_struct, bool request) -+{ -+ if (request) -+ return blk_rq_pos(io_struct); -+ else -+ return ((struct bio *)io_struct)->bi_iter.bi_sector; -+} -+ -+static int bfq_rq_close_to_sector(void *io_struct, bool request, -+ sector_t sector) -+{ -+ return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -+ BFQQ_SEEK_THR; -+} -+ -+static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ sector_t sector) -+{ -+ struct rb_root *root = &bfq_bfqq_to_bfqg(bfqq)->rq_pos_tree; -+ struct rb_node *parent, *node; -+ struct bfq_queue *__bfqq; -+ -+ if (RB_EMPTY_ROOT(root)) -+ return NULL; -+ -+ /* -+ * First, if we find a request starting at the end of the last -+ * request, choose it. -+ */ -+ __bfqq = bfq_rq_pos_tree_lookup(bfqd, root, sector, &parent, NULL); -+ if (__bfqq) -+ return __bfqq; -+ -+ /* -+ * If the exact sector wasn't found, the parent of the NULL leaf -+ * will contain the closest sector (rq_pos_tree sorted by -+ * next_request position). 
-+ */ -+ __bfqq = rb_entry(parent, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ if (blk_rq_pos(__bfqq->next_rq) < sector) -+ node = rb_next(&__bfqq->pos_node); -+ else -+ node = rb_prev(&__bfqq->pos_node); -+ if (!node) -+ return NULL; -+ -+ __bfqq = rb_entry(node, struct bfq_queue, pos_node); -+ if (bfq_rq_close_to_sector(__bfqq->next_rq, true, sector)) -+ return __bfqq; -+ -+ return NULL; -+} -+ -+static struct bfq_queue *bfq_find_close_cooperator(struct bfq_data *bfqd, -+ struct bfq_queue *cur_bfqq, -+ sector_t sector) -+{ -+ struct bfq_queue *bfqq; -+ -+ /* -+ * We shall notice if some of the queues are cooperating, -+ * e.g., working closely on the same area of the device. In -+ * that case, we can group them together and: 1) don't waste -+ * time idling, and 2) serve the union of their requests in -+ * the best possible order for throughput. -+ */ -+ bfqq = bfqq_find_close(bfqd, cur_bfqq, sector); -+ if (!bfqq || bfqq == cur_bfqq) -+ return NULL; -+ -+ return bfqq; -+} -+ -+static struct bfq_queue * -+bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ int process_refs, new_process_refs; -+ struct bfq_queue *__bfqq; -+ -+ /* -+ * If there are no process references on the new_bfqq, then it is -+ * unsafe to follow the ->new_bfqq chain as other bfqq's in the chain -+ * may have dropped their last reference (not just their last process -+ * reference). -+ */ -+ if (!bfqq_process_refs(new_bfqq)) -+ return NULL; -+ -+ /* Avoid a circular list and skip interim queue merges. */ -+ while ((__bfqq = new_bfqq->new_bfqq)) { -+ if (__bfqq == bfqq) -+ return NULL; -+ new_bfqq = __bfqq; -+ } -+ -+ process_refs = bfqq_process_refs(bfqq); -+ new_process_refs = bfqq_process_refs(new_bfqq); -+ /* -+ * If the process for the bfqq has gone away, there is no -+ * sense in merging the queues. 
-+ */ -+ if (process_refs == 0 || new_process_refs == 0) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "scheduling merge with queue %d", -+ new_bfqq->pid); -+ -+ /* -+ * Merging is just a redirection: the requests of the process -+ * owning one of the two queues are redirected to the other queue. -+ * The latter queue, in its turn, is set as shared if this is the -+ * first time that the requests of some process are redirected to -+ * it. -+ * -+ * We redirect bfqq to new_bfqq and not the opposite, because we -+ * are in the context of the process owning bfqq, hence we have -+ * the io_cq of this process. So we can immediately configure this -+ * io_cq to redirect the requests of the process to new_bfqq. -+ * -+ * NOTE, even if new_bfqq coincides with the in-service queue, the -+ * io_cq of new_bfqq is not available, because, if the in-service -+ * queue is shared, bfqd->in_service_bic may not point to the -+ * io_cq of the in-service queue. -+ * Redirecting the requests of the process owning bfqq to the -+ * currently in-service queue is in any case the best option, as -+ * we feed the in-service queue with new requests close to the -+ * last request served and, by doing so, hopefully increase the -+ * throughput. -+ */ -+ bfqq->new_bfqq = new_bfqq; -+ atomic_add(process_refs, &new_bfqq->ref); -+ return new_bfqq; -+} -+ -+static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, -+ struct bfq_queue *new_bfqq) -+{ -+ if (bfq_class_idle(bfqq) || bfq_class_idle(new_bfqq) || -+ (bfqq->ioprio_class != new_bfqq->ioprio_class)) -+ return false; -+ -+ /* -+ * If either of the queues has already been detected as seeky, -+ * then merging it with the other queue is unlikely to lead to -+ * sequential I/O. -+ */ -+ if (BFQQ_SEEKY(bfqq) || BFQQ_SEEKY(new_bfqq)) -+ return false; -+ -+ /* -+ * Interleaved I/O is known to be done by (some) applications -+ * only for reads, so it does not make sense to merge async -+ * queues. 
-+ */ -+ if (!bfq_bfqq_sync(bfqq) || !bfq_bfqq_sync(new_bfqq)) -+ return false; -+ -+ return true; -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service queue -+ * or with a close queue among the scheduled queues. -+ * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * structure otherwise. -+ * -+ * The OOM queue is not allowed to participate to cooperation: in fact, since -+ * the requests temporarily redirected to the OOM queue could be redirected -+ * again to dedicated queues at any time, the state needed to correctly -+ * handle merging with the OOM queue would be quite complex and expensive -+ * to maintain. Besides, in such a critical condition as an out of memory, -+ * the benefits of queue merging may be little relevant, or even negligible. -+ */ -+static struct bfq_queue * -+bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ void *io_struct, bool request) -+{ -+ struct bfq_queue *in_service_bfqq, *new_bfqq; -+ -+ if (bfqq->new_bfqq) -+ return bfqq->new_bfqq; -+ if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ return NULL; -+ /* If device has only one backlogged bfq_queue, don't search. */ -+ if (bfqd->busy_queues == 1) -+ return NULL; -+ -+ in_service_bfqq = bfqd->in_service_queue; -+ -+ if (!in_service_bfqq || in_service_bfqq == bfqq || -+ !bfqd->in_service_bic || -+ unlikely(in_service_bfqq == &bfqd->oom_bfqq)) -+ goto check_scheduled; -+ -+ if (bfq_rq_close_to_sector(io_struct, request, bfqd->last_position) && -+ bfqq->entity.parent == in_service_bfqq->entity.parent && -+ bfq_may_be_close_cooperator(bfqq, in_service_bfqq)) { -+ new_bfqq = bfq_setup_merge(bfqq, in_service_bfqq); -+ if (new_bfqq) -+ return new_bfqq; -+ } -+ /* -+ * Check whether there is a cooperator among currently scheduled -+ * queues. The only thing we need is that the bio/request is not -+ * NULL, as we need it to establish whether a cooperator exists. 
-+ */ -+check_scheduled: -+ new_bfqq = bfq_find_close_cooperator(bfqd, bfqq, -+ bfq_io_struct_pos(io_struct, request)); -+ -+ BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); -+ -+ if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ return bfq_setup_merge(bfqq, new_bfqq); -+ -+ return NULL; -+} -+ -+static void bfq_bfqq_save_state(struct bfq_queue *bfqq) -+{ -+ /* -+ * If !bfqq->bic, the queue is already shared or its requests -+ * have already been redirected to a shared queue; both idle window -+ * and weight raising state have already been saved. Do nothing. -+ */ -+ if (!bfqq->bic) -+ return; -+ if (bfqq->bic->wr_time_left) -+ /* -+ * This is the queue of a just-started process, and would -+ * deserve weight raising: we set wr_time_left to the full -+ * weight-raising duration to trigger weight-raising when -+ * and if the queue is split and the first request of the -+ * queue is enqueued. -+ */ -+ bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); -+ else if (bfqq->wr_coeff > 1) { -+ unsigned long wr_duration = -+ jiffies - bfqq->last_wr_start_finish; -+ /* -+ * It may happen that a queue's weight raising period lasts -+ * longer than its wr_cur_max_time, as weight raising is -+ * handled only when a request is enqueued or dispatched (it -+ * does not use any timer). If the weight raising period is -+ * about to end, don't save it. -+ */ -+ if (bfqq->wr_cur_max_time <= wr_duration) -+ bfqq->bic->wr_time_left = 0; -+ else -+ bfqq->bic->wr_time_left = -+ bfqq->wr_cur_max_time - wr_duration; -+ /* -+ * The bfq_queue is becoming shared or the requests of the -+ * process owning the queue are being redirected to a shared -+ * queue. Stop the weight raising period of the queue, as in -+ * both cases it should not be owned by an interactive or -+ * soft real-time application. 
-+ */ -+ bfq_bfqq_end_wr(bfqq); -+ } else -+ bfqq->bic->wr_time_left = 0; -+ bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bfqq->bic->cooperations++; -+ bfqq->bic->failed_cooperations = 0; -+} -+ -+static void bfq_get_bic_reference(struct bfq_queue *bfqq) -+{ -+ /* -+ * If bfqq->bic has a non-NULL value, the bic to which it belongs -+ * is about to begin using a shared bfq_queue. -+ */ -+ if (bfqq->bic) -+ atomic_long_inc(&bfqq->bic->icq.ioc->refcount); -+} -+ -+static void -+bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, -+ struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) -+{ -+ bfq_log_bfqq(bfqd, bfqq, "merging with queue %lu", -+ (unsigned long) new_bfqq->pid); -+ /* Save weight raising and idle window of the merged queues */ -+ bfq_bfqq_save_state(bfqq); -+ bfq_bfqq_save_state(new_bfqq); -+ if (bfq_bfqq_IO_bound(bfqq)) -+ bfq_mark_bfqq_IO_bound(new_bfqq); -+ bfq_clear_bfqq_IO_bound(bfqq); -+ /* -+ * Grab a reference to the bic, to prevent it from being destroyed -+ * before being possibly touched by a bfq_split_bfqq(). -+ */ -+ bfq_get_bic_reference(bfqq); -+ bfq_get_bic_reference(new_bfqq); -+ /* -+ * Merge queues (that is, let bic redirect its requests to new_bfqq) -+ */ -+ bic_set_bfqq(bic, new_bfqq, 1); -+ bfq_mark_bfqq_coop(new_bfqq); -+ /* -+ * new_bfqq now belongs to at least two bics (it is a shared queue): -+ * set new_bfqq->bic to NULL. bfqq either: -+ * - does not belong to any bic any more, and hence bfqq->bic must -+ * be set to NULL, or -+ * - is a queue whose owning bics have already been redirected to a -+ * different queue, hence the queue is destined to not belong to -+ * any bic soon and bfqq->bic is already NULL (therefore the next -+ * assignment causes no harm). 
-+ */ -+ new_bfqq->bic = NULL; -+ bfqq->bic = NULL; -+ bfq_put_queue(bfqq); -+} -+ -+static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) -+{ -+ struct bfq_io_cq *bic = bfqq->bic; -+ struct bfq_data *bfqd = bfqq->bfqd; -+ -+ if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { -+ bic->failed_cooperations++; -+ if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) -+ bic->cooperations = 0; -+ } -+} -+ - static int bfq_allow_merge(struct request_queue *q, struct request *rq, - struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_io_cq *bic; -+ struct bfq_queue *bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. -@@ -1149,7 +1621,26 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, - if (!bic) - return 0; - -- return bic_to_bfqq(bic, bfq_bio_sync(bio)) == RQ_BFQQ(rq); -+ bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ /* -+ * We take advantage of this function to perform an early merge -+ * of the queues of possible cooperating processes. -+ */ -+ if (bfqq) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, bio, false); -+ if (new_bfqq) { -+ bfq_merge_bfqqs(bfqd, bic, bfqq, new_bfqq); -+ /* -+ * If we get here, the bio will be queued in the -+ * shared queue, i.e., new_bfqq, so use new_bfqq -+ * to decide whether bio and rq can be merged. -+ */ -+ bfqq = new_bfqq; -+ } else -+ bfq_bfqq_increase_failed_cooperations(bfqq); -+ } -+ -+ return bfqq == RQ_BFQQ(rq); - } - - static void __bfq_set_in_service_queue(struct bfq_data *bfqd, -@@ -1350,6 +1841,15 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - __bfq_bfqd_reset_in_service(bfqd); - -+ /* -+ * If this bfqq is shared between multiple processes, check -+ * to make sure that those processes are still issuing I/Os -+ * within the mean seek distance. If not, it may be time to -+ * break the queues apart again. 
-+ */ -+ if (bfq_bfqq_coop(bfqq) && BFQQ_SEEKY(bfqq)) -+ bfq_mark_bfqq_split_coop(bfqq); -+ - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* - * Overloading budget_timeout field to store the time -@@ -1358,8 +1858,13 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - */ - bfqq->budget_timeout = jiffies; - bfq_del_bfqq_busy(bfqd, bfqq, 1); -- } else -+ } else { - bfq_activate_bfqq(bfqd, bfqq); -+ /* -+ * Resort priority tree of potential close cooperators. -+ */ -+ bfq_pos_tree_add_move(bfqd, bfqq); -+ } - } - - /** -@@ -2246,10 +2751,12 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* - * If the queue was activated in a burst, or - * too much time has elapsed from the beginning -- * of this weight-raising period, then end weight -- * raising. -+ * of this weight-raising period, or the queue has -+ * exceeded the acceptable number of cooperations, -+ * then end weight raising. - */ - if (bfq_bfqq_in_large_burst(bfqq) || -+ bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || - time_is_before_jiffies(bfqq->last_wr_start_finish + - bfqq->wr_cur_max_time)) { - bfqq->last_wr_start_finish = jiffies; -@@ -2478,6 +2985,25 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - #endif - } - -+static void bfq_put_cooperator(struct bfq_queue *bfqq) -+{ -+ struct bfq_queue *__bfqq, *next; -+ -+ /* -+ * If this queue was scheduled to merge with another queue, be -+ * sure to drop the reference taken on that queue (and others in -+ * the merge chain). See bfq_setup_merge and bfq_merge_bfqqs. 
-+ */ -+ __bfqq = bfqq->new_bfqq; -+ while (__bfqq) { -+ if (__bfqq == bfqq) -+ break; -+ next = __bfqq->new_bfqq; -+ bfq_put_queue(__bfqq); -+ __bfqq = next; -+ } -+} -+ - static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - if (bfqq == bfqd->in_service_queue) { -@@ -2488,6 +3014,8 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, - atomic_read(&bfqq->ref)); - -+ bfq_put_cooperator(bfqq); -+ - bfq_put_queue(bfqq); - } - -@@ -2496,6 +3024,25 @@ static void bfq_init_icq(struct io_cq *icq) - struct bfq_io_cq *bic = icq_to_bic(icq); - - bic->ttime.last_end_request = jiffies; -+ /* -+ * A newly created bic indicates that the process has just -+ * started doing I/O, and is probably mapping into memory its -+ * executable and libraries: it definitely needs weight raising. -+ * There is however the possibility that the process performs, -+ * for a while, I/O close to some other process. EQM intercepts -+ * this behavior and may merge the queue corresponding to the -+ * process with some other queue, BEFORE the weight of the queue -+ * is raised. Merged queues are not weight-raised (they are assumed -+ * to belong to processes that benefit only from high throughput). -+ * If the merge is basically the consequence of an accident, then -+ * the queue will be split soon and will get back its old weight. -+ * It is then important to write down somewhere that this queue -+ * does need weight raising, even if it did not make it to get its -+ * weight raised before being merged. To this purpose, we overload -+ * the field raising_time_left and assign 1 to it, to mark the queue -+ * as needing weight raising. 
-+ */ -+ bic->wr_time_left = 1; - } - - static void bfq_exit_icq(struct io_cq *icq) -@@ -2509,6 +3056,13 @@ static void bfq_exit_icq(struct io_cq *icq) - } - - if (bic->bfqq[BLK_RW_SYNC]) { -+ /* -+ * If the bic is using a shared queue, put the reference -+ * taken on the io_context when the bic started using a -+ * shared bfq_queue. -+ */ -+ if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) -+ put_io_context(icq->ioc); - bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); - bic->bfqq[BLK_RW_SYNC] = NULL; - } -@@ -2814,6 +3368,10 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - if (!bfq_bfqq_sync(bfqq) || bfq_class_idle(bfqq)) - return; - -+ /* Idle window just restored, statistics are meaningless. */ -+ if (bfq_bfqq_just_split(bfqq)) -+ return; -+ - enable_idle = bfq_bfqq_idle_window(bfqq); - - if (atomic_read(&bic->icq.ioc->active_ref) == 0 || -@@ -2861,6 +3419,7 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); -+ bfq_clear_bfqq_just_split(bfqq); - - bfq_log_bfqq(bfqd, bfqq, - "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -@@ -2925,12 +3484,47 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static void bfq_insert_request(struct request_queue *q, struct request *rq) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ struct bfq_queue *bfqq = RQ_BFQQ(rq), *new_bfqq; - - assert_spin_locked(bfqd->queue->queue_lock); - -+ /* -+ * An unplug may trigger a requeue of a request from the device -+ * driver: make sure we are in process context while trying to -+ * merge two bfq_queues. 
-+ */ -+ if (!in_interrupt()) { -+ new_bfqq = bfq_setup_cooperator(bfqd, bfqq, rq, true); -+ if (new_bfqq) { -+ if (bic_to_bfqq(RQ_BIC(rq), 1) != bfqq) -+ new_bfqq = bic_to_bfqq(RQ_BIC(rq), 1); -+ /* -+ * Release the request's reference to the old bfqq -+ * and make sure one is taken to the shared queue. -+ */ -+ new_bfqq->allocated[rq_data_dir(rq)]++; -+ bfqq->allocated[rq_data_dir(rq)]--; -+ atomic_inc(&new_bfqq->ref); -+ bfq_put_queue(bfqq); -+ if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) -+ bfq_merge_bfqqs(bfqd, RQ_BIC(rq), -+ bfqq, new_bfqq); -+ rq->elv.priv[1] = new_bfqq; -+ bfqq = new_bfqq; -+ } else -+ bfq_bfqq_increase_failed_cooperations(bfqq); -+ } -+ - bfq_add_request(rq); - -+ /* -+ * Here a newly-created bfq_queue has already started a weight-raising -+ * period: clear raising_time_left to prevent bfq_bfqq_save_state() -+ * from assigning it a full weight-raising period. See the detailed -+ * comments about this field in bfq_init_icq(). -+ */ -+ if (bfqq->bic) -+ bfqq->bic->wr_time_left = 0; - rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - -@@ -3099,6 +3693,32 @@ static void bfq_put_request(struct request *rq) - } - - /* -+ * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -+ * was the last process referring to said bfqq. -+ */ -+static struct bfq_queue * -+bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -+{ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "splitting queue"); -+ -+ put_io_context(bic->icq.ioc); -+ -+ if (bfqq_process_refs(bfqq) == 1) { -+ bfqq->pid = current->pid; -+ bfq_clear_bfqq_coop(bfqq); -+ bfq_clear_bfqq_split_coop(bfqq); -+ return bfqq; -+ } -+ -+ bic_set_bfqq(bic, NULL, 1); -+ -+ bfq_put_cooperator(bfqq); -+ -+ bfq_put_queue(bfqq); -+ return NULL; -+} -+ -+/* - * Allocate bfq data structures associated with this request. 
- */ - static int bfq_set_request(struct request_queue *q, struct request *rq, -@@ -3110,6 +3730,7 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - const int is_sync = rq_is_sync(rq); - struct bfq_queue *bfqq; - unsigned long flags; -+ bool split = false; - - might_sleep_if(gfpflags_allow_blocking(gfp_mask)); - -@@ -3122,15 +3743,30 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - - bfq_bic_update_cgroup(bic, bio); - -+new_queue: - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { - bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); - bic_set_bfqq(bic, bfqq, is_sync); -- if (is_sync) { -- if (bfqd->large_burst) -+ if (split && is_sync) { -+ if ((bic->was_in_burst_list && bfqd->large_burst) || -+ bic->saved_in_large_burst) - bfq_mark_bfqq_in_large_burst(bfqq); -- else -+ else { - bfq_clear_bfqq_in_large_burst(bfqq); -+ if (bic->was_in_burst_list) -+ hlist_add_head(&bfqq->burst_list_node, -+ &bfqd->burst_list); -+ } -+ } -+ } else { -+ /* If the queue was seeky for too long, break it apart. */ -+ if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { -+ bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ bfqq = bfq_split_bfqq(bic, bfqq); -+ split = true; -+ if (!bfqq) -+ goto new_queue; - } - } - -@@ -3142,6 +3778,26 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; - -+ /* -+ * If a bfq_queue has only one process reference, it is owned -+ * by only one bfq_io_cq: we can set the bic field of the -+ * bfq_queue to the address of that structure. Also, if the -+ * queue has just been split, mark a flag so that the -+ * information is available to the other scheduler hooks. 
-+ */ -+ if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { -+ bfqq->bic = bic; -+ if (split) { -+ bfq_mark_bfqq_just_split(bfqq); -+ /* -+ * If the queue has just been split from a shared -+ * queue, restore the idle window and the possible -+ * weight raising period. -+ */ -+ bfq_bfqq_resume_state(bfqq, bic); -+ } -+ } -+ - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -@@ -3295,6 +3951,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - root_group->my_entity = NULL; - root_group->bfqd = bfqd; - #endif -+ root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; - } -@@ -3375,6 +4032,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; - bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; - -+ bfqd->bfq_coop_thresh = 2; -+ bfqd->bfq_failed_cooperations = 7000; - bfqd->bfq_requests_within_timer = 120; - - bfqd->bfq_large_burst_thresh = 11; -diff --git a/block/bfq.h b/block/bfq.h -index 2bf54ae..fcce855 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -183,6 +183,8 @@ struct bfq_group; - * ioprio_class value. - * @new_bfqq: shared bfq_queue if queue is cooperating with - * one or more other queues. -+ * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). -+ * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). - * @sort_list: sorted list of pending requests. - * @next_rq: if fifo isn't expired, next request to serve. - * @queued: nr of requests queued in @sort_list. -@@ -304,6 +306,26 @@ struct bfq_ttime { - * @ttime: associated @bfq_ttime struct - * @ioprio: per (request_queue, blkcg) ioprio. - * @blkcg_id: id of the blkcg the related io_cq belongs to. 
-+ * @wr_time_left: snapshot of the time left before weight raising ends -+ * for the sync queue associated to this process; this -+ * snapshot is taken to remember this value while the weight -+ * raising is suspended because the queue is merged with a -+ * shared queue, and is used to set @raising_cur_max_time -+ * when the queue is split from the shared queue and its -+ * weight is raised again -+ * @saved_idle_window: same purpose as the previous field for the idle -+ * window -+ * @saved_IO_bound: same purpose as the previous two fields for the I/O -+ * bound classification of a queue -+ * @saved_in_large_burst: same purpose as the previous fields for the -+ * value of the field keeping the queue's belonging -+ * to a large burst -+ * @was_in_burst_list: true if the queue belonged to a burst list -+ * before its merge with another cooperating queue -+ * @cooperations: counter of consecutive successful queue merges underwent -+ * by any of the process' @bfq_queues -+ * @failed_cooperations: counter of consecutive failed queue merges of any -+ * of the process' @bfq_queues - */ - struct bfq_io_cq { - struct io_cq icq; /* must be the first member */ -@@ -314,6 +336,16 @@ struct bfq_io_cq { - #ifdef CONFIG_BFQ_GROUP_IOSCHED - uint64_t blkcg_id; /* the current blkcg ID */ - #endif -+ -+ unsigned int wr_time_left; -+ bool saved_idle_window; -+ bool saved_IO_bound; -+ -+ bool saved_in_large_burst; -+ bool was_in_burst_list; -+ -+ unsigned int cooperations; -+ unsigned int failed_cooperations; - }; - - enum bfq_device_speed { -@@ -557,6 +589,9 @@ enum bfqq_state_flags { - * may need softrt-next-start - * update - */ -+ BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -+ BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ -+ BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ - }; - - #define BFQ_BFQQ_FNS(name) \ -@@ -583,6 +618,9 @@ BFQ_BFQQ_FNS(budget_new); - BFQ_BFQQ_FNS(IO_bound); - BFQ_BFQQ_FNS(in_large_burst); - BFQ_BFQQ_FNS(constantly_seeky); 
-+BFQ_BFQQ_FNS(coop); -+BFQ_BFQQ_FNS(split_coop); -+BFQ_BFQQ_FNS(just_split); - BFQ_BFQQ_FNS(softrt_update); - #undef BFQ_BFQQ_FNS - -@@ -675,6 +713,9 @@ struct bfq_group_data { - * are groups with more than one active @bfq_entity - * (see the comments to the function - * bfq_bfqq_must_not_expire()). -+ * @rq_pos_tree: rbtree sorted by next_request position, used when -+ * determining if two or more queues have interleaving -+ * requests (see bfq_find_close_cooperator()). - * - * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup - * there is a set of bfq_groups, each one collecting the lower-level -@@ -701,6 +742,8 @@ struct bfq_group { - - int active_entities; - -+ struct rb_root rq_pos_tree; -+ - struct bfqg_stats stats; - struct bfqg_stats dead_stats; /* stats pushed from dead children */ - }; -@@ -711,6 +754,8 @@ struct bfq_group { - - struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; - struct bfq_queue *async_idle_bfqq; -+ -+ struct rb_root rq_pos_tree; - }; - #endif - -@@ -787,6 +832,27 @@ static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) - spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); - } - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *group_entity = bfqq->entity.parent; -+ -+ if (!group_entity) -+ group_entity = &bfqq->bfqd->root_group->entity; -+ -+ return container_of(group_entity, struct bfq_group, entity); -+} -+ -+#else -+ -+static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -+{ -+ return bfqq->bfqd->root_group; -+} -+ -+#endif -+ - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); - static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); --- -2.10.0 - diff --git a/patches/0004-Turn-BFQ-v7r11-for-4.10.0-into-BFQ-v8r8-for-4.10.0.patch 
b/patches/0004-Turn-BFQ-v7r11-for-4.10.0-into-BFQ-v8r8-for-4.10.0.patch deleted file mode 100644 index 48e64d9..0000000 --- a/patches/0004-Turn-BFQ-v7r11-for-4.10.0-into-BFQ-v8r8-for-4.10.0.patch +++ /dev/null @@ -1,9187 +0,0 @@ -From b782bbfcb5e08e92c0448d0c6a870b44db198837 Mon Sep 17 00:00:00 2001 -From: Paolo Valente -Date: Mon, 16 May 2016 11:16:17 +0200 -Subject: [PATCH 4/4] Turn BFQ-v7r11 for 4.10.0 into BFQ-v8r8 for 4.10.0 - -Signed-off-by: Paolo Valente ---- - Documentation/block/00-INDEX | 2 + - Documentation/block/bfq-iosched.txt | 530 ++++++ - block/Kconfig.iosched | 18 +- - block/bfq-cgroup.c | 510 +++--- - block/bfq-iosched.c | 3414 ++++++++++++++++++++++------------- - block/bfq-sched.c | 1290 ++++++++++--- - block/bfq.h | 800 ++++---- - 7 files changed, 4390 insertions(+), 2174 deletions(-) - create mode 100644 Documentation/block/bfq-iosched.txt - -diff --git a/Documentation/block/00-INDEX b/Documentation/block/00-INDEX -index e55103a..8d55b4b 100644 ---- a/Documentation/block/00-INDEX -+++ b/Documentation/block/00-INDEX -@@ -1,5 +1,7 @@ - 00-INDEX - - This file -+bfq-iosched.txt -+ - BFQ IO scheduler and its tunables - biodoc.txt - - Notes on the Generic Block Layer Rewrite in Linux 2.5 - biovecs.txt -diff --git a/Documentation/block/bfq-iosched.txt b/Documentation/block/bfq-iosched.txt -new file mode 100644 -index 0000000..13b5248 ---- /dev/null -+++ b/Documentation/block/bfq-iosched.txt -@@ -0,0 +1,530 @@ -+BFQ (Budget Fair Queueing) -+========================== -+ -+BFQ is a proportional-share I/O scheduler, with some extra -+low-latency capabilities. In addition to cgroups support (blkio or io -+controllers), BFQ's main features are: -+- BFQ guarantees a high system and application responsiveness, and a -+ low latency for time-sensitive applications, such as audio or video -+ players; -+- BFQ distributes bandwidth, and not just time, among processes or -+ groups (switching back to time distribution when needed to keep -+ throughput high). 
-+ -+On average CPUs, the current version of BFQ can handle devices -+performing at most ~30K IOPS; at most ~50 KIOPS on faster CPUs. As a -+reference, 30-50 KIOPS correspond to very high bandwidths with -+sequential I/O (e.g., 8-12 GB/s if I/O requests are 256 KB large), and -+to 120-200 MB/s with 4KB random I/O. -+ -+The table of contents follow. Impatients can just jump to Section 3. -+ -+CONTENTS -+ -+1. When may BFQ be useful? -+ 1-1 Personal systems -+ 1-2 Server systems -+2. How does BFQ work? -+3. What are BFQ's tunable? -+4. BFQ group scheduling -+ 4-1 Service guarantees provided -+ 4-2 Interface -+ -+1. When may BFQ be useful? -+========================== -+ -+BFQ provides the following benefits on personal and server systems. -+ -+1-1 Personal systems -+-------------------- -+ -+Low latency for interactive applications -+ -+Regardless of the actual background workload, BFQ guarantees that, for -+interactive tasks, the storage device is virtually as responsive as if -+it was idle. For example, even if one or more of the following -+background workloads are being executed: -+- one or more large files are being read, written or copied, -+- a tree of source files is being compiled, -+- one or more virtual machines are performing I/O, -+- a software update is in progress, -+- indexing daemons are scanning filesystems and updating their -+ databases, -+starting an application or loading a file from within an application -+takes about the same time as if the storage device was idle. As a -+comparison, with CFQ, NOOP or DEADLINE, and in the same conditions, -+applications experience high latencies, or even become unresponsive -+until the background workload terminates (also on SSDs). -+ -+Low latency for soft real-time applications -+ -+Also soft real-time applications, such as audio and video -+players/streamers, enjoy a low latency and a low drop rate, regardless -+of the background I/O workload. 
As a consequence, these applications -+suffer almost no glitches due to the background workload. -+ -+Higher speed for code-development tasks -+ -+If some additional workload happens to be executed in parallel, then -+BFQ executes the I/O-related components of typical code-development -+tasks (compilation, checkout, merge, ...) much more quickly than CFQ, -+NOOP or DEADLINE. -+ -+High throughput -+ -+On hard disks, BFQ achieves up to 30% higher throughput than CFQ, and -+up to 150% higher throughput than DEADLINE and NOOP, with all the -+sequential workloads considered in our tests. With random workloads, -+and with all the workloads on flash-based devices, BFQ achieves, -+instead, about the same throughput as the other schedulers. -+ -+Strong fairness, bandwidth and delay guarantees -+ -+BFQ distributes the device throughput, and not just the device time, -+among I/O-bound applications in proportion to their weights, with any -+workload and regardless of the device parameters. From these bandwidth -+guarantees, it is possible to compute tight per-I/O-request delay -+guarantees by a simple formula. If not configured for strict service -+guarantees, BFQ switches to time-based resource sharing (only) for -+applications that would otherwise cause a throughput loss. -+ -+1-2 Server systems -+------------------ -+ -+Most benefits for server systems follow from the same service -+properties as above. In particular, regardless of whether additional, -+possibly heavy workloads are being served, BFQ guarantees: -+ -+. audio and video-streaming with zero or very low jitter and drop -+ rate; -+ -+. fast retrieval of WEB pages and embedded objects; -+ -+. real-time recording of data in live-dumping applications (e.g., -+ packet logging); -+ -+. responsiveness in local and remote access to a server. -+ -+ -+2. How does BFQ work? -+===================== -+ -+BFQ is a proportional-share I/O scheduler, whose general structure, -+plus a lot of code, are borrowed from CFQ.
-+ -+- Each process doing I/O on a device is associated with a weight and a -+ (bfq_)queue. -+ -+- BFQ grants exclusive access to the device, for a while, to one queue -+ (process) at a time, and implements this service model by -+ associating every queue with a budget, measured in number of -+ sectors. -+ -+ - After a queue is granted access to the device, the budget of the -+ queue is decremented, on each request dispatch, by the size of the -+ request. -+ -+ - The in-service queue is expired, i.e., its service is suspended, -+ only if one of the following events occurs: 1) the queue finishes -+ its budget, 2) the queue empties, 3) a "budget timeout" fires. -+ -+ - The budget timeout prevents processes doing random I/O from -+ holding the device for too long and dramatically reducing -+ throughput. -+ -+ - Actually, as in CFQ, a queue associated with a process issuing -+ sync requests may not be expired immediately when it empties. In -+ contrast, BFQ may idle the device for a short time interval, -+ giving the process the chance to go on being served if it issues -+ a new request in time. Device idling typically boosts the -+ throughput on rotational devices, if processes do synchronous -+ and sequential I/O. In addition, under BFQ, device idling is -+ also instrumental in guaranteeing the desired throughput -+ fraction to processes issuing sync requests (see the description -+ of the slice_idle tunable in this document, or [1, 2], for more -+ details). -+ -+ - With respect to idling for service guarantees, if several -+ processes are competing for the device at the same time, but -+ all processes (and groups, after the following commit) have -+ the same weight, then BFQ guarantees the expected throughput -+ distribution without ever idling the device. Throughput is -+ thus as high as possible in this common scenario. 
-+ -+ - If low-latency mode is enabled (default configuration), BFQ -+ executes some special heuristics to detect interactive and soft -+ real-time applications (e.g., video or audio players/streamers), -+ and to reduce their latency. The most important action taken to -+ achieve this goal is to give to the queues associated with these -+ applications more than their fair share of the device -+ throughput. For brevity, we call just "weight-raising" the whole -+ sets of actions taken by BFQ to privilege these queues. In -+ particular, BFQ provides a milder form of weight-raising for -+ interactive applications, and a stronger form for soft real-time -+ applications. -+ -+ - BFQ automatically deactivates idling for queues born in a burst of -+ queue creations. In fact, these queues are usually associated with -+ the processes of applications and services that benefit mostly -+ from a high throughput. Examples are systemd during boot, or git -+ grep. -+ -+ - As CFQ, BFQ merges queues performing interleaved I/O, i.e., -+ performing random I/O that becomes mostly sequential if -+ merged. Differently from CFQ, BFQ achieves this goal with a more -+ reactive mechanism, called Early Queue Merge (EQM). EQM is so -+ responsive in detecting interleaved I/O (cooperating processes), -+ that it enables BFQ to achieve a high throughput, by queue -+ merging, even for queues for which CFQ needs a different -+ mechanism, preemption, to get a high throughput. As such EQM is a -+ unified mechanism to achieve a high throughput with interleaved -+ I/O. -+ -+ - Queues are scheduled according to a variant of WF2Q+, named -+ B-WF2Q+, and implemented using an augmented rb-tree to preserve an -+ O(log N) overall complexity. See [2] for more details. B-WF2Q+ is -+ also ready for hierarchical scheduling. However, for a cleaner -+ logical breakdown, the code that enables and completes -+ hierarchical support is provided in the next commit, which focuses -+ exactly on this feature. 
-+ -+ - B-WF2Q+ guarantees a tight deviation with respect to an ideal, -+ perfectly fair, and smooth service. In particular, B-WF2Q+ -+ guarantees that each queue receives a fraction of the device -+ throughput proportional to its weight, even if the throughput -+ fluctuates, and regardless of: the device parameters, the current -+ workload and the budgets assigned to the queue. -+ -+ - The last, budget-independence, property (although probably -+ counterintuitive in the first place) is definitely beneficial, for -+ the following reasons: -+ -+ - First, with any proportional-share scheduler, the maximum -+ deviation with respect to an ideal service is proportional to -+ the maximum budget (slice) assigned to queues. As a consequence, -+ BFQ can keep this deviation tight not only because of the -+ accurate service of B-WF2Q+, but also because BFQ *does not* -+ need to assign a larger budget to a queue to let the queue -+ receive a higher fraction of the device throughput. -+ -+ - Second, BFQ is free to choose, for every process (queue), the -+ budget that best fits the needs of the process, or best -+ leverages the I/O pattern of the process. In particular, BFQ -+ updates queue budgets with a simple feedback-loop algorithm that -+ allows a high throughput to be achieved, while still providing -+ tight latency guarantees to time-sensitive applications. When -+ the in-service queue expires, this algorithm computes the next -+ budget of the queue so as to: -+ -+ - Let large budgets be eventually assigned to the queues -+ associated with I/O-bound applications performing sequential -+ I/O: in fact, the longer these applications are served once -+ got access to the device, the higher the throughput is. 
-+ -+ - Let small budgets be eventually assigned to the queues -+ associated with time-sensitive applications (which typically -+ perform sporadic and short I/O), because, the smaller the -+ budget assigned to a queue waiting for service is, the sooner -+ B-WF2Q+ will serve that queue (Subsec 3.3 in [2]). -+ -+- If several processes are competing for the device at the same time, -+ but all processes and groups have the same weight, then BFQ -+ guarantees the expected throughput distribution without ever idling -+ the device. It uses preemption instead. Throughput is then much -+ higher in this common scenario. -+ -+- ioprio classes are served in strict priority order, i.e., -+ lower-priority queues are not served as long as there are -+ higher-priority queues. Among queues in the same class, the -+ bandwidth is distributed in proportion to the weight of each -+ queue. A very thin extra bandwidth is however guaranteed to -+ the Idle class, to prevent it from starving. -+ -+ -+3. What are BFQ's tunable? -+========================== -+ -+The tunables back_seek_max, back_seek_penalty, fifo_expire_async and -+fifo_expire_sync below are the same as in CFQ. Their descriptions are -+just copied from those for CFQ. Some considerations in the description -+of slice_idle are copied from CFQ too. -+ -+per-process ioprio and weight -+----------------------------- -+ -+Unless the cgroups interface is used (see "4. BFQ group scheduling"), -+weights can be assigned to processes only indirectly, through I/O -+priorities, and according to the relation: -+weight = (IOPRIO_BE_NR - ioprio) * 10. -+ -+Beware that, if low_latency is set, then BFQ automatically raises the -+weight of the queues associated with interactive and soft real-time -+applications. Unset this tunable if you need/want to control weights. -+ -+slice_idle -+---------- -+ -+This parameter specifies how long BFQ should idle waiting for the next I/O -+request, when certain sync BFQ queues become empty.
By default -+slice_idle is a non-zero value. Idling has a double purpose: boosting -+throughput and making sure that the desired throughput distribution is -+respected (see the description of how BFQ works, and, if needed, the -+papers referred there). -+ -+As for throughput, idling can be very helpful on highly seeky media -+like single spindle SATA/SAS disks where we can cut down on overall -+number of seeks and see improved throughput. -+ -+Setting slice_idle to 0 will remove all the idling on queues and one -+should see an overall improved throughput on faster storage devices -+like multiple SATA/SAS disks in hardware RAID configuration. -+ -+So depending on storage and workload, it might be useful to set -+slice_idle=0. In general for SATA/SAS disks and software RAID of -+SATA/SAS disks keeping slice_idle enabled should be useful. For any -+configurations where there are multiple spindles behind single LUN -+(Host based hardware RAID controller or for storage arrays), setting -+slice_idle=0 might end up in better throughput and acceptable -+latencies. -+ -+Idling is however necessary to have service guarantees enforced in -+case of differentiated weights or differentiated I/O-request lengths. -+To see why, suppose that a given BFQ queue A must get several I/O -+requests served for each request served for another queue B. Idling -+ensures that, if A makes a new I/O request slightly after becoming -+empty, then no request of B is dispatched in the middle, and thus A -+does not lose the possibility to get more than one request dispatched -+before the next request of B is dispatched. Note that idling -+guarantees the desired differentiated treatment of queues only in -+terms of I/O-request dispatches. To guarantee that the actual service -+order then corresponds to the dispatch order, the strict_guarantees -+tunable must be set too. 
-+ -+There is an important flipside for idling: apart from the above cases -+where it is beneficial also for throughput, idling can severely impact -+throughput. One important case is random workload. Because of this -+issue, BFQ tends to avoid idling as much as possible, when it is not -+beneficial also for throughput. As a consequence of this behavior, and -+of further issues described for the strict_guarantees tunable, -+short-term service guarantees may be occasionally violated. And, in -+some cases, these guarantees may be more important than guaranteeing -+maximum throughput. For example, in video playing/streaming, a very -+low drop rate may be more important than maximum throughput. In these -+cases, consider setting the strict_guarantees parameter. -+ -+strict_guarantees -+----------------- -+ -+If this parameter is set (default: unset), then BFQ -+ -+- always performs idling when the in-service queue becomes empty; -+ -+- forces the device to serve one I/O request at a time, by dispatching a -+ new request only if there is no outstanding request. -+ -+In the presence of differentiated weights or I/O-request sizes, both -+the above conditions are needed to guarantee that every BFQ queue -+receives its allotted share of the bandwidth. The first condition is -+needed for the reasons explained in the description of the slice_idle -+tunable. The second condition is needed because all modern storage -+devices reorder internally-queued requests, which may trivially break -+the service guarantees enforced by the I/O scheduler. -+ -+Setting strict_guarantees may evidently affect throughput. -+ -+back_seek_max -+------------- -+ -+This specifies, given in Kbytes, the maximum "distance" for backward seeking. -+The distance is the amount of space from the current head location to the -+sectors that are backward in terms of distance. 
-+ -+This parameter allows the scheduler to anticipate requests in the "backward" -+direction and consider them as being the "next" if they are within this -+distance from the current head location. -+ -+back_seek_penalty -+----------------- -+ -+This parameter is used to compute the cost of backward seeking. If the -+backward distance of a request is just 1/back_seek_penalty from a "front" -+request, then the seeking cost of the two requests is considered equivalent. -+ -+So the scheduler will not bias toward one or the other request (otherwise the scheduler -+will bias toward the front request). Default value of back_seek_penalty is 2. -+ -+fifo_expire_async -+----------------- -+ -+This parameter is used to set the timeout of asynchronous requests. Default -+value of this is 248ms. -+ -+fifo_expire_sync -+---------------- -+ -+This parameter is used to set the timeout of synchronous requests. Default -+value of this is 124ms. To favor synchronous requests over asynchronous -+ones, this value should be decreased relative to fifo_expire_async. -+ -+low_latency -+----------- -+ -+This parameter is used to enable/disable BFQ's low latency mode. By -+default, low latency mode is enabled. If enabled, interactive and soft -+real-time applications are privileged and experience a lower latency, -+as explained in more detail in the description of how BFQ works. -+ -+DO NOT enable this mode if you need full control over bandwidth -+distribution. In fact, if it is enabled, then BFQ automatically -+increases the bandwidth share of privileged applications, as the main -+means to guarantee a lower latency to them. -+ -+timeout_sync -+------------ -+ -+Maximum amount of device time that can be given to a task (queue) once -+it has been selected for service. On devices with costly seeks, -+increasing this time usually increases maximum throughput.
On the -+opposite end, increasing this time coarsens the granularity of the -+short-term bandwidth and latency guarantees, especially if the -+following parameter is set to zero. -+ -+max_budget -+---------- -+ -+Maximum amount of service, measured in sectors, that can be provided -+to a BFQ queue once it is set in service (of course within the limits -+of the above timeout). According to what is said in the description of -+the algorithm, larger values increase the throughput in proportion to -+the percentage of sequential I/O requests issued. The price of larger -+values is that they coarsen the granularity of short-term bandwidth -+and latency guarantees. -+ -+The default value is 0, which enables auto-tuning: BFQ sets max_budget -+to the maximum number of sectors that can be served during -+timeout_sync, according to the estimated peak rate. -+ -+weights -+------- -+ -+Read-only parameter, used to show the weights of the currently active -+BFQ queues. -+ -+ -+wr_ tunables -+------------ -+ -+BFQ exports a few parameters to control/tune the behavior of -+low-latency heuristics. -+ -+wr_coeff -+ -+Factor by which the weight of a weight-raised queue is multiplied. If -+the queue is deemed soft real-time, then the weight is further -+multiplied by an additional, constant factor. -+ -+wr_max_time -+ -+Maximum duration of a weight-raising period for an interactive task -+(ms). If set to zero (default value), then this value is computed -+automatically, as a function of the peak rate of the device. In any -+case, when the value of this parameter is read, it always reports the -+current duration, regardless of whether it has been set manually or -+computed automatically. -+ -+wr_max_softrt_rate -+ -+Maximum service rate below which a queue is deemed to be associated -+with a soft real-time application, and is then weight-raised -+accordingly (sectors/sec).
-+ -+wr_min_idle_time -+ -+Minimum idle period after which interactive weight-raising may be -+reactivated for a queue (in ms). -+ -+wr_rt_max_time -+ -+Maximum weight-raising duration for soft real-time queues (in ms). The -+start time from which this duration is considered is automatically -+moved forward if the queue is detected to be still soft real-time -+before the current soft real-time weight-raising period finishes. -+ -+wr_min_inter_arr_async -+ -+Minimum period between I/O request arrivals after which weight-raising -+may be reactivated for an already busy async queue (in ms). -+ -+ -+4. Group scheduling with BFQ -+============================ -+ -+BFQ supports both cgroups-v1 and cgroups-v2 io controllers, namely -+blkio and io. In particular, BFQ supports weight-based proportional -+share. To activate cgroups support, set BFQ_GROUP_IOSCHED. -+ -+4-1 Service guarantees provided -+------------------------------- -+ -+With BFQ, proportional share means true proportional share of the -+device bandwidth, according to group weights. For example, a group -+with weight 200 gets twice the bandwidth, and not just twice the time, -+of a group with weight 100. -+ -+BFQ supports hierarchies (group trees) of any depth. Bandwidth is -+distributed among groups and processes in the expected way: for each -+group, the children of the group share the whole bandwidth of the -+group in proportion to their weights. In particular, this implies -+that, for each leaf group, every process of the group receives the -+same share of the whole group bandwidth, unless the ioprio of the -+process is modified. -+ -+The resource-sharing guarantee for a group may partially or totally -+switch from bandwidth to time, if providing bandwidth guarantees to -+the group lowers the throughput too much. 
This switch occurs on a -+per-process basis: if a process of a leaf group causes throughput loss -+if served in such a way as to receive its share of the bandwidth, then -+BFQ switches back to just time-based proportional share for that -+process. -+ -+4-2 Interface -+------------- -+ -+To get proportional sharing of bandwidth with BFQ for a given device, -+BFQ must of course be the active scheduler for that device. -+ -+Within each group directory, the names of the files associated with -+BFQ-specific cgroup parameters and stats begin with the "bfq." -+prefix. So, with cgroups-v1 or cgroups-v2, the full prefix for -+BFQ-specific files is "blkio.bfq." or "io.bfq." For example, the group -+parameter to set the weight of a group with BFQ is blkio.bfq.weight -+or io.bfq.weight. -+ -+Parameters to set -+----------------- -+ -+For each group, there is only the following parameter to set. -+ -+weight (namely blkio.bfq.weight or io.bfq.weight): the weight of the -+group inside its parent. Available values: 1..10000 (default 100). The -+linear mapping between ioprio and weights, described at the beginning -+of the tunable section, is still valid, but all weights higher than -+IOPRIO_BE_NR*10 are mapped to ioprio 0. -+ -+Recall that, if low_latency is set, then BFQ automatically raises the -+weight of the queues associated with interactive and soft real-time -+applications. Unset this tunable if you need/want to control weights. -+ -+ -+[1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ Scheduler", Proceedings of the First Workshop on Mobile System -+ Technologies (MST-2015), May 2015. -+ http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf -+ -+[2] P. Valente and M. Andreolini, "Improving Application -+ Responsiveness with the BFQ Disk I/O Scheduler", Proceedings of -+ the 5th Annual International Systems and Storage Conference -+ (SYSTOR '12), June 2012.
-+ Slightly extended version: -+ http://algogroup.unimore.it/people/paolo/disk_sched/bfq-v1-suite- -+ results.pdf -diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched -index f78cd1a..f2cd945 100644 ---- a/block/Kconfig.iosched -+++ b/block/Kconfig.iosched -@@ -43,20 +43,20 @@ config IOSCHED_BFQ - tristate "BFQ I/O scheduler" - default n - ---help--- -- The BFQ I/O scheduler tries to distribute bandwidth among -- all processes according to their weights. -- It aims at distributing the bandwidth as desired, independently of -- the disk parameters and with any workload. It also tries to -- guarantee low latency to interactive and soft real-time -- applications. If compiled built-in (saying Y here), BFQ can -- be configured to support hierarchical scheduling. -+ The BFQ I/O scheduler distributes bandwidth among all -+ processes according to their weights, regardless of the -+ device parameters and with any workload. It also guarantees -+ a low latency to interactive and soft real-time applications. -+ Details in Documentation/block/bfq-iosched.txt - - config BFQ_GROUP_IOSCHED - bool "BFQ hierarchical scheduling support" -- depends on CGROUPS && IOSCHED_BFQ=y -+ depends on IOSCHED_BFQ && BLK_CGROUP - default n - ---help--- -- Enable hierarchical scheduling in BFQ, using the blkio controller. -+ -+ Enable hierarchical scheduling in BFQ, using the blkio -+ (cgroups-v1) or io (cgroups-v2) controller. - - choice - prompt "Default I/O scheduler" -diff --git a/block/bfq-cgroup.c b/block/bfq-cgroup.c -index 0367996..0125275 100644 ---- a/block/bfq-cgroup.c -+++ b/block/bfq-cgroup.c -@@ -7,7 +7,9 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2016 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. 
-@@ -163,8 +165,6 @@ static struct bfq_group *blkg_to_bfqg(struct blkcg_gq *blkg) - { - struct blkg_policy_data *pd = blkg_to_pd(blkg, &blkcg_policy_bfq); - -- BUG_ON(!pd); -- - return pd_to_bfqg(pd); - } - -@@ -208,59 +208,47 @@ static void bfqg_put(struct bfq_group *bfqg) - - static void bfqg_stats_update_io_add(struct bfq_group *bfqg, - struct bfq_queue *bfqq, -- int rw) -+ unsigned int op) - { -- blkg_rwstat_add(&bfqg->stats.queued, rw, 1); -+ blkg_rwstat_add(&bfqg->stats.queued, op, 1); - bfqg_stats_end_empty_time(&bfqg->stats); - if (!(bfqq == ((struct bfq_data *)bfqg->bfqd)->in_service_queue)) - bfqg_stats_set_start_group_wait_time(bfqg, bfqq_group(bfqq)); - } - --static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, int rw) --{ -- blkg_rwstat_add(&bfqg->stats.queued, rw, -1); --} -- --static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, int rw) -+static void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) - { -- blkg_rwstat_add(&bfqg->stats.merged, rw, 1); -+ blkg_rwstat_add(&bfqg->stats.queued, op, -1); - } - --static void bfqg_stats_update_dispatch(struct bfq_group *bfqg, -- uint64_t bytes, int rw) -+static void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) - { -- blkg_stat_add(&bfqg->stats.sectors, bytes >> 9); -- blkg_rwstat_add(&bfqg->stats.serviced, rw, 1); -- blkg_rwstat_add(&bfqg->stats.service_bytes, rw, bytes); -+ blkg_rwstat_add(&bfqg->stats.merged, op, 1); - } - - static void bfqg_stats_update_completion(struct bfq_group *bfqg, -- uint64_t start_time, uint64_t io_start_time, int rw) -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) - { - struct bfqg_stats *stats = &bfqg->stats; - unsigned long long now = sched_clock(); - - if (time_after64(now, io_start_time)) -- blkg_rwstat_add(&stats->service_time, rw, now - io_start_time); -+ blkg_rwstat_add(&stats->service_time, op, -+ now - io_start_time); - if (time_after64(io_start_time, start_time)) -- 
blkg_rwstat_add(&stats->wait_time, rw, -+ blkg_rwstat_add(&stats->wait_time, op, - io_start_time - start_time); - } - - /* @stats = 0 */ - static void bfqg_stats_reset(struct bfqg_stats *stats) - { -- if (!stats) -- return; -- - /* queued stats shouldn't be cleared */ -- blkg_rwstat_reset(&stats->service_bytes); -- blkg_rwstat_reset(&stats->serviced); - blkg_rwstat_reset(&stats->merged); - blkg_rwstat_reset(&stats->service_time); - blkg_rwstat_reset(&stats->wait_time); - blkg_stat_reset(&stats->time); -- blkg_stat_reset(&stats->unaccounted_time); - blkg_stat_reset(&stats->avg_queue_size_sum); - blkg_stat_reset(&stats->avg_queue_size_samples); - blkg_stat_reset(&stats->dequeue); -@@ -270,19 +258,16 @@ static void bfqg_stats_reset(struct bfqg_stats *stats) - } - - /* @to += @from */ --static void bfqg_stats_merge(struct bfqg_stats *to, struct bfqg_stats *from) -+static void bfqg_stats_add_aux(struct bfqg_stats *to, struct bfqg_stats *from) - { - if (!to || !from) - return; - - /* queued stats shouldn't be cleared */ -- blkg_rwstat_add_aux(&to->service_bytes, &from->service_bytes); -- blkg_rwstat_add_aux(&to->serviced, &from->serviced); - blkg_rwstat_add_aux(&to->merged, &from->merged); - blkg_rwstat_add_aux(&to->service_time, &from->service_time); - blkg_rwstat_add_aux(&to->wait_time, &from->wait_time); - blkg_stat_add_aux(&from->time, &from->time); -- blkg_stat_add_aux(&to->unaccounted_time, &from->unaccounted_time); - blkg_stat_add_aux(&to->avg_queue_size_sum, &from->avg_queue_size_sum); - blkg_stat_add_aux(&to->avg_queue_size_samples, - &from->avg_queue_size_samples); -@@ -311,10 +296,8 @@ static void bfqg_stats_xfer_dead(struct bfq_group *bfqg) - if (unlikely(!parent)) - return; - -- bfqg_stats_merge(&parent->dead_stats, &bfqg->stats); -- bfqg_stats_merge(&parent->dead_stats, &bfqg->dead_stats); -+ bfqg_stats_add_aux(&parent->stats, &bfqg->stats); - bfqg_stats_reset(&bfqg->stats); -- bfqg_stats_reset(&bfqg->dead_stats); - } - - static void bfq_init_entity(struct 
bfq_entity *entity, -@@ -329,21 +312,17 @@ static void bfq_init_entity(struct bfq_entity *entity, - bfqq->ioprio_class = bfqq->new_ioprio_class; - bfqg_get(bfqg); - } -- entity->parent = bfqg->my_entity; -+ entity->parent = bfqg->my_entity; /* NULL for root group */ - entity->sched_data = &bfqg->sched_data; - } - - static void bfqg_stats_exit(struct bfqg_stats *stats) - { -- blkg_rwstat_exit(&stats->service_bytes); -- blkg_rwstat_exit(&stats->serviced); - blkg_rwstat_exit(&stats->merged); - blkg_rwstat_exit(&stats->service_time); - blkg_rwstat_exit(&stats->wait_time); - blkg_rwstat_exit(&stats->queued); -- blkg_stat_exit(&stats->sectors); - blkg_stat_exit(&stats->time); -- blkg_stat_exit(&stats->unaccounted_time); - blkg_stat_exit(&stats->avg_queue_size_sum); - blkg_stat_exit(&stats->avg_queue_size_samples); - blkg_stat_exit(&stats->dequeue); -@@ -354,15 +333,11 @@ static void bfqg_stats_exit(struct bfqg_stats *stats) - - static int bfqg_stats_init(struct bfqg_stats *stats, gfp_t gfp) - { -- if (blkg_rwstat_init(&stats->service_bytes, gfp) || -- blkg_rwstat_init(&stats->serviced, gfp) || -- blkg_rwstat_init(&stats->merged, gfp) || -+ if (blkg_rwstat_init(&stats->merged, gfp) || - blkg_rwstat_init(&stats->service_time, gfp) || - blkg_rwstat_init(&stats->wait_time, gfp) || - blkg_rwstat_init(&stats->queued, gfp) || -- blkg_stat_init(&stats->sectors, gfp) || - blkg_stat_init(&stats->time, gfp) || -- blkg_stat_init(&stats->unaccounted_time, gfp) || - blkg_stat_init(&stats->avg_queue_size_sum, gfp) || - blkg_stat_init(&stats->avg_queue_size_samples, gfp) || - blkg_stat_init(&stats->dequeue, gfp) || -@@ -386,11 +361,27 @@ static struct bfq_group_data *blkcg_to_bfqgd(struct blkcg *blkcg) - return cpd_to_bfqgd(blkcg_to_cpd(blkcg, &blkcg_policy_bfq)); - } - -+static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) -+{ -+ struct bfq_group_data *bgd; -+ -+ bgd = kzalloc(sizeof(*bgd), gfp); -+ if (!bgd) -+ return NULL; -+ return &bgd->pd; -+} -+ - static void 
bfq_cpd_init(struct blkcg_policy_data *cpd) - { - struct bfq_group_data *d = cpd_to_bfqgd(cpd); - -- d->weight = BFQ_DEFAULT_GRP_WEIGHT; -+ d->weight = cgroup_subsys_on_dfl(io_cgrp_subsys) ? -+ CGROUP_WEIGHT_DFL : BFQ_WEIGHT_LEGACY_DFL; -+} -+ -+static void bfq_cpd_free(struct blkcg_policy_data *cpd) -+{ -+ kfree(cpd_to_bfqgd(cpd)); - } - - static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) -@@ -401,8 +392,7 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - if (!bfqg) - return NULL; - -- if (bfqg_stats_init(&bfqg->stats, gfp) || -- bfqg_stats_init(&bfqg->dead_stats, gfp)) { -+ if (bfqg_stats_init(&bfqg->stats, gfp)) { - kfree(bfqg); - return NULL; - } -@@ -410,27 +400,20 @@ static struct blkg_policy_data *bfq_pd_alloc(gfp_t gfp, int node) - return &bfqg->pd; - } - --static void bfq_group_set_parent(struct bfq_group *bfqg, -- struct bfq_group *parent) -+static void bfq_pd_init(struct blkg_policy_data *pd) - { -+ struct blkcg_gq *blkg; -+ struct bfq_group *bfqg; -+ struct bfq_data *bfqd; - struct bfq_entity *entity; -+ struct bfq_group_data *d; - -- BUG_ON(!parent); -- BUG_ON(!bfqg); -- BUG_ON(bfqg == parent); -- -+ blkg = pd_to_blkg(pd); -+ BUG_ON(!blkg); -+ bfqg = blkg_to_bfqg(blkg); -+ bfqd = blkg->q->elevator->elevator_data; - entity = &bfqg->entity; -- entity->parent = parent->my_entity; -- entity->sched_data = &parent->sched_data; --} -- --static void bfq_pd_init(struct blkg_policy_data *pd) --{ -- struct blkcg_gq *blkg = pd_to_blkg(pd); -- struct bfq_group *bfqg = blkg_to_bfqg(blkg); -- struct bfq_data *bfqd = blkg->q->elevator->elevator_data; -- struct bfq_entity *entity = &bfqg->entity; -- struct bfq_group_data *d = blkcg_to_bfqgd(blkg->blkcg); -+ d = blkcg_to_bfqgd(blkg->blkcg); - - entity->orig_weight = entity->weight = entity->new_weight = d->weight; - entity->my_sched_data = &bfqg->sched_data; -@@ -448,70 +431,53 @@ static void bfq_pd_free(struct blkg_policy_data *pd) - struct bfq_group *bfqg = pd_to_bfqg(pd); - - 
bfqg_stats_exit(&bfqg->stats); -- bfqg_stats_exit(&bfqg->dead_stats); -- - return kfree(bfqg); - } - --/* offset delta from bfqg->stats to bfqg->dead_stats */ --static const int dead_stats_off_delta = offsetof(struct bfq_group, dead_stats) - -- offsetof(struct bfq_group, stats); -- --/* to be used by recursive prfill, sums live and dead stats recursively */ --static u64 bfqg_stat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+static void bfq_pd_reset_stats(struct blkg_policy_data *pd) - { -- u64 sum = 0; -+ struct bfq_group *bfqg = pd_to_bfqg(pd); - -- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -- sum += blkg_stat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -- off + dead_stats_off_delta); -- return sum; -+ bfqg_stats_reset(&bfqg->stats); - } - --/* to be used by recursive prfill, sums live and dead rwstats recursively */ --static struct blkg_rwstat --bfqg_rwstat_pd_recursive_sum(struct blkg_policy_data *pd, int off) -+static void bfq_group_set_parent(struct bfq_group *bfqg, -+ struct bfq_group *parent) - { -- struct blkg_rwstat a, b; -+ struct bfq_entity *entity; - -- a = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, off); -- b = blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_bfq, -- off + dead_stats_off_delta); -- blkg_rwstat_add_aux(&a, &b); -- return a; -+ BUG_ON(!parent); -+ BUG_ON(!bfqg); -+ BUG_ON(bfqg == parent); -+ -+ entity = &bfqg->entity; -+ entity->parent = parent->my_entity; -+ entity->sched_data = &parent->sched_data; - } - --static void bfq_pd_reset_stats(struct blkg_policy_data *pd) -+static struct bfq_group *bfq_lookup_bfqg(struct bfq_data *bfqd, -+ struct blkcg *blkcg) - { -- struct bfq_group *bfqg = pd_to_bfqg(pd); -+ struct blkcg_gq *blkg; - -- bfqg_stats_reset(&bfqg->stats); -- bfqg_stats_reset(&bfqg->dead_stats); -+ blkg = blkg_lookup(blkcg, bfqd->queue); -+ if (likely(blkg)) -+ return blkg_to_bfqg(blkg); -+ return NULL; - } - --static struct bfq_group 
*bfq_find_alloc_group(struct bfq_data *bfqd, -- struct blkcg *blkcg) -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) - { -- struct request_queue *q = bfqd->queue; -- struct bfq_group *bfqg = NULL, *parent; -- struct bfq_entity *entity = NULL; -+ struct bfq_group *bfqg, *parent; -+ struct bfq_entity *entity; - - assert_spin_locked(bfqd->queue->queue_lock); - -- /* avoid lookup for the common case where there's no blkcg */ -- if (blkcg == &blkcg_root) { -- bfqg = bfqd->root_group; -- } else { -- struct blkcg_gq *blkg; -- -- blkg = blkg_lookup_create(blkcg, q); -- if (!IS_ERR(blkg)) -- bfqg = blkg_to_bfqg(blkg); -- else /* fallback to root_group */ -- bfqg = bfqd->root_group; -- } -+ bfqg = bfq_lookup_bfqg(bfqd, blkcg); - -- BUG_ON(!bfqg); -+ if (unlikely(!bfqg)) -+ return NULL; - - /* - * Update chain of bfq_groups as we might be handling a leaf group -@@ -537,11 +503,15 @@ static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, - static void bfq_pos_tree_add_move(struct bfq_data *bfqd, - struct bfq_queue *bfqq); - -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ - /** - * bfq_bfqq_move - migrate @bfqq to @bfqg. - * @bfqd: queue descriptor. - * @bfqq: the queue to move. -- * @entity: @bfqq's entity. - * @bfqg: the group to move to. - * - * Move @bfqq to @bfqg, deactivating it from its old group and reactivating -@@ -552,26 +522,40 @@ static void bfq_pos_tree_add_move(struct bfq_data *bfqd, - * rcu_read_lock()). 
- */ - static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- struct bfq_entity *entity, struct bfq_group *bfqg) -+ struct bfq_group *bfqg) - { -- int busy, resume; -- -- busy = bfq_bfqq_busy(bfqq); -- resume = !RB_EMPTY_ROOT(&bfqq->sort_list); -+ struct bfq_entity *entity = &bfqq->entity; - -- BUG_ON(resume && !entity->on_st); -- BUG_ON(busy && !resume && entity->on_st && -+ BUG_ON(!bfq_bfqq_busy(bfqq) && !RB_EMPTY_ROOT(&bfqq->sort_list)); -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list) && !entity->on_st); -+ BUG_ON(bfq_bfqq_busy(bfqq) && RB_EMPTY_ROOT(&bfqq->sort_list) -+ && entity->on_st && - bfqq != bfqd->in_service_queue); -+ BUG_ON(!bfq_bfqq_busy(bfqq) && bfqq == bfqd->in_service_queue); -+ -+ /* If bfqq is empty, then bfq_bfqq_expire also invokes -+ * bfq_del_bfqq_busy, thereby removing bfqq and its entity -+ * from data structures related to current group. Otherwise we -+ * need to remove bfqq explicitly with bfq_deactivate_bfqq, as -+ * we do below. -+ */ -+ if (bfqq == bfqd->in_service_queue) -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); - -- if (busy) { -- BUG_ON(atomic_read(&bfqq->ref) < 2); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && bfq_bfqq_busy(bfqq)); - -- if (!resume) -- bfq_del_bfqq_busy(bfqd, bfqq, 0); -- else -- bfq_deactivate_bfqq(bfqd, bfqq, 0); -- } else if (entity->on_st) -+ if (bfq_bfqq_busy(bfqq)) -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); -+ else if (entity->on_st) { -+ BUG_ON(&bfq_entity_service_tree(entity)->idle != -+ entity->tree); - bfq_put_idle_entity(bfq_entity_service_tree(entity), entity); -+ } - bfqg_put(bfqq_group(bfqq)); - - /* -@@ -583,14 +567,17 @@ static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, - entity->sched_data = &bfqg->sched_data; - bfqg_get(bfqg); - -- if (busy) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list) && 
bfq_bfqq_busy(bfqq)); -+ if (bfq_bfqq_busy(bfqq)) { - bfq_pos_tree_add_move(bfqd, bfqq); -- if (resume) -- bfq_activate_bfqq(bfqd, bfqq); -+ bfq_activate_bfqq(bfqd, bfqq); - } - - if (!bfqd->in_service_queue && !bfqd->rq_in_driver) - bfq_schedule_dispatch(bfqd); -+ BUG_ON(entity->on_st && !bfq_bfqq_busy(bfqq) -+ && &bfq_entity_service_tree(entity)->idle != -+ entity->tree); - } - - /** -@@ -617,7 +604,11 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - - lockdep_assert_held(bfqd->queue->queue_lock); - -- bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ bfqg = bfq_find_set_group(bfqd, blkcg); -+ -+ if (unlikely(!bfqg)) -+ bfqg = bfqd->root_group; -+ - if (async_bfqq) { - entity = &async_bfqq->entity; - -@@ -625,7 +616,8 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - bic_set_bfqq(bic, NULL, 0); - bfq_log_bfqq(bfqd, async_bfqq, - "bic_change_group: %p %d", -- async_bfqq, atomic_read(&async_bfqq->ref)); -+ async_bfqq, -+ async_bfqq->ref); - bfq_put_queue(async_bfqq); - } - } -@@ -633,7 +625,7 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - if (sync_bfqq) { - entity = &sync_bfqq->entity; - if (entity->sched_data != &bfqg->sched_data) -- bfq_bfqq_move(bfqd, sync_bfqq, entity, bfqg); -+ bfq_bfqq_move(bfqd, sync_bfqq, bfqg); - } - - return bfqg; -@@ -642,25 +634,23 @@ static struct bfq_group *__bfq_bic_change_cgroup(struct bfq_data *bfqd, - static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) - { - struct bfq_data *bfqd = bic_to_bfqd(bic); -- struct blkcg *blkcg; - struct bfq_group *bfqg = NULL; -- uint64_t id; -+ uint64_t serial_nr; - - rcu_read_lock(); -- blkcg = bio_blkcg(bio); -- id = blkcg->css.serial_nr; -- rcu_read_unlock(); -+ serial_nr = bio_blkcg(bio)->css.serial_nr; - - /* - * Check whether blkcg has changed. The condition may trigger - * spuriously on a newly created cic but there's no harm. 
- */ -- if (unlikely(!bfqd) || likely(bic->blkcg_id == id)) -- return; -+ if (unlikely(!bfqd) || likely(bic->blkcg_serial_nr == serial_nr)) -+ goto out; - -- bfqg = __bfq_bic_change_cgroup(bfqd, bic, blkcg); -- BUG_ON(!bfqg); -- bic->blkcg_id = id; -+ bfqg = __bfq_bic_change_cgroup(bfqd, bic, bio_blkcg(bio)); -+ bic->blkcg_serial_nr = serial_nr; -+out: -+ rcu_read_unlock(); - } - - /** -@@ -672,7 +662,7 @@ static void bfq_flush_idle_tree(struct bfq_service_tree *st) - struct bfq_entity *entity = st->first_idle; - - for (; entity ; entity = st->first_idle) -- __bfq_deactivate_entity(entity, 0); -+ __bfq_deactivate_entity(entity, false); - } - - /** -@@ -686,7 +676,7 @@ static void bfq_reparent_leaf_entity(struct bfq_data *bfqd, - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - BUG_ON(!bfqq); -- bfq_bfqq_move(bfqd, bfqq, entity, bfqd->root_group); -+ bfq_bfqq_move(bfqd, bfqq, bfqd->root_group); - } - - /** -@@ -717,11 +707,12 @@ static void bfq_reparent_active_entities(struct bfq_data *bfqd, - } - - /** -- * bfq_destroy_group - destroy @bfqg. -- * @bfqg: the group being destroyed. -+ * bfq_pd_offline - deactivate the entity associated with @pd, -+ * and reparent its children entities. -+ * @pd: descriptor of the policy going offline. - * -- * Destroy @bfqg, making sure that it is not referenced from its parent. -- * blkio already grabs the queue_lock for us, so no need to use RCU-based magic -+ * blkio already grabs the queue_lock for us, so no need to use -+ * RCU-based magic - */ - static void bfq_pd_offline(struct blkg_policy_data *pd) - { -@@ -776,10 +767,16 @@ static void bfq_pd_offline(struct blkg_policy_data *pd) - BUG_ON(bfqg->sched_data.next_in_service); - BUG_ON(bfqg->sched_data.in_service_entity); - -- __bfq_deactivate_entity(entity, 0); -+ __bfq_deactivate_entity(entity, false); - bfq_put_async_queues(bfqd, bfqg); - BUG_ON(entity->tree); - -+ /* -+ * @blkg is going offline and will be ignored by -+ * blkg_[rw]stat_recursive_sum(). 
Transfer stats to the parent so -+ * that they don't get lost. If IOs complete after this point, the -+ * stats for them will be lost. Oh well... -+ */ - bfqg_stats_xfer_dead(bfqg); - } - -@@ -789,46 +786,35 @@ static void bfq_end_wr_async(struct bfq_data *bfqd) - - list_for_each_entry(blkg, &bfqd->queue->blkg_list, q_node) { - struct bfq_group *bfqg = blkg_to_bfqg(blkg); -+ BUG_ON(!bfqg); - - bfq_end_wr_async_queues(bfqd, bfqg); - } - bfq_end_wr_async_queues(bfqd, bfqd->root_group); - } - --static u64 bfqio_cgroup_weight_read(struct cgroup_subsys_state *css, -- struct cftype *cftype) --{ -- struct blkcg *blkcg = css_to_blkcg(css); -- struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -- int ret = -EINVAL; -- -- spin_lock_irq(&blkcg->lock); -- ret = bfqgd->weight; -- spin_unlock_irq(&blkcg->lock); -- -- return ret; --} -- --static int bfqio_cgroup_weight_read_dfl(struct seq_file *sf, void *v) -+static int bfq_io_show_weight(struct seq_file *sf, void *v) - { - struct blkcg *blkcg = css_to_blkcg(seq_css(sf)); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); -+ unsigned int val = 0; - -- spin_lock_irq(&blkcg->lock); -- seq_printf(sf, "%u\n", bfqgd->weight); -- spin_unlock_irq(&blkcg->lock); -+ if (bfqgd) -+ val = bfqgd->weight; -+ -+ seq_printf(sf, "%u\n", val); - - return 0; - } - --static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, -- struct cftype *cftype, -- u64 val) -+static int bfq_io_set_weight_legacy(struct cgroup_subsys_state *css, -+ struct cftype *cftype, -+ u64 val) - { - struct blkcg *blkcg = css_to_blkcg(css); - struct bfq_group_data *bfqgd = blkcg_to_bfqgd(blkcg); - struct blkcg_gq *blkg; -- int ret = -EINVAL; -+ int ret = -ERANGE; - - if (val < BFQ_MIN_WEIGHT || val > BFQ_MAX_WEIGHT) - return ret; -@@ -873,13 +859,18 @@ static int bfqio_cgroup_weight_write(struct cgroup_subsys_state *css, - return ret; - } - --static ssize_t bfqio_cgroup_weight_write_dfl(struct kernfs_open_file *of, -- char *buf, size_t nbytes, -- loff_t 
off) -+static ssize_t bfq_io_set_weight(struct kernfs_open_file *of, -+ char *buf, size_t nbytes, -+ loff_t off) - { -+ u64 weight; - /* First unsigned long found in the file is used */ -- return bfqio_cgroup_weight_write(of_css(of), NULL, -- simple_strtoull(strim(buf), NULL, 0)); -+ int ret = kstrtoull(strim(buf), 0, &weight); -+ -+ if (ret) -+ return ret; -+ -+ return bfq_io_set_weight_legacy(of_css(of), NULL, weight); - } - - static int bfqg_print_stat(struct seq_file *sf, void *v) -@@ -899,16 +890,17 @@ static int bfqg_print_rwstat(struct seq_file *sf, void *v) - static u64 bfqg_prfill_stat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) - { -- u64 sum = bfqg_stat_pd_recursive_sum(pd, off); -- -+ u64 sum = blkg_stat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, off); - return __blkg_prfill_u64(sf, pd, sum); - } - - static u64 bfqg_prfill_rwstat_recursive(struct seq_file *sf, - struct blkg_policy_data *pd, int off) - { -- struct blkg_rwstat sum = bfqg_rwstat_pd_recursive_sum(pd, off); -- -+ struct blkg_rwstat sum = blkg_rwstat_recursive_sum(pd_to_blkg(pd), -+ &blkcg_policy_bfq, -+ off); - return __blkg_prfill_rwstat(sf, pd, &sum); - } - -@@ -928,6 +920,41 @@ static int bfqg_print_rwstat_recursive(struct seq_file *sf, void *v) - return 0; - } - -+static u64 bfqg_prfill_sectors(struct seq_file *sf, struct blkg_policy_data *pd, -+ int off) -+{ -+ u64 sum = blkg_rwstat_total(&pd->blkg->stat_bytes); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors, &blkcg_policy_bfq, 0, false); -+ return 0; -+} -+ -+static u64 bfqg_prfill_sectors_recursive(struct seq_file *sf, -+ struct blkg_policy_data *pd, int off) -+{ -+ struct blkg_rwstat tmp = blkg_rwstat_recursive_sum(pd->blkg, NULL, -+ offsetof(struct blkcg_gq, stat_bytes)); -+ u64 sum = atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_READ]) + -+ 
atomic64_read(&tmp.aux_cnt[BLKG_RWSTAT_WRITE]); -+ -+ return __blkg_prfill_u64(sf, pd, sum >> 9); -+} -+ -+static int bfqg_print_stat_sectors_recursive(struct seq_file *sf, void *v) -+{ -+ blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), -+ bfqg_prfill_sectors_recursive, &blkcg_policy_bfq, 0, -+ false); -+ return 0; -+} -+ -+ - static u64 bfqg_prfill_avg_queue_size(struct seq_file *sf, - struct blkg_policy_data *pd, int off) - { -@@ -964,38 +991,15 @@ bfq_create_group_hierarchy(struct bfq_data *bfqd, int node) - return blkg_to_bfqg(bfqd->queue->root_blkg); - } - --static struct blkcg_policy_data *bfq_cpd_alloc(gfp_t gfp) --{ -- struct bfq_group_data *bgd; -- -- bgd = kzalloc(sizeof(*bgd), GFP_KERNEL); -- if (!bgd) -- return NULL; -- return &bgd->pd; --} -- --static void bfq_cpd_free(struct blkcg_policy_data *cpd) --{ -- kfree(cpd_to_bfqgd(cpd)); --} -- --static struct cftype bfqio_files_dfl[] = { -+static struct cftype bfq_blkcg_legacy_files[] = { - { -- .name = "weight", -+ .name = "bfq.weight", - .flags = CFTYPE_NOT_ON_ROOT, -- .seq_show = bfqio_cgroup_weight_read_dfl, -- .write = bfqio_cgroup_weight_write_dfl, -+ .seq_show = bfq_io_show_weight, -+ .write_u64 = bfq_io_set_weight_legacy, - }, -- {} /* terminate */ --}; - --static struct cftype bfqio_files[] = { -- { -- .name = "bfq.weight", -- .read_u64 = bfqio_cgroup_weight_read, -- .write_u64 = bfqio_cgroup_weight_write, -- }, -- /* statistics, cover only the tasks in the bfqg */ -+ /* statistics, covers only the tasks in the bfqg */ - { - .name = "bfq.time", - .private = offsetof(struct bfq_group, stats.time), -@@ -1003,18 +1007,17 @@ static struct cftype bfqio_files[] = { - }, - { - .name = "bfq.sectors", -- .private = offsetof(struct bfq_group, stats.sectors), -- .seq_show = bfqg_print_stat, -+ .seq_show = bfqg_print_stat_sectors, - }, - { - .name = "bfq.io_service_bytes", -- .private = offsetof(struct bfq_group, stats.service_bytes), -- .seq_show = bfqg_print_rwstat, -+ .private = (unsigned 
long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes, - }, - { - .name = "bfq.io_serviced", -- .private = offsetof(struct bfq_group, stats.serviced), -- .seq_show = bfqg_print_rwstat, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios, - }, - { - .name = "bfq.io_service_time", -@@ -1045,18 +1048,17 @@ static struct cftype bfqio_files[] = { - }, - { - .name = "bfq.sectors_recursive", -- .private = offsetof(struct bfq_group, stats.sectors), -- .seq_show = bfqg_print_stat_recursive, -+ .seq_show = bfqg_print_stat_sectors_recursive, - }, - { - .name = "bfq.io_service_bytes_recursive", -- .private = offsetof(struct bfq_group, stats.service_bytes), -- .seq_show = bfqg_print_rwstat_recursive, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_bytes_recursive, - }, - { - .name = "bfq.io_serviced_recursive", -- .private = offsetof(struct bfq_group, stats.serviced), -- .seq_show = bfqg_print_rwstat_recursive, -+ .private = (unsigned long)&blkcg_policy_bfq, -+ .seq_show = blkg_print_stat_ios_recursive, - }, - { - .name = "bfq.io_service_time_recursive", -@@ -1102,31 +1104,42 @@ static struct cftype bfqio_files[] = { - .private = offsetof(struct bfq_group, stats.dequeue), - .seq_show = bfqg_print_stat, - }, -- { -- .name = "bfq.unaccounted_time", -- .private = offsetof(struct bfq_group, stats.unaccounted_time), -- .seq_show = bfqg_print_stat, -- }, - { } /* terminate */ - }; - --static struct blkcg_policy blkcg_policy_bfq = { -- .dfl_cftypes = bfqio_files_dfl, -- .legacy_cftypes = bfqio_files, -- -- .pd_alloc_fn = bfq_pd_alloc, -- .pd_init_fn = bfq_pd_init, -- .pd_offline_fn = bfq_pd_offline, -- .pd_free_fn = bfq_pd_free, -- .pd_reset_stats_fn = bfq_pd_reset_stats, -- -- .cpd_alloc_fn = bfq_cpd_alloc, -- .cpd_init_fn = bfq_cpd_init, -- .cpd_bind_fn = bfq_cpd_init, -- .cpd_free_fn = bfq_cpd_free, -+static struct cftype bfq_blkg_files[] = { -+ { -+ .name = "bfq.weight", -+ .flags = CFTYPE_NOT_ON_ROOT, -+ 
.seq_show = bfq_io_show_weight, -+ .write = bfq_io_set_weight, -+ }, -+ {} /* terminate */ - }; - --#else -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+static inline void bfqg_stats_update_io_add(struct bfq_group *bfqg, -+ struct bfq_queue *bfqq, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op) { } -+static inline void -+bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op) { } -+static inline void bfqg_stats_update_completion(struct bfq_group *bfqg, -+ uint64_t start_time, uint64_t io_start_time, -+ unsigned int op) { } -+static inline void -+bfqg_stats_set_start_group_wait_time(struct bfq_group *bfqg, -+ struct bfq_group *curr_bfqg) { } -+static inline void bfqg_stats_end_empty_time(struct bfqg_stats *stats) { } -+static inline void bfqg_stats_update_dequeue(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg) { } -+static inline void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg) { } -+ -+static void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct bfq_group *bfqg) {} - - static void bfq_init_entity(struct bfq_entity *entity, - struct bfq_group *bfqg) -@@ -1142,35 +1155,22 @@ static void bfq_init_entity(struct bfq_entity *entity, - entity->sched_data = &bfqg->sched_data; - } - --static struct bfq_group * --bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) --{ -- struct bfq_data *bfqd = bic_to_bfqd(bic); -- -- return bfqd->root_group; --} -- --static void bfq_bfqq_move(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct bfq_entity *entity, -- struct bfq_group *bfqg) --{ --} -+static void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio) {} - - static void bfq_end_wr_async(struct bfq_data *bfqd) - { - 
bfq_end_wr_async_queues(bfqd, bfqd->root_group); - } - --static void bfq_disconnect_groups(struct bfq_data *bfqd) -+static struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, -+ struct blkcg *blkcg) - { -- bfq_put_async_queues(bfqd, bfqd->root_group); -+ return bfqd->root_group; - } - --static struct bfq_group *bfq_find_alloc_group(struct bfq_data *bfqd, -- struct blkcg *blkcg) -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq) - { -- return bfqd->root_group; -+ return bfqq->bfqd->root_group; - } - - static struct bfq_group * -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c -index cf3e9b1..e5dfa5a 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -1,5 +1,5 @@ - /* -- * Budget Fair Queueing (BFQ) disk scheduler. -+ * Budget Fair Queueing (BFQ) I/O scheduler. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe -@@ -7,25 +7,34 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente - * - * Licensed under the GPL-2 as detailed in the accompanying COPYING.BFQ - * file. - * -- * BFQ is a proportional-share storage-I/O scheduling algorithm based on -- * the slice-by-slice service scheme of CFQ. But BFQ assigns budgets, -- * measured in number of sectors, to processes instead of time slices. The -- * device is not granted to the in-service process for a given time slice, -- * but until it has exhausted its assigned budget. This change from the time -- * to the service domain allows BFQ to distribute the device throughput -- * among processes as desired, without any distortion due to ZBR, workload -- * fluctuations or other factors. BFQ uses an ad hoc internal scheduler, -- * called B-WF2Q+, to schedule processes according to their budgets. More -- * precisely, BFQ schedules queues associated to processes. 
Thanks to the -- * accurate policy of B-WF2Q+, BFQ can afford to assign high budgets to -- * I/O-bound processes issuing sequential requests (to boost the -- * throughput), and yet guarantee a low latency to interactive and soft -- * real-time applications. -+ * BFQ is a proportional-share I/O scheduler, with some extra -+ * low-latency capabilities. BFQ also supports full hierarchical -+ * scheduling through cgroups. Next paragraphs provide an introduction -+ * on BFQ inner workings. Details on BFQ benefits and usage can be -+ * found in Documentation/block/bfq-iosched.txt. -+ * -+ * BFQ is a proportional-share storage-I/O scheduling algorithm based -+ * on the slice-by-slice service scheme of CFQ. But BFQ assigns -+ * budgets, measured in number of sectors, to processes instead of -+ * time slices. The device is not granted to the in-service process -+ * for a given time slice, but until it has exhausted its assigned -+ * budget. This change from the time to the service domain enables BFQ -+ * to distribute the device throughput among processes as desired, -+ * without any distortion due to throughput fluctuations, or to device -+ * internal queueing. BFQ uses an ad hoc internal scheduler, called -+ * B-WF2Q+, to schedule processes according to their budgets. More -+ * precisely, BFQ schedules queues associated with processes. Thanks to -+ * the accurate policy of B-WF2Q+, BFQ can afford to assign high -+ * budgets to I/O-bound processes issuing sequential requests (to -+ * boost the throughput), and yet guarantee a low latency to -+ * interactive and soft real-time applications. - * - * BFQ is described in [1], where also a reference to the initial, more - * theoretical paper on BFQ can be found. The interested reader can find -@@ -40,10 +49,10 @@ - * H-WF2Q+, while the augmented tree used to implement B-WF2Q+ with O(log N) - * complexity derives from the one introduced with EEVDF in [3]. - * -- * [1] P. Valente and M. 
Andreolini, ``Improving Application Responsiveness -- * with the BFQ Disk I/O Scheduler'', -- * Proceedings of the 5th Annual International Systems and Storage -- * Conference (SYSTOR '12), June 2012. -+ * [1] P. Valente, A. Avanzini, "Evolution of the BFQ Storage I/O -+ * Scheduler", Proceedings of the First Workshop on Mobile System -+ * Technologies (MST-2015), May 2015. -+ * http://algogroup.unimore.it/people/paolo/disk_sched/mst-2015.pdf - * - * http://algogroup.unimo.it/people/paolo/disk_sched/bf1-v1-suite-results.pdf - * -@@ -70,24 +79,23 @@ - #include "bfq.h" - #include "blk.h" - --/* Expiration time of sync (0) and async (1) requests, in jiffies. */ --static const int bfq_fifo_expire[2] = { HZ / 4, HZ / 8 }; -+/* Expiration time of sync (0) and async (1) requests, in ns. */ -+static const u64 bfq_fifo_expire[2] = { NSEC_PER_SEC / 4, NSEC_PER_SEC / 8 }; - - /* Maximum backwards seek, in KiB. */ --static const int bfq_back_max = 16 * 1024; -+static const int bfq_back_max = (16 * 1024); - - /* Penalty of a backwards seek, in number of sectors. */ - static const int bfq_back_penalty = 2; - --/* Idling period duration, in jiffies. */ --static int bfq_slice_idle = HZ / 125; -+/* Idling period duration, in ns. */ -+static u32 bfq_slice_idle = (NSEC_PER_SEC / 125); - - /* Minimum number of assigned budgets for which stats are safe to compute. */ - static const int bfq_stats_min_budgets = 194; - - /* Default maximum budget values, in sectors and number of requests. */ --static const int bfq_default_max_budget = 16 * 1024; --static const int bfq_max_budget_async_rq = 4; -+static const int bfq_default_max_budget = (16 * 1024); - - /* - * Async to sync throughput distribution is controlled as follows: -@@ -97,23 +105,28 @@ static const int bfq_max_budget_async_rq = 4; - static const int bfq_async_charge_factor = 10; - - /* Default timeout values, in jiffies, approximating CFQ defaults. 
*/ --static const int bfq_timeout_sync = HZ / 8; --static int bfq_timeout_async = HZ / 25; -+static const int bfq_timeout = (HZ / 8); - --struct kmem_cache *bfq_pool; -+static struct kmem_cache *bfq_pool; - --/* Below this threshold (in ms), we consider thinktime immediate. */ --#define BFQ_MIN_TT 2 -+/* Below this threshold (in ns), we consider thinktime immediate. */ -+#define BFQ_MIN_TT (2 * NSEC_PER_MSEC) - - /* hw_tag detection: parallel requests threshold and min samples needed. */ - #define BFQ_HW_QUEUE_THRESHOLD 4 - #define BFQ_HW_QUEUE_SAMPLES 32 - --#define BFQQ_SEEK_THR (sector_t)(8 * 1024) --#define BFQQ_SEEKY(bfqq) ((bfqq)->seek_mean > BFQQ_SEEK_THR) -+#define BFQQ_SEEK_THR (sector_t)(8 * 100) -+#define BFQQ_SECT_THR_NONROT (sector_t)(2 * 32) -+#define BFQQ_CLOSE_THR (sector_t)(8 * 1024) -+#define BFQQ_SEEKY(bfqq) (hweight32(bfqq->seek_history) > 32/8) - --/* Min samples used for peak rate estimation (for autotuning). */ --#define BFQ_PEAK_RATE_SAMPLES 32 -+/* Min number of samples required to perform peak-rate update */ -+#define BFQ_RATE_MIN_SAMPLES 32 -+/* Min observation time interval required to perform a peak-rate update (ns) */ -+#define BFQ_RATE_MIN_INTERVAL (300*NSEC_PER_MSEC) -+/* Target observation time interval for a peak-rate update (ns) */ -+#define BFQ_RATE_REF_INTERVAL NSEC_PER_SEC - - /* Shift used for peak rate fixed precision calculations. */ - #define BFQ_RATE_SHIFT 16 -@@ -141,16 +154,24 @@ struct kmem_cache *bfq_pool; - * The device's speed class is dynamically (re)detected in - * bfq_update_peak_rate() every time the estimated peak rate is updated. - * -- * In the following definitions, R_slow[0]/R_fast[0] and T_slow[0]/T_fast[0] -- * are the reference values for a slow/fast rotational device, whereas -- * R_slow[1]/R_fast[1] and T_slow[1]/T_fast[1] are the reference values for -- * a slow/fast non-rotational device. Finally, device_speed_thresh are the -- * thresholds used to switch between speed classes. 
-+ * In the following definitions, R_slow[0]/R_fast[0] and -+ * T_slow[0]/T_fast[0] are the reference values for a slow/fast -+ * rotational device, whereas R_slow[1]/R_fast[1] and -+ * T_slow[1]/T_fast[1] are the reference values for a slow/fast -+ * non-rotational device. Finally, device_speed_thresh are the -+ * thresholds used to switch between speed classes. The reference -+ * rates are not the actual peak rates of the devices used as a -+ * reference, but slightly lower values. The reason for using these -+ * slightly lower values is that the peak-rate estimator tends to -+ * yield slightly lower values than the actual peak rate (it can yield -+ * the actual peak rate only if there is only one process doing I/O, -+ * and the process does sequential I/O). -+ * - * Both the reference peak rates and the thresholds are measured in - * sectors/usec, left-shifted by BFQ_RATE_SHIFT. - */ --static int R_slow[2] = {1536, 10752}; --static int R_fast[2] = {17415, 34791}; -+static int R_slow[2] = {1000, 10700}; -+static int R_fast[2] = {14000, 33000}; - /* - * To improve readability, a conversion function is used to initialize the - * following arrays, which entails that they can be initialized only in a -@@ -178,18 +199,6 @@ static void bfq_schedule_dispatch(struct bfq_data *bfqd); - #define bfq_sample_valid(samples) ((samples) > 80) - - /* -- * We regard a request as SYNC, if either it's a read or has the SYNC bit -- * set (in which case it could also be a direct WRITE). -- */ --static int bfq_bio_sync(struct bio *bio) --{ -- if (bio_data_dir(bio) == READ || (bio->bi_rw & REQ_SYNC)) -- return 1; -- -- return 0; --} -- --/* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. 
- */ -@@ -409,11 +418,7 @@ static bool bfq_differentiated_weights(struct bfq_data *bfqd) - */ - static bool bfq_symmetric_scenario(struct bfq_data *bfqd) - { -- return --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- !bfqd->active_numerous_groups && --#endif -- !bfq_differentiated_weights(bfqd); -+ return !bfq_differentiated_weights(bfqd); - } - - /* -@@ -505,13 +510,45 @@ static void bfq_weights_tree_remove(struct bfq_data *bfqd, - entity->weight_counter = NULL; - } - -+/* -+ * Return expired entry, or NULL to just start from scratch in rbtree. -+ */ -+static struct request *bfq_check_fifo(struct bfq_queue *bfqq, -+ struct request *last) -+{ -+ struct request *rq; -+ -+ if (bfq_bfqq_fifo_expire(bfqq)) -+ return NULL; -+ -+ bfq_mark_bfqq_fifo_expire(bfqq); -+ -+ rq = rq_entry_fifo(bfqq->fifo.next); -+ -+ if (rq == last || ktime_get_ns() < rq->fifo_time) -+ return NULL; -+ -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "check_fifo: returned %p", rq); -+ BUG_ON(RB_EMPTY_NODE(&rq->rb_node)); -+ return rq; -+} -+ - static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - struct bfq_queue *bfqq, - struct request *last) - { - struct rb_node *rbnext = rb_next(&last->rb_node); - struct rb_node *rbprev = rb_prev(&last->rb_node); -- struct request *next = NULL, *prev = NULL; -+ struct request *next, *prev = NULL; -+ -+ BUG_ON(list_empty(&bfqq->fifo)); -+ -+ /* Follow expired path, else get first next available. 
*/ -+ next = bfq_check_fifo(bfqq, last); -+ if (next) { -+ BUG_ON(next == last); -+ return next; -+ } - - BUG_ON(RB_EMPTY_NODE(&last->rb_node)); - -@@ -533,9 +570,19 @@ static struct request *bfq_find_next_rq(struct bfq_data *bfqd, - static unsigned long bfq_serv_to_charge(struct request *rq, - struct bfq_queue *bfqq) - { -- return blk_rq_sectors(rq) * -- (1 + ((!bfq_bfqq_sync(bfqq)) * (bfqq->wr_coeff == 1) * -- bfq_async_charge_factor)); -+ if (bfq_bfqq_sync(bfqq) || bfqq->wr_coeff > 1) -+ return blk_rq_sectors(rq); -+ -+ /* -+ * If there are no weight-raised queues, then amplify service -+ * by just the async charge factor; otherwise amplify service -+ * by twice the async charge factor, to further reduce latency -+ * for weight-raised queues. -+ */ -+ if (bfqq->bfqd->wr_busy_queues == 0) -+ return blk_rq_sectors(rq) * bfq_async_charge_factor; -+ -+ return blk_rq_sectors(rq) * 2 * bfq_async_charge_factor; - } - - /** -@@ -576,7 +623,7 @@ static void bfq_updated_next_req(struct bfq_data *bfqd, - entity->budget = new_budget; - bfq_log_bfqq(bfqd, bfqq, "updated next rq: new budget %lu", - new_budget); -- bfq_activate_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq); - } - } - -@@ -590,12 +637,23 @@ static unsigned int bfq_wr_duration(struct bfq_data *bfqd) - dur = bfqd->RT_prod; - do_div(dur, bfqd->peak_rate); - -- return dur; --} -+ /* -+ * Limit duration between 3 and 13 seconds. Tests show that -+ * higher values than 13 seconds often yield the opposite of -+ * the desired result, i.e., worsen responsiveness by letting -+ * non-interactive and non-soft-real-time applications -+ * preserve weight raising for a too long time interval. -+ * -+ * On the other end, lower values than 3 seconds make it -+ * difficult for most interactive tasks to complete their jobs -+ * before weight-raising finishes. 
-+ */ -+ if (dur > msecs_to_jiffies(13000)) -+ dur = msecs_to_jiffies(13000); -+ else if (dur < msecs_to_jiffies(3000)) -+ dur = msecs_to_jiffies(3000); - --static unsigned int bfq_bfqq_cooperations(struct bfq_queue *bfqq) --{ -- return bfqq->bic ? bfqq->bic->cooperations : 0; -+ return dur; - } - - static void -@@ -605,31 +663,31 @@ bfq_bfqq_resume_state(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - bfq_mark_bfqq_idle_window(bfqq); - else - bfq_clear_bfqq_idle_window(bfqq); -+ - if (bic->saved_IO_bound) - bfq_mark_bfqq_IO_bound(bfqq); - else - bfq_clear_bfqq_IO_bound(bfqq); -- /* Assuming that the flag in_large_burst is already correctly set */ -- if (bic->wr_time_left && bfqq->bfqd->low_latency && -- !bfq_bfqq_in_large_burst(bfqq) && -- bic->cooperations < bfqq->bfqd->bfq_coop_thresh) { -- /* -- * Start a weight raising period with the duration given by -- * the raising_time_left snapshot. -- */ -- if (bfq_bfqq_busy(bfqq)) -- bfqq->bfqd->wr_busy_queues++; -- bfqq->wr_coeff = bfqq->bfqd->bfq_wr_coeff; -- bfqq->wr_cur_max_time = bic->wr_time_left; -- bfqq->last_wr_start_finish = jiffies; -- bfqq->entity.prio_changed = 1; -+ -+ bfqq->wr_coeff = bic->saved_wr_coeff; -+ bfqq->wr_start_at_switch_to_srt = bic->saved_wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies(bfqq->wr_start_at_switch_to_srt)); -+ bfqq->last_wr_start_finish = bic->saved_last_wr_start_finish; -+ bfqq->wr_cur_max_time = bic->saved_wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ if (bfqq->wr_coeff > 1 && (bfq_bfqq_in_large_burst(bfqq) || -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time))) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "resume state: switching off wr (%lu + %lu < %lu)", -+ bfqq->last_wr_start_finish, bfqq->wr_cur_max_time, -+ jiffies); -+ -+ bfqq->wr_coeff = 1; - } -- /* -- * Clear wr_time_left to prevent bfq_bfqq_save_state() from -- * getting confused about the queue's need of a weight-raising -- * period. 
-- */ -- bic->wr_time_left = 0; -+ /* make sure weight will be updated, however we got here */ -+ bfqq->entity.prio_changed = 1; - } - - static int bfqq_process_refs(struct bfq_queue *bfqq) -@@ -639,7 +697,7 @@ static int bfqq_process_refs(struct bfq_queue *bfqq) - lockdep_assert_held(bfqq->bfqd->queue->queue_lock); - - io_refs = bfqq->allocated[READ] + bfqq->allocated[WRITE]; -- process_refs = atomic_read(&bfqq->ref) - io_refs - bfqq->entity.on_st; -+ process_refs = bfqq->ref - io_refs - bfqq->entity.on_st; - BUG_ON(process_refs < 0); - return process_refs; - } -@@ -654,6 +712,7 @@ static void bfq_reset_burst_list(struct bfq_data *bfqd, struct bfq_queue *bfqq) - hlist_del_init(&item->burst_list_node); - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); - bfqd->burst_size = 1; -+ bfqd->burst_parent_entity = bfqq->entity.parent; - } - - /* Add bfqq to the list of queues in current burst (see bfq_handle_burst) */ -@@ -662,6 +721,10 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - /* Increment burst size to take into account also bfqq */ - bfqd->burst_size++; - -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst %d", bfqd->burst_size); -+ -+ BUG_ON(bfqd->burst_size > bfqd->bfq_large_burst_thresh); -+ - if (bfqd->burst_size == bfqd->bfq_large_burst_thresh) { - struct bfq_queue *pos, *bfqq_item; - struct hlist_node *n; -@@ -671,15 +734,19 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * other to consider this burst as large. - */ - bfqd->large_burst = true; -+ bfq_log_bfqq(bfqd, bfqq, "add_to_burst: large burst started"); - - /* - * We can now mark all queues in the burst list as - * belonging to a large burst. 
- */ - hlist_for_each_entry(bfqq_item, &bfqd->burst_list, -- burst_list_node) -+ burst_list_node) { - bfq_mark_bfqq_in_large_burst(bfqq_item); -+ bfq_log_bfqq(bfqd, bfqq_item, "marked in large burst"); -+ } - bfq_mark_bfqq_in_large_burst(bfqq); -+ bfq_log_bfqq(bfqd, bfqq, "marked in large burst"); - - /* - * From now on, and until the current burst finishes, any -@@ -691,67 +758,79 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - hlist_for_each_entry_safe(pos, n, &bfqd->burst_list, - burst_list_node) - hlist_del_init(&pos->burst_list_node); -- } else /* burst not yet large: add bfqq to the burst list */ -+ } else /* -+ * Burst not yet large: add bfqq to the burst list. Do -+ * not increment the ref counter for bfqq, because bfqq -+ * is removed from the burst list before freeing bfqq -+ * in put_queue. -+ */ - hlist_add_head(&bfqq->burst_list_node, &bfqd->burst_list); - } - - /* -- * If many queues happen to become active shortly after each other, then, -- * to help the processes associated to these queues get their job done as -- * soon as possible, it is usually better to not grant either weight-raising -- * or device idling to these queues. In this comment we describe, firstly, -- * the reasons why this fact holds, and, secondly, the next function, which -- * implements the main steps needed to properly mark these queues so that -- * they can then be treated in a different way. -+ * If many queues belonging to the same group happen to be created -+ * shortly after each other, then the processes associated with these -+ * queues have typically a common goal. In particular, bursts of queue -+ * creations are usually caused by services or applications that spawn -+ * many parallel threads/processes. Examples are systemd during boot, -+ * or git grep. To help these processes get their job done as soon as -+ * possible, it is usually better to not grant either weight-raising -+ * or device idling to their queues. 
- * -- * As for the terminology, we say that a queue becomes active, i.e., -- * switches from idle to backlogged, either when it is created (as a -- * consequence of the arrival of an I/O request), or, if already existing, -- * when a new request for the queue arrives while the queue is idle. -- * Bursts of activations, i.e., activations of different queues occurring -- * shortly after each other, are typically caused by services or applications -- * that spawn or reactivate many parallel threads/processes. Examples are -- * systemd during boot or git grep. -+ * In this comment we describe, firstly, the reasons why this fact -+ * holds, and, secondly, the next function, which implements the main -+ * steps needed to properly mark these queues so that they can then be -+ * treated in a different way. - * -- * These services or applications benefit mostly from a high throughput: -- * the quicker the requests of the activated queues are cumulatively served, -- * the sooner the target job of these queues gets completed. As a consequence, -- * weight-raising any of these queues, which also implies idling the device -- * for it, is almost always counterproductive: in most cases it just lowers -- * throughput. -+ * The above services or applications benefit mostly from a high -+ * throughput: the quicker the requests of the activated queues are -+ * cumulatively served, the sooner the target job of these queues gets -+ * completed. As a consequence, weight-raising any of these queues, -+ * which also implies idling the device for it, is almost always -+ * counterproductive. In most cases it just lowers throughput. - * -- * On the other hand, a burst of activations may be also caused by the start -- * of an application that does not consist in a lot of parallel I/O-bound -- * threads. In fact, with a complex application, the burst may be just a -- * consequence of the fact that several processes need to be executed to -- * start-up the application. 
To start an application as quickly as possible, -- * the best thing to do is to privilege the I/O related to the application -- * with respect to all other I/O. Therefore, the best strategy to start as -- * quickly as possible an application that causes a burst of activations is -- * to weight-raise all the queues activated during the burst. This is the -+ * On the other hand, a burst of queue creations may be caused also by -+ * the start of an application that does not consist of a lot of -+ * parallel I/O-bound threads. In fact, with a complex application, -+ * several short processes may need to be executed to start-up the -+ * application. In this respect, to start an application as quickly as -+ * possible, the best thing to do is in any case to privilege the I/O -+ * related to the application with respect to all other -+ * I/O. Therefore, the best strategy to start as quickly as possible -+ * an application that causes a burst of queue creations is to -+ * weight-raise all the queues created during the burst. This is the - * exact opposite of the best strategy for the other type of bursts. - * -- * In the end, to take the best action for each of the two cases, the two -- * types of bursts need to be distinguished. Fortunately, this seems -- * relatively easy to do, by looking at the sizes of the bursts. In -- * particular, we found a threshold such that bursts with a larger size -- * than that threshold are apparently caused only by services or commands -- * such as systemd or git grep. For brevity, hereafter we call just 'large' -- * these bursts. BFQ *does not* weight-raise queues whose activations occur -- * in a large burst. In addition, for each of these queues BFQ performs or -- * does not perform idling depending on which choice boosts the throughput -- * most. The exact choice depends on the device and request pattern at -+ * In the end, to take the best action for each of the two cases, the -+ * two types of bursts need to be distinguished. 
Fortunately, this -+ * seems relatively easy, by looking at the sizes of the bursts. In -+ * particular, we found a threshold such that only bursts with a -+ * larger size than that threshold are apparently caused by -+ * services or commands such as systemd or git grep. For brevity, -+ * hereafter we call just 'large' these bursts. BFQ *does not* -+ * weight-raise queues whose creation occurs in a large burst. In -+ * addition, for each of these queues BFQ performs or does not perform -+ * idling depending on which choice boosts the throughput more. The -+ * exact choice depends on the device and request pattern at - * hand. - * -- * Turning back to the next function, it implements all the steps needed -- * to detect the occurrence of a large burst and to properly mark all the -- * queues belonging to it (so that they can then be treated in a different -- * way). This goal is achieved by maintaining a special "burst list" that -- * holds, temporarily, the queues that belong to the burst in progress. The -- * list is then used to mark these queues as belonging to a large burst if -- * the burst does become large. The main steps are the following. -+ * Unfortunately, false positives may occur while an interactive task -+ * is starting (e.g., an application is being started). The -+ * consequence is that the queues associated with the task do not -+ * enjoy weight raising as expected. Fortunately these false positives -+ * are very rare. They typically occur if some service happens to -+ * start doing I/O exactly when the interactive task starts. - * -- * . when the very first queue is activated, the queue is inserted into the -+ * Turning back to the next function, it implements all the steps -+ * needed to detect the occurrence of a large burst and to properly -+ * mark all the queues belonging to it (so that they can then be -+ * treated in a different way). 
This goal is achieved by maintaining a -+ * "burst list" that holds, temporarily, the queues that belong to the -+ * burst in progress. The list is then used to mark these queues as -+ * belonging to a large burst if the burst does become large. The main -+ * steps are the following. -+ * -+ * . when the very first queue is created, the queue is inserted into the - * list (as it could be the first queue in a possible burst) - * - * . if the current burst has not yet become large, and a queue Q that does -@@ -772,13 +851,13 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * - * . the device enters a large-burst mode - * -- * . if a queue Q that does not belong to the burst is activated while -+ * . if a queue Q that does not belong to the burst is created while - * the device is in large-burst mode and shortly after the last time - * at which a queue either entered the burst list or was marked as - * belonging to the current large burst, then Q is immediately marked - * as belonging to a large burst. - * -- * . if a queue Q that does not belong to the burst is activated a while -+ * . if a queue Q that does not belong to the burst is created a while - * later, i.e., not shortly after, than the last time at which a queue - * either entered the burst list or was marked as belonging to the - * current large burst, then the current burst is deemed as finished and: -@@ -791,52 +870,44 @@ static void bfq_add_to_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - * in a possible new burst (then the burst list contains just Q - * after this step). 
- */ --static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- bool idle_for_long_time) -+static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - /* -- * If bfqq happened to be activated in a burst, but has been idle -- * for at least as long as an interactive queue, then we assume -- * that, in the overall I/O initiated in the burst, the I/O -- * associated to bfqq is finished. So bfqq does not need to be -- * treated as a queue belonging to a burst anymore. Accordingly, -- * we reset bfqq's in_large_burst flag if set, and remove bfqq -- * from the burst list if it's there. We do not decrement instead -- * burst_size, because the fact that bfqq does not need to belong -- * to the burst list any more does not invalidate the fact that -- * bfqq may have been activated during the current burst. -- */ -- if (idle_for_long_time) { -- hlist_del_init(&bfqq->burst_list_node); -- bfq_clear_bfqq_in_large_burst(bfqq); -- } -- -- /* - * If bfqq is already in the burst list or is part of a large -- * burst, then there is nothing else to do. -+ * burst, or finally has just been split, then there is -+ * nothing else to do. - */ - if (!hlist_unhashed(&bfqq->burst_list_node) || -- bfq_bfqq_in_large_burst(bfqq)) -+ bfq_bfqq_in_large_burst(bfqq) || -+ time_is_after_eq_jiffies(bfqq->split_time + -+ msecs_to_jiffies(10))) - return; - - /* -- * If bfqq's activation happens late enough, then the current -- * burst is finished, and related data structures must be reset. -+ * If bfqq's creation happens late enough, or bfqq belongs to -+ * a different group than the burst group, then the current -+ * burst is finished, and related data structures must be -+ * reset. - * -- * In this respect, consider the special case where bfqq is the very -- * first queue being activated. In this case, last_ins_in_burst is -- * not yet significant when we get here. 
But it is easy to verify -- * that, whether or not the following condition is true, bfqq will -- * end up being inserted into the burst list. In particular the -- * list will happen to contain only bfqq. And this is exactly what -- * has to happen, as bfqq may be the first queue in a possible -+ * In this respect, consider the special case where bfqq is -+ * the very first queue created after BFQ is selected for this -+ * device. In this case, last_ins_in_burst and -+ * burst_parent_entity are not yet significant when we get -+ * here. But it is easy to verify that, whether or not the -+ * following condition is true, bfqq will end up being -+ * inserted into the burst list. In particular the list will -+ * happen to contain only bfqq. And this is exactly what has -+ * to happen, as bfqq may be the first queue of the first - * burst. - */ - if (time_is_before_jiffies(bfqd->last_ins_in_burst + -- bfqd->bfq_burst_interval)) { -+ bfqd->bfq_burst_interval) || -+ bfqq->entity.parent != bfqd->burst_parent_entity) { - bfqd->large_burst = false; - bfq_reset_burst_list(bfqd, bfqq); -- return; -+ bfq_log_bfqq(bfqd, bfqq, -+ "handle_burst: late activation or different group"); -+ goto end; - } - - /* -@@ -845,8 +916,9 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * bfqq as belonging to this large burst immediately. - */ - if (bfqd->large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, "handle_burst: marked in burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- return; -+ goto end; - } - - /* -@@ -855,25 +927,490 @@ static void bfq_handle_burst(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * queue. Then we add bfqq to the burst. - */ - bfq_add_to_burst(bfqd, bfqq); -+end: -+ /* -+ * At this point, bfqq either has been added to the current -+ * burst or has caused the current burst to terminate and a -+ * possible new burst to start. In particular, in the second -+ * case, bfqq has become the first queue in the possible new -+ * burst. 
In both cases last_ins_in_burst needs to be moved -+ * forward. -+ */ -+ bfqd->last_ins_in_burst = jiffies; -+ -+} -+ -+static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ return entity->budget - entity->service; -+} -+ -+/* -+ * If enough samples have been computed, return the current max budget -+ * stored in bfqd, which is dynamically updated according to the -+ * estimated disk peak rate; otherwise return the default max budget -+ */ -+static int bfq_max_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget; -+ else -+ return bfqd->bfq_max_budget; -+} -+ -+/* -+ * Return min budget, which is a fraction of the current or default -+ * max budget (trying with 1/32) -+ */ -+static int bfq_min_budget(struct bfq_data *bfqd) -+{ -+ if (bfqd->budgets_assigned < bfq_stats_min_budgets) -+ return bfq_default_max_budget / 32; -+ else -+ return bfqd->bfq_max_budget / 32; -+} -+ -+static void bfq_bfqq_expire(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool compensate, -+ enum bfqq_expiration reason); -+ -+/* -+ * The next function, invoked after the input queue bfqq switches from -+ * idle to busy, updates the budget of bfqq. The function also tells -+ * whether the in-service queue should be expired, by returning -+ * true. The purpose of expiring the in-service queue is to give bfqq -+ * the chance to possibly preempt the in-service queue, and the reason -+ * for preempting the in-service queue is to achieve one of the two -+ * goals below. -+ * -+ * 1. Guarantee to bfqq its reserved bandwidth even if bfqq has -+ * expired because it has remained idle. 
In particular, bfqq may have -+ * expired for one of the following two reasons: -+ * -+ * - BFQ_BFQQ_NO_MORE_REQUEST bfqq did not enjoy any device idling and -+ * did not make it to issue a new request before its last request -+ * was served; -+ * -+ * - BFQ_BFQQ_TOO_IDLE bfqq did enjoy device idling, but did not issue -+ * a new request before the expiration of the idling-time. -+ * -+ * Even if bfqq has expired for one of the above reasons, the process -+ * associated with the queue may be however issuing requests greedily, -+ * and thus be sensitive to the bandwidth it receives (bfqq may have -+ * remained idle for other reasons: CPU high load, bfqq not enjoying -+ * idling, I/O throttling somewhere in the path from the process to -+ * the I/O scheduler, ...). But if, after every expiration for one of -+ * the above two reasons, bfqq has to wait for the service of at least -+ * one full budget of another queue before being served again, then -+ * bfqq is likely to get a much lower bandwidth or resource time than -+ * its reserved ones. To address this issue, two countermeasures need -+ * to be taken. -+ * -+ * First, the budget and the timestamps of bfqq need to be updated in -+ * a special way on bfqq reactivation: they need to be updated as if -+ * bfqq did not remain idle and did not expire. In fact, if they are -+ * computed as if bfqq expired and remained idle until reactivation, -+ * then the process associated with bfqq is treated as if, instead of -+ * being greedy, it stopped issuing requests when bfqq remained idle, -+ * and restarts issuing requests only on this reactivation. In other -+ * words, the scheduler does not help the process recover the "service -+ * hole" between bfqq expiration and reactivation. As a consequence, -+ * the process receives a lower bandwidth than its reserved one. 
In -+ * contrast, to recover this hole, the budget must be updated as if -+ * bfqq was not expired at all before this reactivation, i.e., it must -+ * be set to the value of the remaining budget when bfqq was -+ * expired. Along the same line, timestamps need to be assigned the -+ * value they had the last time bfqq was selected for service, i.e., -+ * before last expiration. Thus timestamps need to be back-shifted -+ * with respect to their normal computation (see [1] for more details -+ * on this tricky aspect). -+ * -+ * Secondly, to allow the process to recover the hole, the in-service -+ * queue must be expired too, to give bfqq the chance to preempt it -+ * immediately. In fact, if bfqq has to wait for a full budget of the -+ * in-service queue to be completed, then it may become impossible to -+ * let the process recover the hole, even if the back-shifted -+ * timestamps of bfqq are lower than those of the in-service queue. If -+ * this happens for most or all of the holes, then the process may not -+ * receive its reserved bandwidth. In this respect, it is worth noting -+ * that, being the service of outstanding requests unpreemptible, a -+ * little fraction of the holes may however be unrecoverable, thereby -+ * causing a little loss of bandwidth. -+ * -+ * The last important point is detecting whether bfqq does need this -+ * bandwidth recovery. In this respect, the next function deems the -+ * process associated with bfqq greedy, and thus allows it to recover -+ * the hole, if: 1) the process is waiting for the arrival of a new -+ * request (which implies that bfqq expired for one of the above two -+ * reasons), and 2) such a request has arrived soon. The first -+ * condition is controlled through the flag non_blocking_wait_rq, -+ * while the second through the flag arrived_in_time. If both -+ * conditions hold, then the function computes the budget in the -+ * above-described special way, and signals that the in-service queue -+ * should be expired. 
Timestamp back-shifting is done later in -+ * __bfq_activate_entity. -+ * -+ * 2. Reduce latency. Even if timestamps are not backshifted to let -+ * the process associated with bfqq recover a service hole, bfqq may -+ * however happen to have, after being (re)activated, a lower finish -+ * timestamp than the in-service queue. That is, the next budget of -+ * bfqq may have to be completed before the one of the in-service -+ * queue. If this is the case, then preempting the in-service queue -+ * allows this goal to be achieved, apart from the unpreemptible, -+ * outstanding requests mentioned above. -+ * -+ * Unfortunately, regardless of which of the above two goals one wants -+ * to achieve, service trees need first to be updated to know whether -+ * the in-service queue must be preempted. To have service trees -+ * correctly updated, the in-service queue must be expired and -+ * rescheduled, and bfqq must be scheduled too. This is one of the -+ * most costly operations (in future versions, the scheduling -+ * mechanism may be re-designed in such a way to make it possible to -+ * know whether preemption is needed without needing to update service -+ * trees). In addition, queue preemptions almost always cause random -+ * I/O, and thus loss of throughput. Because of these facts, the next -+ * function adopts the following simple scheme to avoid both costly -+ * operations and too frequent preemptions: it requests the expiration -+ * of the in-service queue (unconditionally) only for queues that need -+ * to recover a hole, or that either are weight-raised or deserve to -+ * be weight-raised. 
-+ */ -+static bool bfq_bfqq_update_budg_for_activation(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ bool arrived_in_time, -+ bool wr_or_deserves_wr) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ if (bfq_bfqq_non_blocking_wait_rq(bfqq) && arrived_in_time) { -+ /* -+ * We do not clear the flag non_blocking_wait_rq here, as -+ * the latter is used in bfq_activate_bfqq to signal -+ * that timestamps need to be back-shifted (and is -+ * cleared right after). -+ */ -+ -+ /* -+ * In next assignment we rely on that either -+ * entity->service or entity->budget are not updated -+ * on expiration if bfqq is empty (see -+ * __bfq_bfqq_recalc_budget). Thus both quantities -+ * remain unchanged after such an expiration, and the -+ * following statement therefore assigns to -+ * entity->budget the remaining budget on such an -+ * expiration. For clarity, entity->service is not -+ * updated on expiration in any case, and, in normal -+ * operation, is reset only when bfqq is selected for -+ * service (see bfq_get_next_queue). 
-+ */ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = min_t(unsigned long, -+ bfq_bfqq_budget_left(bfqq), -+ bfqq->max_budget); -+ -+ BUG_ON(entity->budget < 0); -+ return true; -+ } -+ -+ BUG_ON(bfqq->max_budget < 0); -+ entity->budget = max_t(unsigned long, bfqq->max_budget, -+ bfq_serv_to_charge(bfqq->next_rq, bfqq)); -+ BUG_ON(entity->budget < 0); -+ -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+ return wr_or_deserves_wr; -+} -+ -+static void bfq_update_bfqq_wr_on_rq_arrival(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ unsigned int old_wr_coeff, -+ bool wr_or_deserves_wr, -+ bool interactive, -+ bool in_burst, -+ bool soft_rt) -+{ -+ if (old_wr_coeff == 1 && wr_or_deserves_wr) { -+ /* start a weight-raising period */ -+ if (interactive) { -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else { -+ bfqq->wr_start_at_switch_to_srt = jiffies; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ } -+ /* -+ * If needed, further reduce budget to make sure it is -+ * close to bfqq's backlog, so as to reduce the -+ * scheduling-error component due to a too large -+ * budget. Do not care about throughput consequences, -+ * but only about latency. Finally, do not assign a -+ * too small budget either, to avoid increasing -+ * latency by causing too frequent expirations. 
-+ */ -+ bfqq->entity.budget = min_t(unsigned long, -+ bfqq->entity.budget, -+ 2 * bfq_min_budget(bfqd)); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais starting at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } else if (old_wr_coeff > 1) { -+ if (interactive) { /* update wr coeff and duration */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ } else if (in_burst) { -+ bfqq->wr_coeff = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "wrais ending at %lu, rais_max_time %u", -+ jiffies, -+ jiffies_to_msecs(bfqq-> -+ wr_cur_max_time)); -+ } else if (soft_rt) { -+ /* -+ * The application is now or still meeting the -+ * requirements for being deemed soft rt. We -+ * can then correctly and safely (re)charge -+ * the weight-raising duration for the -+ * application with the weight-raising -+ * duration for soft rt applications. -+ * -+ * In particular, doing this recharge now, i.e., -+ * before the weight-raising period for the -+ * application finishes, reduces the probability -+ * of the following negative scenario: -+ * 1) the weight of a soft rt application is -+ * raised at startup (as for any newly -+ * created application), -+ * 2) since the application is not interactive, -+ * at a certain time weight-raising is -+ * stopped for the application, -+ * 3) at that time the application happens to -+ * still have pending requests, and hence -+ * is destined to not have a chance to be -+ * deemed soft rt before these requests are -+ * completed (see the comments to the -+ * function bfq_bfqq_softrt_next_start() -+ * for details on soft rt detection), -+ * 4) these pending requests experience a high -+ * latency because the application is not -+ * weight-raised while they are pending. 
-+ */ -+ if (bfqq->wr_cur_max_time != -+ bfqd->bfq_wr_rt_max_time) { -+ bfqq->wr_start_at_switch_to_srt = -+ bfqq->last_wr_start_finish; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ -+ bfqq->wr_cur_max_time = -+ bfqd->bfq_wr_rt_max_time; -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff * -+ BFQ_SOFTRT_WEIGHT_FACTOR; -+ bfq_log_bfqq(bfqd, bfqq, -+ "switching to soft_rt wr"); -+ } else -+ bfq_log_bfqq(bfqd, bfqq, -+ "moving forward soft_rt wr duration"); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+} -+ -+static bool bfq_bfqq_idle_for_long_time(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ return bfqq->dispatched == 0 && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ bfqd->bfq_wr_min_idle_time); -+} -+ -+static void bfq_bfqq_handle_idle_busy_switch(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq, -+ int old_wr_coeff, -+ struct request *rq, -+ bool *interactive) -+{ -+ bool soft_rt, in_burst, wr_or_deserves_wr, -+ bfqq_wants_to_preempt, -+ idle_for_long_time = bfq_bfqq_idle_for_long_time(bfqd, bfqq), -+ /* -+ * See the comments on -+ * bfq_bfqq_update_budg_for_activation for -+ * details on the usage of the next variable. 
-+ */ -+ arrived_in_time = ktime_get_ns() <= -+ RQ_BIC(rq)->ttime.last_end_request + -+ bfqd->bfq_slice_idle * 3; -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request non-busy: " -+ "jiffies %lu, in_time %d, idle_long %d busyw %d " -+ "wr_coeff %u", -+ jiffies, arrived_in_time, -+ idle_for_long_time, -+ bfq_bfqq_non_blocking_wait_rq(bfqq), -+ old_wr_coeff); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, rq->cmd_flags); -+ -+ /* -+ * bfqq deserves to be weight-raised if: -+ * - it is sync, -+ * - it does not belong to a large burst, -+ * - it has been idle for enough time or is soft real-time, -+ * - is linked to a bfq_io_cq (it is not shared in any sense) -+ */ -+ in_burst = bfq_bfqq_in_large_burst(bfqq); -+ soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -+ !in_burst && -+ time_is_before_jiffies(bfqq->soft_rt_next_start); -+ *interactive = -+ !in_burst && -+ idle_for_long_time; -+ wr_or_deserves_wr = bfqd->low_latency && -+ (bfqq->wr_coeff > 1 || -+ (bfq_bfqq_sync(bfqq) && -+ bfqq->bic && (*interactive || soft_rt))); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "bfq_add_request: " -+ "in_burst %d, " -+ "soft_rt %d (next %lu), inter %d, bic %p", -+ bfq_bfqq_in_large_burst(bfqq), soft_rt, -+ bfqq->soft_rt_next_start, -+ *interactive, -+ bfqq->bic); -+ -+ /* -+ * Using the last flag, update budget and check whether bfqq -+ * may want to preempt the in-service queue. -+ */ -+ bfqq_wants_to_preempt = -+ bfq_bfqq_update_budg_for_activation(bfqd, bfqq, -+ arrived_in_time, -+ wr_or_deserves_wr); -+ -+ /* -+ * If bfqq happened to be activated in a burst, but has been -+ * idle for much more than an interactive queue, then we -+ * assume that, in the overall I/O initiated in the burst, the -+ * I/O associated with bfqq is finished. So bfqq does not need -+ * to be treated as a queue belonging to a burst -+ * anymore. 
Accordingly, we reset bfqq's in_large_burst flag -+ * if set, and remove bfqq from the burst list if it's -+ * there. We do not decrement burst_size, because the fact -+ * that bfqq does not need to belong to the burst list any -+ * more does not invalidate the fact that bfqq was created in -+ * a burst. -+ */ -+ if (likely(!bfq_bfqq_just_created(bfqq)) && -+ idle_for_long_time && -+ time_is_before_jiffies( -+ bfqq->budget_timeout + -+ msecs_to_jiffies(10000))) { -+ hlist_del_init(&bfqq->burst_list_node); -+ bfq_clear_bfqq_in_large_burst(bfqq); -+ } -+ -+ bfq_clear_bfqq_just_created(bfqq); -+ -+ if (!bfq_bfqq_IO_bound(bfqq)) { -+ if (arrived_in_time) { -+ bfqq->requests_within_timer++; -+ if (bfqq->requests_within_timer >= -+ bfqd->bfq_requests_within_timer) -+ bfq_mark_bfqq_IO_bound(bfqq); -+ } else -+ bfqq->requests_within_timer = 0; -+ bfq_log_bfqq(bfqd, bfqq, "requests in time %d", -+ bfqq->requests_within_timer); -+ } -+ -+ if (bfqd->low_latency) { -+ if (unlikely(time_is_after_jiffies(bfqq->split_time))) -+ /* wraparound */ -+ bfqq->split_time = -+ jiffies - bfqd->bfq_wr_min_idle_time - 1; -+ -+ if (time_is_before_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) { -+ bfq_update_bfqq_wr_on_rq_arrival(bfqd, bfqq, -+ old_wr_coeff, -+ wr_or_deserves_wr, -+ *interactive, -+ in_burst, -+ soft_rt); -+ -+ if (old_wr_coeff != bfqq->wr_coeff) -+ bfqq->entity.prio_changed = 1; -+ } -+ } -+ -+ bfqq->last_idle_bklogged = jiffies; -+ bfqq->service_from_backlogged = 0; -+ bfq_clear_bfqq_softrt_update(bfqq); -+ -+ bfq_add_bfqq_busy(bfqd, bfqq); -+ -+ /* -+ * Expire in-service queue only if preemption may be needed -+ * for guarantees. In this respect, the function -+ * next_queue_may_preempt just checks a simple, necessary -+ * condition, and not a sufficient condition based on -+ * timestamps. 
In fact, for the latter condition to be -+ * evaluated, timestamps would need first to be updated, and -+ * this operation is quite costly (see the comments on the -+ * function bfq_bfqq_update_budg_for_activation). -+ */ -+ if (bfqd->in_service_queue && bfqq_wants_to_preempt && -+ bfqd->in_service_queue->wr_coeff < bfqq->wr_coeff && -+ next_queue_may_preempt(bfqd)) { -+ struct bfq_queue *in_serv = -+ bfqd->in_service_queue; -+ BUG_ON(in_serv == bfqq); -+ -+ bfq_bfqq_expire(bfqd, bfqd->in_service_queue, -+ false, BFQ_BFQQ_PREEMPTED); -+ BUG_ON(in_serv->entity.budget < 0); -+ } - } - - static void bfq_add_request(struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); -- struct bfq_entity *entity = &bfqq->entity; - struct bfq_data *bfqd = bfqq->bfqd; - struct request *next_rq, *prev; -- unsigned long old_wr_coeff = bfqq->wr_coeff; -+ unsigned int old_wr_coeff = bfqq->wr_coeff; - bool interactive = false; - -- bfq_log_bfqq(bfqd, bfqq, "add_request %d", rq_is_sync(rq)); -+ bfq_log_bfqq(bfqd, bfqq, "add_request: size %u %s", -+ blk_rq_sectors(rq), rq_is_sync(rq) ? "S" : "A"); -+ -+ if (bfqq->wr_coeff > 1) /* queue is being weight-raised */ -+ bfq_log_bfqq(bfqd, bfqq, -+ "raising period dur %u/%u msec, old coeff %u, w %d(%d)", -+ jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqq->wr_coeff, -+ bfqq->entity.weight, bfqq->entity.orig_weight); -+ - bfqq->queued[rq_is_sync(rq)]++; - bfqd->queued++; - - elv_rb_add(&bfqq->sort_list, rq); - - /* -- * Check if this request is a better next-serve candidate. -+ * Check if this request is a better next-to-serve candidate. 
- */ - prev = bfqq->next_rq; - next_rq = bfq_choose_req(bfqd, bfqq->next_rq, rq, bfqd->last_position); -@@ -886,160 +1423,10 @@ static void bfq_add_request(struct request *rq) - if (prev != bfqq->next_rq) - bfq_pos_tree_add_move(bfqd, bfqq); - -- if (!bfq_bfqq_busy(bfqq)) { -- bool soft_rt, coop_or_in_burst, -- idle_for_long_time = time_is_before_jiffies( -- bfqq->budget_timeout + -- bfqd->bfq_wr_min_idle_time); -- --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- bfqg_stats_update_io_add(bfqq_group(RQ_BFQQ(rq)), bfqq, -- rq->cmd_flags); --#endif -- if (bfq_bfqq_sync(bfqq)) { -- bool already_in_burst = -- !hlist_unhashed(&bfqq->burst_list_node) || -- bfq_bfqq_in_large_burst(bfqq); -- bfq_handle_burst(bfqd, bfqq, idle_for_long_time); -- /* -- * If bfqq was not already in the current burst, -- * then, at this point, bfqq either has been -- * added to the current burst or has caused the -- * current burst to terminate. In particular, in -- * the second case, bfqq has become the first -- * queue in a possible new burst. -- * In both cases last_ins_in_burst needs to be -- * moved forward. 
-- */ -- if (!already_in_burst) -- bfqd->last_ins_in_burst = jiffies; -- } -- -- coop_or_in_burst = bfq_bfqq_in_large_burst(bfqq) || -- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh; -- soft_rt = bfqd->bfq_wr_max_softrt_rate > 0 && -- !coop_or_in_burst && -- time_is_before_jiffies(bfqq->soft_rt_next_start); -- interactive = !coop_or_in_burst && idle_for_long_time; -- entity->budget = max_t(unsigned long, bfqq->max_budget, -- bfq_serv_to_charge(next_rq, bfqq)); -- -- if (!bfq_bfqq_IO_bound(bfqq)) { -- if (time_before(jiffies, -- RQ_BIC(rq)->ttime.last_end_request + -- bfqd->bfq_slice_idle)) { -- bfqq->requests_within_timer++; -- if (bfqq->requests_within_timer >= -- bfqd->bfq_requests_within_timer) -- bfq_mark_bfqq_IO_bound(bfqq); -- } else -- bfqq->requests_within_timer = 0; -- } -- -- if (!bfqd->low_latency) -- goto add_bfqq_busy; -- -- if (bfq_bfqq_just_split(bfqq)) -- goto set_prio_changed; -- -- /* -- * If the queue: -- * - is not being boosted, -- * - has been idle for enough time, -- * - is not a sync queue or is linked to a bfq_io_cq (it is -- * shared "for its nature" or it is not shared and its -- * requests have not been redirected to a shared queue) -- * start a weight-raising period. 
-- */ -- if (old_wr_coeff == 1 && (interactive || soft_rt) && -- (!bfq_bfqq_sync(bfqq) || bfqq->bic)) { -- bfqq->wr_coeff = bfqd->bfq_wr_coeff; -- if (interactive) -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- else -- bfqq->wr_cur_max_time = -- bfqd->bfq_wr_rt_max_time; -- bfq_log_bfqq(bfqd, bfqq, -- "wrais starting at %lu, rais_max_time %u", -- jiffies, -- jiffies_to_msecs(bfqq->wr_cur_max_time)); -- } else if (old_wr_coeff > 1) { -- if (interactive) -- bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -- else if (coop_or_in_burst || -- (bfqq->wr_cur_max_time == -- bfqd->bfq_wr_rt_max_time && -- !soft_rt)) { -- bfqq->wr_coeff = 1; -- bfq_log_bfqq(bfqd, bfqq, -- "wrais ending at %lu, rais_max_time %u", -- jiffies, -- jiffies_to_msecs(bfqq-> -- wr_cur_max_time)); -- } else if (time_before( -- bfqq->last_wr_start_finish + -- bfqq->wr_cur_max_time, -- jiffies + -- bfqd->bfq_wr_rt_max_time) && -- soft_rt) { -- /* -- * -- * The remaining weight-raising time is lower -- * than bfqd->bfq_wr_rt_max_time, which means -- * that the application is enjoying weight -- * raising either because deemed soft-rt in -- * the near past, or because deemed interactive -- * a long ago. -- * In both cases, resetting now the current -- * remaining weight-raising time for the -- * application to the weight-raising duration -- * for soft rt applications would not cause any -- * latency increase for the application (as the -- * new duration would be higher than the -- * remaining time). -- * -- * In addition, the application is now meeting -- * the requirements for being deemed soft rt. -- * In the end we can correctly and safely -- * (re)charge the weight-raising duration for -- * the application with the weight-raising -- * duration for soft rt applications. 
-- * -- * In particular, doing this recharge now, i.e., -- * before the weight-raising period for the -- * application finishes, reduces the probability -- * of the following negative scenario: -- * 1) the weight of a soft rt application is -- * raised at startup (as for any newly -- * created application), -- * 2) since the application is not interactive, -- * at a certain time weight-raising is -- * stopped for the application, -- * 3) at that time the application happens to -- * still have pending requests, and hence -- * is destined to not have a chance to be -- * deemed soft rt before these requests are -- * completed (see the comments to the -- * function bfq_bfqq_softrt_next_start() -- * for details on soft rt detection), -- * 4) these pending requests experience a high -- * latency because the application is not -- * weight-raised while they are pending. -- */ -- bfqq->last_wr_start_finish = jiffies; -- bfqq->wr_cur_max_time = -- bfqd->bfq_wr_rt_max_time; -- } -- } --set_prio_changed: -- if (old_wr_coeff != bfqq->wr_coeff) -- entity->prio_changed = 1; --add_bfqq_busy: -- bfqq->last_idle_bklogged = jiffies; -- bfqq->service_from_backlogged = 0; -- bfq_clear_bfqq_softrt_update(bfqq); -- bfq_add_bfqq_busy(bfqd, bfqq); -- } else { -+ if (!bfq_bfqq_busy(bfqq)) /* switching to busy ... 
*/ -+ bfq_bfqq_handle_idle_busy_switch(bfqd, bfqq, old_wr_coeff, -+ rq, &interactive); -+ else { - if (bfqd->low_latency && old_wr_coeff == 1 && !rq_is_sync(rq) && - time_is_before_jiffies( - bfqq->last_wr_start_finish + -@@ -1048,16 +1435,43 @@ static void bfq_add_request(struct request *rq) - bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); - - bfqd->wr_busy_queues++; -- entity->prio_changed = 1; -+ bfqq->entity.prio_changed = 1; - bfq_log_bfqq(bfqd, bfqq, -- "non-idle wrais starting at %lu, rais_max_time %u", -- jiffies, -- jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ "non-idle wrais starting, " -+ "wr_max_time %u wr_busy %d", -+ jiffies_to_msecs(bfqq->wr_cur_max_time), -+ bfqd->wr_busy_queues); - } - if (prev != bfqq->next_rq) - bfq_updated_next_req(bfqd, bfqq); - } - -+ /* -+ * Assign jiffies to last_wr_start_finish in the following -+ * cases: -+ * -+ * . if bfqq is not going to be weight-raised, because, for -+ * non weight-raised queues, last_wr_start_finish stores the -+ * arrival time of the last request; as of now, this piece -+ * of information is used only for deciding whether to -+ * weight-raise async queues -+ * -+ * . if bfqq is not weight-raised, because, if bfqq is now -+ * switching to weight-raised, then last_wr_start_finish -+ * stores the time when weight-raising starts -+ * -+ * . if bfqq is interactive, because, regardless of whether -+ * bfqq is currently weight-raised, the weight-raising -+ * period must start or restart (this case is considered -+ * separately because it is not detected by the above -+ * conditions, if bfqq is already weight-raised) -+ * -+ * last_wr_start_finish has to be updated also if bfqq is soft -+ * real-time, because the weight-raising period is constantly -+ * restarted on idle-to-busy transitions for these queues, but -+ * this is already done in bfq_bfqq_handle_idle_busy_switch if -+ * needed. 
-+ */ - if (bfqd->low_latency && - (old_wr_coeff == 1 || bfqq->wr_coeff == 1 || interactive)) - bfqq->last_wr_start_finish = jiffies; -@@ -1074,22 +1488,32 @@ static struct request *bfq_find_rq_fmerge(struct bfq_data *bfqd, - if (!bic) - return NULL; - -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ bfqq = bic_to_bfqq(bic, op_is_sync(bio->bi_opf)); - if (bfqq) - return elv_rb_find(&bfqq->sort_list, bio_end_sector(bio)); - - return NULL; - } - --static void bfq_activate_request(struct request_queue *q, struct request *rq) -+static sector_t get_sdist(sector_t last_pos, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- -- bfqd->rq_in_driver++; -- bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -- bfq_log(bfqd, "activate_request: new bfqd->last_position %llu", -- (unsigned long long) bfqd->last_position); --} -+ sector_t sdist = 0; -+ -+ if (last_pos) { -+ if (last_pos < blk_rq_pos(rq)) -+ sdist = blk_rq_pos(rq) - last_pos; -+ else -+ sdist = last_pos - blk_rq_pos(rq); -+ } -+ -+ return sdist; -+} -+ -+static void bfq_activate_request(struct request_queue *q, struct request *rq) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ bfqd->rq_in_driver++; -+} - - static void bfq_deactivate_request(struct request_queue *q, struct request *rq) - { -@@ -1105,6 +1529,9 @@ static void bfq_remove_request(struct request *rq) - struct bfq_data *bfqd = bfqq->bfqd; - const int sync = rq_is_sync(rq); - -+ BUG_ON(bfqq->entity.service > bfqq->entity.budget && -+ bfqq == bfqd->in_service_queue); -+ - if (bfqq->next_rq == rq) { - bfqq->next_rq = bfq_find_next_rq(bfqd, bfqq, rq); - bfq_updated_next_req(bfqd, bfqq); -@@ -1118,8 +1545,26 @@ static void bfq_remove_request(struct request *rq) - elv_rb_del(&bfqq->sort_list, rq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -- if (bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) -- bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ bfqq->next_rq = NULL; -+ -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ if 
(bfq_bfqq_busy(bfqq) && bfqq != bfqd->in_service_queue) { -+ bfq_del_bfqq_busy(bfqd, bfqq, false); -+ /* bfqq emptied. In normal operation, when -+ * bfqq is empty, bfqq->entity.service and -+ * bfqq->entity.budget must contain, -+ * respectively, the service received and the -+ * budget used last time bfqq emptied. These -+ * facts do not hold in this case, as at least -+ * this last removal occurred while bfqq is -+ * not in service. To avoid inconsistencies, -+ * reset both bfqq->entity.service and -+ * bfqq->entity.budget. -+ */ -+ bfqq->entity.budget = bfqq->entity.service = 0; -+ } -+ - /* - * Remove queue from request-position tree as it is empty. - */ -@@ -1133,9 +1578,7 @@ static void bfq_remove_request(struct request *rq) - BUG_ON(bfqq->meta_pending == 0); - bfqq->meta_pending--; - } --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_remove(bfqq_group(bfqq), rq->cmd_flags); --#endif - } - - static int bfq_merge(struct request_queue *q, struct request **req, -@@ -1145,7 +1588,7 @@ static int bfq_merge(struct request_queue *q, struct request **req, - struct request *__rq; - - __rq = bfq_find_rq_fmerge(bfqd, bio); -- if (__rq && elv_rq_merge_ok(__rq, bio)) { -+ if (__rq && elv_bio_merge_ok(__rq, bio)) { - *req = __rq; - return ELEVATOR_FRONT_MERGE; - } -@@ -1190,7 +1633,7 @@ static void bfq_merged_request(struct request_queue *q, struct request *req, - static void bfq_bio_merged(struct request_queue *q, struct request *req, - struct bio *bio) - { -- bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_rw); -+ bfqg_stats_update_io_merged(bfqq_group(RQ_BFQQ(req)), bio->bi_opf); - } - #endif - -@@ -1210,7 +1653,7 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - */ - if (bfqq == next_bfqq && - !list_empty(&rq->queuelist) && !list_empty(&next->queuelist) && -- time_before(next->fifo_time, rq->fifo_time)) { -+ next->fifo_time < rq->fifo_time) { - list_del_init(&rq->queuelist); - list_replace_init(&next->queuelist, 
&rq->queuelist); - rq->fifo_time = next->fifo_time; -@@ -1220,21 +1663,30 @@ static void bfq_merged_requests(struct request_queue *q, struct request *rq, - bfqq->next_rq = rq; - - bfq_remove_request(next); --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_io_merged(bfqq_group(bfqq), next->cmd_flags); --#endif - } - - /* Must be called with bfqq != NULL */ - static void bfq_bfqq_end_wr(struct bfq_queue *bfqq) - { - BUG_ON(!bfqq); -+ - if (bfq_bfqq_busy(bfqq)) - bfqq->bfqd->wr_busy_queues--; - bfqq->wr_coeff = 1; - bfqq->wr_cur_max_time = 0; -- /* Trigger a weight change on the next activation of the queue */ -+ bfqq->last_wr_start_finish = jiffies; -+ /* -+ * Trigger a weight change on the next invocation of -+ * __bfq_entity_update_weight_prio. -+ */ - bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "end_wr: wrais ending at %lu, rais_max_time %u", -+ bfqq->last_wr_start_finish, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "end_wr: wr_busy %d", -+ bfqq->bfqd->wr_busy_queues); - } - - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, -@@ -1277,7 +1729,7 @@ static int bfq_rq_close_to_sector(void *io_struct, bool request, - sector_t sector) - { - return abs(bfq_io_struct_pos(io_struct, request) - sector) <= -- BFQQ_SEEK_THR; -+ BFQQ_CLOSE_THR; - } - - static struct bfq_queue *bfqq_find_close(struct bfq_data *bfqd, -@@ -1399,7 +1851,7 @@ bfq_setup_merge(struct bfq_queue *bfqq, struct bfq_queue *new_bfqq) - * throughput. - */ - bfqq->new_bfqq = new_bfqq; -- atomic_add(process_refs, &new_bfqq->ref); -+ new_bfqq->ref += process_refs; - return new_bfqq; - } - -@@ -1430,9 +1882,23 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - } - - /* -- * Attempt to schedule a merge of bfqq with the currently in-service queue -- * or with a close queue among the scheduled queues. 
-- * Return NULL if no merge was scheduled, a pointer to the shared bfq_queue -+ * If this function returns true, then bfqq cannot be merged. The idea -+ * is that true cooperation happens very early after processes start -+ * to do I/O. Usually, late cooperations are just accidental false -+ * positives. In case bfqq is weight-raised, such false positives -+ * would evidently degrade latency guarantees for bfqq. -+ */ -+static bool wr_from_too_long(struct bfq_queue *bfqq) -+{ -+ return bfqq->wr_coeff > 1 && -+ time_is_before_jiffies(bfqq->last_wr_start_finish + -+ msecs_to_jiffies(100)); -+} -+ -+/* -+ * Attempt to schedule a merge of bfqq with the currently in-service -+ * queue or with a close queue among the scheduled queues. Return -+ * NULL if no merge was scheduled, a pointer to the shared bfq_queue - * structure otherwise. - * - * The OOM queue is not allowed to participate to cooperation: in fact, since -@@ -1441,6 +1907,18 @@ static bool bfq_may_be_close_cooperator(struct bfq_queue *bfqq, - * handle merging with the OOM queue would be quite complex and expensive - * to maintain. Besides, in such a critical condition as an out of memory, - * the benefits of queue merging may be little relevant, or even negligible. -+ * -+ * Weight-raised queues can be merged only if their weight-raising -+ * period has just started. In fact cooperating processes are usually -+ * started together. Thus, with this filter we avoid false positives -+ * that would jeopardize low-latency guarantees. -+ * -+ * WARNING: queue merging may impair fairness among non-weight raised -+ * queues, for at least two reasons: 1) the original weight of a -+ * merged queue may change during the merged state, 2) even being the -+ * weight the same, a merged queue may be bloated with many more -+ * requests than the ones produced by its originally-associated -+ * process. 
- */ - static struct bfq_queue * - bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, -@@ -1450,16 +1928,32 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - if (bfqq->new_bfqq) - return bfqq->new_bfqq; -- if (!io_struct || unlikely(bfqq == &bfqd->oom_bfqq)) -+ -+ if (io_struct && wr_from_too_long(bfqq) && -+ likely(bfqq != &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have looked for coop, but bfq%d wr", -+ bfqq->pid); -+ -+ if (!io_struct || -+ wr_from_too_long(bfqq) || -+ unlikely(bfqq == &bfqd->oom_bfqq)) - return NULL; -- /* If device has only one backlogged bfq_queue, don't search. */ -+ -+ /* If there is only one backlogged queue, don't search. */ - if (bfqd->busy_queues == 1) - return NULL; - - in_service_bfqq = bfqd->in_service_queue; - -+ if (in_service_bfqq && in_service_bfqq != bfqq && -+ bfqd->in_service_bic && wr_from_too_long(in_service_bfqq) -+ && likely(in_service_bfqq == &bfqd->oom_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have tried merge with in-service-queue, but wr"); -+ - if (!in_service_bfqq || in_service_bfqq == bfqq || -- !bfqd->in_service_bic || -+ !bfqd->in_service_bic || wr_from_too_long(in_service_bfqq) || - unlikely(in_service_bfqq == &bfqd->oom_bfqq)) - goto check_scheduled; - -@@ -1481,7 +1975,15 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - BUG_ON(new_bfqq && bfqq->entity.parent != new_bfqq->entity.parent); - -- if (new_bfqq && likely(new_bfqq != &bfqd->oom_bfqq) && -+ if (new_bfqq && wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && -+ bfq_may_be_close_cooperator(bfqq, new_bfqq)) -+ bfq_log_bfqq(bfqd, bfqq, -+ "would have merged with bfq%d, but wr", -+ new_bfqq->pid); -+ -+ if (new_bfqq && !wr_from_too_long(new_bfqq) && -+ likely(new_bfqq != &bfqd->oom_bfqq) && - bfq_may_be_close_cooperator(bfqq, new_bfqq)) - return bfq_setup_merge(bfqq, new_bfqq); - -@@ -1490,53 +1992,25 @@ bfq_setup_cooperator(struct bfq_data *bfqd, struct 
bfq_queue *bfqq, - - static void bfq_bfqq_save_state(struct bfq_queue *bfqq) - { -+ struct bfq_io_cq *bic = bfqq->bic; -+ - /* - * If !bfqq->bic, the queue is already shared or its requests - * have already been redirected to a shared queue; both idle window - * and weight raising state have already been saved. Do nothing. - */ -- if (!bfqq->bic) -+ if (!bic) - return; -- if (bfqq->bic->wr_time_left) -- /* -- * This is the queue of a just-started process, and would -- * deserve weight raising: we set wr_time_left to the full -- * weight-raising duration to trigger weight-raising when -- * and if the queue is split and the first request of the -- * queue is enqueued. -- */ -- bfqq->bic->wr_time_left = bfq_wr_duration(bfqq->bfqd); -- else if (bfqq->wr_coeff > 1) { -- unsigned long wr_duration = -- jiffies - bfqq->last_wr_start_finish; -- /* -- * It may happen that a queue's weight raising period lasts -- * longer than its wr_cur_max_time, as weight raising is -- * handled only when a request is enqueued or dispatched (it -- * does not use any timer). If the weight raising period is -- * about to end, don't save it. -- */ -- if (bfqq->wr_cur_max_time <= wr_duration) -- bfqq->bic->wr_time_left = 0; -- else -- bfqq->bic->wr_time_left = -- bfqq->wr_cur_max_time - wr_duration; -- /* -- * The bfq_queue is becoming shared or the requests of the -- * process owning the queue are being redirected to a shared -- * queue. Stop the weight raising period of the queue, as in -- * both cases it should not be owned by an interactive or -- * soft real-time application. 
-- */ -- bfq_bfqq_end_wr(bfqq); -- } else -- bfqq->bic->wr_time_left = 0; -- bfqq->bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -- bfqq->bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -- bfqq->bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -- bfqq->bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -- bfqq->bic->cooperations++; -- bfqq->bic->failed_cooperations = 0; -+ -+ bic->saved_idle_window = bfq_bfqq_idle_window(bfqq); -+ bic->saved_IO_bound = bfq_bfqq_IO_bound(bfqq); -+ bic->saved_in_large_burst = bfq_bfqq_in_large_burst(bfqq); -+ bic->was_in_burst_list = !hlist_unhashed(&bfqq->burst_list_node); -+ bic->saved_wr_coeff = bfqq->wr_coeff; -+ bic->saved_wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ bic->saved_last_wr_start_finish = bfqq->last_wr_start_finish; -+ bic->saved_wr_cur_max_time = bfqq->wr_cur_max_time; -+ BUG_ON(time_is_after_jiffies(bfqq->last_wr_start_finish)); - } - - static void bfq_get_bic_reference(struct bfq_queue *bfqq) -@@ -1561,6 +2035,40 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - if (bfq_bfqq_IO_bound(bfqq)) - bfq_mark_bfqq_IO_bound(new_bfqq); - bfq_clear_bfqq_IO_bound(bfqq); -+ -+ /* -+ * If bfqq is weight-raised, then let new_bfqq inherit -+ * weight-raising. To reduce false positives, neglect the case -+ * where bfqq has just been created, but has not yet made it -+ * to be weight-raised (which may happen because EQM may merge -+ * bfqq even before bfq_add_request is executed for the first -+ * time for bfqq). Handling this case would however be very -+ * easy, thanks to the flag just_created. 
-+ */ -+ if (new_bfqq->wr_coeff == 1 && bfqq->wr_coeff > 1) { -+ new_bfqq->wr_coeff = bfqq->wr_coeff; -+ new_bfqq->wr_cur_max_time = bfqq->wr_cur_max_time; -+ new_bfqq->last_wr_start_finish = bfqq->last_wr_start_finish; -+ new_bfqq->wr_start_at_switch_to_srt = bfqq->wr_start_at_switch_to_srt; -+ if (bfq_bfqq_busy(new_bfqq)) -+ bfqd->wr_busy_queues++; -+ new_bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, new_bfqq, -+ "wr start after merge with %d, rais_max_time %u", -+ bfqq->pid, -+ jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ } -+ -+ if (bfqq->wr_coeff > 1) { /* bfqq has given its wr to new_bfqq */ -+ bfqq->wr_coeff = 1; -+ bfqq->entity.prio_changed = 1; -+ if (bfq_bfqq_busy(bfqq)) -+ bfqd->wr_busy_queues--; -+ } -+ -+ bfq_log_bfqq(bfqd, new_bfqq, "merge_bfqqs: wr_busy %d", -+ bfqd->wr_busy_queues); -+ - /* - * Grab a reference to the bic, to prevent it from being destroyed - * before being possibly touched by a bfq_split_bfqq(). -@@ -1587,30 +2095,19 @@ bfq_merge_bfqqs(struct bfq_data *bfqd, struct bfq_io_cq *bic, - bfq_put_queue(bfqq); - } - --static void bfq_bfqq_increase_failed_cooperations(struct bfq_queue *bfqq) --{ -- struct bfq_io_cq *bic = bfqq->bic; -- struct bfq_data *bfqd = bfqq->bfqd; -- -- if (bic && bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh) { -- bic->failed_cooperations++; -- if (bic->failed_cooperations >= bfqd->bfq_failed_cooperations) -- bic->cooperations = 0; -- } --} -- --static int bfq_allow_merge(struct request_queue *q, struct request *rq, -- struct bio *bio) -+static int bfq_allow_bio_merge(struct request_queue *q, struct request *rq, -+ struct bio *bio) - { - struct bfq_data *bfqd = q->elevator->elevator_data; -+ bool is_sync = op_is_sync(bio->bi_opf); - struct bfq_io_cq *bic; - struct bfq_queue *bfqq, *new_bfqq; - - /* - * Disallow merge of a sync bio into an async request. 
- */ -- if (bfq_bio_sync(bio) && !rq_is_sync(rq)) -- return 0; -+ if (is_sync && !rq_is_sync(rq)) -+ return false; - - /* - * Lookup the bfqq that this bio will be queued with. Allow -@@ -1619,9 +2116,9 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, - */ - bic = bfq_bic_lookup(bfqd, current->io_context); - if (!bic) -- return 0; -+ return false; - -- bfqq = bic_to_bfqq(bic, bfq_bio_sync(bio)); -+ bfqq = bic_to_bfqq(bic, is_sync); - /* - * We take advantage of this function to perform an early merge - * of the queues of possible cooperating processes. -@@ -1636,30 +2133,111 @@ static int bfq_allow_merge(struct request_queue *q, struct request *rq, - * to decide whether bio and rq can be merged. - */ - bfqq = new_bfqq; -- } else -- bfq_bfqq_increase_failed_cooperations(bfqq); -+ } - } - - return bfqq == RQ_BFQQ(rq); - } - -+static int bfq_allow_rq_merge(struct request_queue *q, struct request *rq, -+ struct request *next) -+{ -+ return RQ_BFQQ(rq) == RQ_BFQQ(next); -+} -+ -+/* -+ * Set the maximum time for the in-service queue to consume its -+ * budget. This prevents seeky processes from lowering the throughput. -+ * In practice, a time-slice service scheme is used with seeky -+ * processes. 
-+ */ -+static void bfq_set_budget_timeout(struct bfq_data *bfqd, -+ struct bfq_queue *bfqq) -+{ -+ unsigned int timeout_coeff; -+ -+ if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -+ timeout_coeff = 1; -+ else -+ timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+ -+ bfqd->last_budget_start = ktime_get(); -+ -+ bfqq->budget_timeout = jiffies + -+ bfqd->bfq_timeout * timeout_coeff; -+ -+ bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -+ jiffies_to_msecs(bfqd->bfq_timeout * timeout_coeff)); -+} -+ - static void __bfq_set_in_service_queue(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - if (bfqq) { --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_avg_queue_size(bfqq_group(bfqq)); --#endif - bfq_mark_bfqq_must_alloc(bfqq); -- bfq_mark_bfqq_budget_new(bfqq); - bfq_clear_bfqq_fifo_expire(bfqq); - - bfqd->budgets_assigned = (bfqd->budgets_assigned*7 + 256) / 8; - -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ -+ if (time_is_before_jiffies(bfqq->last_wr_start_finish) && -+ bfqq->wr_coeff > 1 && -+ bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_before_jiffies(bfqq->budget_timeout)) { -+ /* -+ * For soft real-time queues, move the start -+ * of the weight-raising period forward by the -+ * time the queue has not received any -+ * service. Otherwise, a relatively long -+ * service delay is likely to cause the -+ * weight-raising period of the queue to end, -+ * because of the short duration of the -+ * weight-raising period of a soft real-time -+ * queue. It is worth noting that this move -+ * is not so dangerous for the other queues, -+ * because soft real-time queues are not -+ * greedy. -+ * -+ * To not add a further variable, we use the -+ * overloaded field budget_timeout to -+ * determine for how long the queue has not -+ * received service, i.e., how much time has -+ * elapsed since the queue expired. 
However, -+ * this is a little imprecise, because -+ * budget_timeout is set to jiffies if bfqq -+ * not only expires, but also remains with no -+ * request. -+ */ -+ if (time_after(bfqq->budget_timeout, -+ bfqq->last_wr_start_finish)) -+ bfqq->last_wr_start_finish += -+ jiffies - bfqq->budget_timeout; -+ else -+ bfqq->last_wr_start_finish = jiffies; -+ -+ if (time_is_after_jiffies(bfqq->last_wr_start_finish)) { -+ pr_crit( -+ "BFQ WARNING:last %lu budget %lu jiffies %lu", -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout, -+ jiffies); -+ pr_crit("diff %lu", jiffies - -+ max_t(unsigned long, -+ bfqq->last_wr_start_finish, -+ bfqq->budget_timeout)); -+ bfqq->last_wr_start_finish = jiffies; -+ } -+ } -+ -+ bfq_set_budget_timeout(bfqd, bfqq); - bfq_log_bfqq(bfqd, bfqq, - "set_in_service_queue, cur-budget = %d", - bfqq->entity.budget); -- } -+ } else -+ bfq_log(bfqd, "set_in_service_queue: NULL"); - - bfqd->in_service_queue = bfqq; - } -@@ -1675,36 +2253,11 @@ static struct bfq_queue *bfq_set_in_service_queue(struct bfq_data *bfqd) - return bfqq; - } - --/* -- * If enough samples have been computed, return the current max budget -- * stored in bfqd, which is dynamically updated according to the -- * estimated disk peak rate; otherwise return the default max budget -- */ --static int bfq_max_budget(struct bfq_data *bfqd) --{ -- if (bfqd->budgets_assigned < bfq_stats_min_budgets) -- return bfq_default_max_budget; -- else -- return bfqd->bfq_max_budget; --} -- --/* -- * Return min budget, which is a fraction of the current or default -- * max budget (trying with 1/32) -- */ --static int bfq_min_budget(struct bfq_data *bfqd) --{ -- if (bfqd->budgets_assigned < bfq_stats_min_budgets) -- return bfq_default_max_budget / 32; -- else -- return bfqd->bfq_max_budget / 32; --} -- - static void bfq_arm_slice_timer(struct bfq_data *bfqd) - { - struct bfq_queue *bfqq = bfqd->in_service_queue; - struct bfq_io_cq *bic; -- unsigned long sl; -+ u32 sl; - - 
BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); - -@@ -1728,119 +2281,366 @@ static void bfq_arm_slice_timer(struct bfq_data *bfqd) - sl = bfqd->bfq_slice_idle; - /* - * Unless the queue is being weight-raised or the scenario is -- * asymmetric, grant only minimum idle time if the queue either -- * has been seeky for long enough or has already proved to be -- * constantly seeky. -+ * asymmetric, grant only minimum idle time if the queue -+ * is seeky. A long idling is preserved for a weight-raised -+ * queue, or, more in general, in an asymemtric scenario, -+ * because a long idling is needed for guaranteeing to a queue -+ * its reserved share of the throughput (in particular, it is -+ * needed if the queue has a higher weight than some other -+ * queue). - */ -- if (bfq_sample_valid(bfqq->seek_samples) && -- ((BFQQ_SEEKY(bfqq) && bfqq->entity.service > -- bfq_max_budget(bfqq->bfqd) / 8) || -- bfq_bfqq_constantly_seeky(bfqq)) && bfqq->wr_coeff == 1 && -+ if (BFQQ_SEEKY(bfqq) && bfqq->wr_coeff == 1 && - bfq_symmetric_scenario(bfqd)) -- sl = min(sl, msecs_to_jiffies(BFQ_MIN_TT)); -- else if (bfqq->wr_coeff > 1) -- sl = sl * 3; -+ sl = min_t(u32, sl, BFQ_MIN_TT); -+ - bfqd->last_idling_start = ktime_get(); -- mod_timer(&bfqd->idle_slice_timer, jiffies + sl); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ hrtimer_start(&bfqd->idle_slice_timer, ns_to_ktime(sl), -+ HRTIMER_MODE_REL); - bfqg_stats_set_start_idle_time(bfqq_group(bfqq)); --#endif -- bfq_log(bfqd, "arm idle: %u/%u ms", -- jiffies_to_msecs(sl), jiffies_to_msecs(bfqd->bfq_slice_idle)); -+ bfq_log(bfqd, "arm idle: %ld/%ld ms", -+ sl / NSEC_PER_MSEC, bfqd->bfq_slice_idle / NSEC_PER_MSEC); - } - - /* -- * Set the maximum time for the in-service queue to consume its -- * budget. This prevents seeky processes from lowering the disk -- * throughput (always guaranteed with a time slice scheme as in CFQ). 
-+ * In autotuning mode, max_budget is dynamically recomputed as the -+ * amount of sectors transferred in timeout at the estimated peak -+ * rate. This enables BFQ to utilize a full timeslice with a full -+ * budget, even if the in-service queue is served at peak rate. And -+ * this maximises throughput with sequential workloads. - */ --static void bfq_set_budget_timeout(struct bfq_data *bfqd) -+static unsigned long bfq_calc_max_budget(struct bfq_data *bfqd) - { -- struct bfq_queue *bfqq = bfqd->in_service_queue; -- unsigned int timeout_coeff; -+ return (u64)bfqd->peak_rate * USEC_PER_MSEC * -+ jiffies_to_msecs(bfqd->bfq_timeout)>>BFQ_RATE_SHIFT; -+} - -- if (bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time) -- timeout_coeff = 1; -- else -- timeout_coeff = bfqq->entity.weight / bfqq->entity.orig_weight; -+/* -+ * Update parameters related to throughput and responsiveness, as a -+ * function of the estimated peak rate. See comments on -+ * bfq_calc_max_budget(), and on T_slow and T_fast arrays. 
-+ */ -+static void update_thr_responsiveness_params(struct bfq_data *bfqd) -+{ -+ int dev_type = blk_queue_nonrot(bfqd->queue); -+ -+ if (bfqd->bfq_user_max_budget == 0) { -+ bfqd->bfq_max_budget = -+ bfq_calc_max_budget(bfqd); -+ BUG_ON(bfqd->bfq_max_budget < 0); -+ bfq_log(bfqd, "new max_budget = %d", -+ bfqd->bfq_max_budget); -+ } - -- bfqd->last_budget_start = ktime_get(); -+ if (bfqd->device_speed == BFQ_BFQD_FAST && -+ bfqd->peak_rate < device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_SLOW; -+ bfqd->RT_prod = R_slow[dev_type] * -+ T_slow[dev_type]; -+ } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -+ bfqd->peak_rate > device_speed_thresh[dev_type]) { -+ bfqd->device_speed = BFQ_BFQD_FAST; -+ bfqd->RT_prod = R_fast[dev_type] * -+ T_fast[dev_type]; -+ } - -- bfq_clear_bfqq_budget_new(bfqq); -- bfqq->budget_timeout = jiffies + -- bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * timeout_coeff; -+ bfq_log(bfqd, -+"dev_type %s dev_speed_class = %s (%llu sects/sec), thresh %llu setcs/sec", -+ dev_type == 0 ? "ROT" : "NONROT", -+ bfqd->device_speed == BFQ_BFQD_FAST ? "FAST" : "SLOW", -+ bfqd->device_speed == BFQ_BFQD_FAST ? -+ (USEC_PER_SEC*(u64)R_fast[dev_type])>>BFQ_RATE_SHIFT : -+ (USEC_PER_SEC*(u64)R_slow[dev_type])>>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)device_speed_thresh[dev_type])>> -+ BFQ_RATE_SHIFT); -+} - -- bfq_log_bfqq(bfqd, bfqq, "set budget_timeout %u", -- jiffies_to_msecs(bfqd->bfq_timeout[bfq_bfqq_sync(bfqq)] * -- timeout_coeff)); -+static void bfq_reset_rate_computation(struct bfq_data *bfqd, struct request *rq) -+{ -+ if (rq != NULL) { /* new rq dispatch now, reset accordingly */ -+ bfqd->last_dispatch = bfqd->first_dispatch = ktime_get_ns() ; -+ bfqd->peak_rate_samples = 1; -+ bfqd->sequential_samples = 0; -+ bfqd->tot_sectors_dispatched = bfqd->last_rq_max_size = -+ blk_rq_sectors(rq); -+ } else /* no new rq dispatched, just reset the number of samples */ -+ bfqd->peak_rate_samples = 0; /* full re-init on next disp. 
*/ -+ -+ bfq_log(bfqd, -+ "reset_rate_computation at end, sample %u/%u tot_sects %llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched); - } - --/* -- * Move request from internal lists to the request queue dispatch list. -- */ --static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) -+static void bfq_update_rate_reset(struct bfq_data *bfqd, struct request *rq) - { -- struct bfq_data *bfqd = q->elevator->elevator_data; -- struct bfq_queue *bfqq = RQ_BFQQ(rq); -+ u32 rate, weight, divisor; - - /* -- * For consistency, the next instruction should have been executed -- * after removing the request from the queue and dispatching it. -- * We execute instead this instruction before bfq_remove_request() -- * (and hence introduce a temporary inconsistency), for efficiency. -- * In fact, in a forced_dispatch, this prevents two counters related -- * to bfqq->dispatched to risk to be uselessly decremented if bfqq -- * is not in service, and then to be incremented again after -- * incrementing bfqq->dispatched. -+ * For the convergence property to hold (see comments on -+ * bfq_update_peak_rate()) and for the assessment to be -+ * reliable, a minimum number of samples must be present, and -+ * a minimum amount of time must have elapsed. If not so, do -+ * not compute new rate. Just reset parameters, to get ready -+ * for a new evaluation attempt. 
- */ -- bfqq->dispatched++; -- bfq_remove_request(rq); -- elv_dispatch_sort(q, rq); -+ if (bfqd->peak_rate_samples < BFQ_RATE_MIN_SAMPLES || -+ bfqd->delta_from_first < BFQ_RATE_MIN_INTERVAL) { -+ bfq_log(bfqd, -+ "update_rate_reset: only resetting, delta_first %lluus samples %d", -+ bfqd->delta_from_first>>10, bfqd->peak_rate_samples); -+ goto reset_computation; -+ } - -- if (bfq_bfqq_sync(bfqq)) -- bfqd->sync_flight++; --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- bfqg_stats_update_dispatch(bfqq_group(bfqq), blk_rq_bytes(rq), -- rq->cmd_flags); --#endif -+ /* -+ * If a new request completion has occurred after last -+ * dispatch, then, to approximate the rate at which requests -+ * have been served by the device, it is more precise to -+ * extend the observation interval to the last completion. -+ */ -+ bfqd->delta_from_first = -+ max_t(u64, bfqd->delta_from_first, -+ bfqd->last_completion - bfqd->first_dispatch); -+ -+ BUG_ON(bfqd->delta_from_first == 0); -+ /* -+ * Rate computed in sects/usec, and not sects/nsec, for -+ * precision issues. -+ */ -+ rate = div64_ul(bfqd->tot_sectors_dispatched<delta_from_first, NSEC_PER_USEC)); -+ -+ bfq_log(bfqd, -+"update_rate_reset: tot_sects %llu delta_first %lluus rate %llu sects/s (%d)", -+ bfqd->tot_sectors_dispatched, bfqd->delta_from_first>>10, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ rate > 20< 20M sectors/sec) -+ */ -+ if ((bfqd->sequential_samples < (3 * bfqd->peak_rate_samples)>>2 && -+ rate <= bfqd->peak_rate) || -+ rate > 20<peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ goto reset_computation; -+ } else { -+ bfq_log(bfqd, -+ "update_rate_reset: do update, samples %u/%u rate/peak %llu/%llu", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ ((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT), -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ } -+ -+ /* -+ * We have to update the peak rate, at last! 
To this purpose, -+ * we use a low-pass filter. We compute the smoothing constant -+ * of the filter as a function of the 'weight' of the new -+ * measured rate. -+ * -+ * As can be seen in next formulas, we define this weight as a -+ * quantity proportional to how sequential the workload is, -+ * and to how long the observation time interval is. -+ * -+ * The weight runs from 0 to 8. The maximum value of the -+ * weight, 8, yields the minimum value for the smoothing -+ * constant. At this minimum value for the smoothing constant, -+ * the measured rate contributes for half of the next value of -+ * the estimated peak rate. -+ * -+ * So, the first step is to compute the weight as a function -+ * of how sequential the workload is. Note that the weight -+ * cannot reach 9, because bfqd->sequential_samples cannot -+ * become equal to bfqd->peak_rate_samples, which, in its -+ * turn, holds true because bfqd->sequential_samples is not -+ * incremented for the first sample. -+ */ -+ weight = (9 * bfqd->sequential_samples) / bfqd->peak_rate_samples; -+ -+ /* -+ * Second step: further refine the weight as a function of the -+ * duration of the observation interval. -+ */ -+ weight = min_t(u32, 8, -+ div_u64(weight * bfqd->delta_from_first, -+ BFQ_RATE_REF_INTERVAL)); -+ -+ /* -+ * Divisor ranging from 10, for minimum weight, to 2, for -+ * maximum weight. 
-+ */ -+ divisor = 10 - weight; -+ BUG_ON(divisor == 0); -+ -+ /* -+ * Finally, update peak rate: -+ * -+ * peak_rate = peak_rate * (divisor-1) / divisor + rate / divisor -+ */ -+ bfqd->peak_rate *= divisor-1; -+ bfqd->peak_rate /= divisor; -+ rate /= divisor; /* smoothing constant alpha = 1/divisor */ -+ -+ bfq_log(bfqd, -+ "update_rate_reset: divisor %d tmp_peak_rate %llu tmp_rate %u", -+ divisor, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT), -+ (u32)((USEC_PER_SEC*(u64)rate)>>BFQ_RATE_SHIFT)); -+ -+ BUG_ON(bfqd->peak_rate == 0); -+ BUG_ON(bfqd->peak_rate > 20<peak_rate += rate; -+ update_thr_responsiveness_params(bfqd); -+ BUG_ON(bfqd->peak_rate > 20<peak_rate_samples == 0) { /* first dispatch */ -+ bfq_log(bfqd, -+ "update_peak_rate: goto reset, samples %d", -+ bfqd->peak_rate_samples) ; -+ bfq_reset_rate_computation(bfqd, rq); -+ goto update_last_values; /* will add one sample */ -+ } - -- if (bfq_bfqq_fifo_expire(bfqq)) -- return NULL; -+ /* -+ * Device idle for very long: the observation interval lasting -+ * up to this dispatch cannot be a valid observation interval -+ * for computing a new peak rate (similarly to the late- -+ * completion event in bfq_completed_request()). 
Go to -+ * update_rate_and_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - start a new observation interval with this dispatch -+ */ -+ if (now_ns - bfqd->last_dispatch > 100*NSEC_PER_MSEC && -+ bfqd->rq_in_driver == 0) { -+ bfq_log(bfqd, -+"update_peak_rate: jumping to updating&resetting delta_last %lluus samples %d", -+ (now_ns - bfqd->last_dispatch)>>10, -+ bfqd->peak_rate_samples) ; -+ goto update_rate_and_reset; -+ } - -- bfq_mark_bfqq_fifo_expire(bfqq); -+ /* Update sampling information */ -+ bfqd->peak_rate_samples++; - -- if (list_empty(&bfqq->fifo)) -- return NULL; -+ if ((bfqd->rq_in_driver > 0 || -+ now_ns - bfqd->last_completion < BFQ_MIN_TT) -+ && get_sdist(bfqd->last_position, rq) < BFQQ_SEEK_THR) -+ bfqd->sequential_samples++; - -- rq = rq_entry_fifo(bfqq->fifo.next); -+ bfqd->tot_sectors_dispatched += blk_rq_sectors(rq); - -- if (time_before(jiffies, rq->fifo_time)) -- return NULL; -+ /* Reset max observed rq size every 32 dispatches */ -+ if (likely(bfqd->peak_rate_samples % 32)) -+ bfqd->last_rq_max_size = -+ max_t(u32, blk_rq_sectors(rq), bfqd->last_rq_max_size); -+ else -+ bfqd->last_rq_max_size = blk_rq_sectors(rq); - -- return rq; -+ bfqd->delta_from_first = now_ns - bfqd->first_dispatch; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: added samples %u/%u tot_sects %llu delta_first %lluus", -+ bfqd->peak_rate_samples, bfqd->sequential_samples, -+ bfqd->tot_sectors_dispatched, -+ bfqd->delta_from_first>>10); -+ -+ /* Target observation interval not yet reached, go on sampling */ -+ if (bfqd->delta_from_first < BFQ_RATE_REF_INTERVAL) -+ goto update_last_values; -+ -+update_rate_and_reset: -+ bfq_update_rate_reset(bfqd, rq); -+update_last_values: -+ bfqd->last_position = blk_rq_pos(rq) + blk_rq_sectors(rq); -+ bfqd->last_dispatch = now_ns; -+ -+ bfq_log(bfqd, -+ "update_peak_rate: 
delta_first %lluus last_pos %llu peak_rate %llu", -+ (now_ns - bfqd->first_dispatch)>>10, -+ (unsigned long long) bfqd->last_position, -+ ((USEC_PER_SEC*(u64)bfqd->peak_rate)>>BFQ_RATE_SHIFT)); -+ bfq_log(bfqd, -+ "update_peak_rate: samples at end %d", bfqd->peak_rate_samples); - } - --static int bfq_bfqq_budget_left(struct bfq_queue *bfqq) -+/* -+ * Move request from internal lists to the dispatch list of the request queue -+ */ -+static void bfq_dispatch_insert(struct request_queue *q, struct request *rq) - { -- struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_queue *bfqq = RQ_BFQQ(rq); - -- return entity->budget - entity->service; -+ /* -+ * For consistency, the next instruction should have been executed -+ * after removing the request from the queue and dispatching it. -+ * We execute instead this instruction before bfq_remove_request() -+ * (and hence introduce a temporary inconsistency), for efficiency. -+ * In fact, in a forced_dispatch, this prevents two counters related -+ * to bfqq->dispatched to risk to be uselessly decremented if bfqq -+ * is not in service, and then to be incremented again after -+ * incrementing bfqq->dispatched. -+ */ -+ bfqq->dispatched++; -+ bfq_update_peak_rate(q->elevator->elevator_data, rq); -+ -+ bfq_remove_request(rq); -+ elv_dispatch_sort(q, rq); - } - - static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - BUG_ON(bfqq != bfqd->in_service_queue); - -- __bfq_bfqd_reset_in_service(bfqd); -- - /* - * If this bfqq is shared between multiple processes, check - * to make sure that those processes are still issuing I/Os -@@ -1851,20 +2651,30 @@ static void __bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_mark_bfqq_split_coop(bfqq); - - if (RB_EMPTY_ROOT(&bfqq->sort_list)) { -- /* -- * Overloading budget_timeout field to store the time -- * at which the queue remains with no backlog; used by -- * the weight-raising mechanism. 
-- */ -- bfqq->budget_timeout = jiffies; -- bfq_del_bfqq_busy(bfqd, bfqq, 1); -+ if (bfqq->dispatched == 0) -+ /* -+ * Overloading budget_timeout field to store -+ * the time at which the queue remains with no -+ * backlog and no outstanding request; used by -+ * the weight-raising mechanism. -+ */ -+ bfqq->budget_timeout = jiffies; -+ -+ bfq_del_bfqq_busy(bfqd, bfqq, true); - } else { -- bfq_activate_bfqq(bfqd, bfqq); -+ bfq_requeue_bfqq(bfqd, bfqq); - /* - * Resort priority tree of potential close cooperators. - */ - bfq_pos_tree_add_move(bfqd, bfqq); - } -+ -+ /* -+ * All in-service entities must have been properly deactivated -+ * or requeued before executing the next function, which -+ * resets all in-service entites as no more in service. -+ */ -+ __bfq_bfqd_reset_in_service(bfqd); - } - - /** -@@ -1883,10 +2693,19 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - struct request *next_rq; - int budget, min_budget; - -- budget = bfqq->max_budget; -+ BUG_ON(bfqq != bfqd->in_service_queue); -+ - min_budget = bfq_min_budget(bfqd); - -- BUG_ON(bfqq != bfqd->in_service_queue); -+ if (bfqq->wr_coeff == 1) -+ budget = bfqq->max_budget; -+ else /* -+ * Use a constant, low budget for weight-raised queues, -+ * to help achieve a low latency. Keep it slightly higher -+ * than the minimum possible budget, to cause a little -+ * bit fewer expirations. 
-+ */ -+ budget = 2 * min_budget; - - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: last budg %d, budg left %d", - bfqq->entity.budget, bfq_bfqq_budget_left(bfqq)); -@@ -1895,7 +2714,7 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - bfq_log_bfqq(bfqd, bfqq, "recalc_budg: sync %d, seeky %d", - bfq_bfqq_sync(bfqq), BFQQ_SEEKY(bfqd->in_service_queue)); - -- if (bfq_bfqq_sync(bfqq)) { -+ if (bfq_bfqq_sync(bfqq) && bfqq->wr_coeff == 1) { - switch (reason) { - /* - * Caveat: in all the following cases we trade latency -@@ -1937,14 +2756,10 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - break; - case BFQ_BFQQ_BUDGET_TIMEOUT: - /* -- * We double the budget here because: 1) it -- * gives the chance to boost the throughput if -- * this is not a seeky process (which may have -- * bumped into this timeout because of, e.g., -- * ZBR), 2) together with charge_full_budget -- * it helps give seeky processes higher -- * timestamps, and hence be served less -- * frequently. -+ * We double the budget here because it gives -+ * the chance to boost the throughput if this -+ * is not a seeky process (and has bumped into -+ * this timeout because of, e.g., ZBR). - */ - budget = min(budget * 2, bfqd->bfq_max_budget); - break; -@@ -1961,17 +2776,49 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - budget = min(budget * 4, bfqd->bfq_max_budget); - break; - case BFQ_BFQQ_NO_MORE_REQUESTS: -- /* -- * Leave the budget unchanged. -- */ -+ /* -+ * For queues that expire for this reason, it -+ * is particularly important to keep the -+ * budget close to the actual service they -+ * need. Doing so reduces the timestamp -+ * misalignment problem described in the -+ * comments in the body of -+ * __bfq_activate_entity. In fact, suppose -+ * that a queue systematically expires for -+ * BFQ_BFQQ_NO_MORE_REQUESTS and presents a -+ * new request in time to enjoy timestamp -+ * back-shifting. 
The larger the budget of the -+ * queue is with respect to the service the -+ * queue actually requests in each service -+ * slot, the more times the queue can be -+ * reactivated with the same virtual finish -+ * time. It follows that, even if this finish -+ * time is pushed to the system virtual time -+ * to reduce the consequent timestamp -+ * misalignment, the queue unjustly enjoys for -+ * many re-activations a lower finish time -+ * than all newly activated queues. -+ * -+ * The service needed by bfqq is measured -+ * quite precisely by bfqq->entity.service. -+ * Since bfqq does not enjoy device idling, -+ * bfqq->entity.service is equal to the number -+ * of sectors that the process associated with -+ * bfqq requested to read/write before waiting -+ * for request completions, or blocking for -+ * other reasons. -+ */ -+ budget = max_t(int, bfqq->entity.service, min_budget); -+ break; - default: - return; - } -- } else -+ } else if (!bfq_bfqq_sync(bfqq)) - /* -- * Async queues get always the maximum possible budget -- * (their ability to dispatch is limited by -- * @bfqd->bfq_max_budget_async_rq). -+ * Async queues get always the maximum possible -+ * budget, as for them we do not care about latency -+ * (in addition, their ability to dispatch is limited -+ * by the charging factor). - */ - budget = bfqd->bfq_max_budget; - -@@ -1982,160 +2829,120 @@ static void __bfq_bfqq_recalc_budget(struct bfq_data *bfqd, - bfqq->max_budget = min(bfqq->max_budget, bfqd->bfq_max_budget); - - /* -- * Make sure that we have enough budget for the next request. -- * Since the finish time of the bfqq must be kept in sync with -- * the budget, be sure to call __bfq_bfqq_expire() after the -+ * If there is still backlog, then assign a new budget, making -+ * sure that it is large enough for the next request. Since -+ * the finish time of bfqq must be kept in sync with the -+ * budget, be sure to call __bfq_bfqq_expire() *after* this - * update. 
-+ * -+ * If there is no backlog, then no need to update the budget; -+ * it will be updated on the arrival of a new request. - */ - next_rq = bfqq->next_rq; -- if (next_rq) -+ if (next_rq) { -+ BUG_ON(reason == BFQ_BFQQ_TOO_IDLE || -+ reason == BFQ_BFQQ_NO_MORE_REQUESTS); - bfqq->entity.budget = max_t(unsigned long, bfqq->max_budget, - bfq_serv_to_charge(next_rq, bfqq)); -- else -- bfqq->entity.budget = bfqq->max_budget; -+ BUG_ON(!bfq_bfqq_busy(bfqq)); -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ } - - bfq_log_bfqq(bfqd, bfqq, "head sect: %u, new budget %d", - next_rq ? blk_rq_sectors(next_rq) : 0, - bfqq->entity.budget); - } - --static unsigned long bfq_calc_max_budget(u64 peak_rate, u64 timeout) --{ -- unsigned long max_budget; -- -- /* -- * The max_budget calculated when autotuning is equal to the -- * amount of sectors transfered in timeout_sync at the -- * estimated peak rate. -- */ -- max_budget = (unsigned long)(peak_rate * 1000 * -- timeout >> BFQ_RATE_SHIFT); -- -- return max_budget; --} -- - /* -- * In addition to updating the peak rate, checks whether the process -- * is "slow", and returns 1 if so. This slow flag is used, in addition -- * to the budget timeout, to reduce the amount of service provided to -- * seeky processes, and hence reduce their chances to lower the -- * throughput. See the code for more details. -+ * Return true if the process associated with bfqq is "slow". The slow -+ * flag is used, in addition to the budget timeout, to reduce the -+ * amount of service provided to seeky processes, and thus reduce -+ * their chances to lower the throughput. More details in the comments -+ * on the function bfq_bfqq_expire(). 
-+ * -+ * An important observation is in order: as discussed in the comments -+ * on the function bfq_update_peak_rate(), with devices with internal -+ * queues, it is hard if ever possible to know when and for how long -+ * an I/O request is processed by the device (apart from the trivial -+ * I/O pattern where a new request is dispatched only after the -+ * previous one has been completed). This makes it hard to evaluate -+ * the real rate at which the I/O requests of each bfq_queue are -+ * served. In fact, for an I/O scheduler like BFQ, serving a -+ * bfq_queue means just dispatching its requests during its service -+ * slot (i.e., until the budget of the queue is exhausted, or the -+ * queue remains idle, or, finally, a timeout fires). But, during the -+ * service slot of a bfq_queue, around 100 ms at most, the device may -+ * be even still processing requests of bfq_queues served in previous -+ * service slots. On the opposite end, the requests of the in-service -+ * bfq_queue may be completed after the service slot of the queue -+ * finishes. -+ * -+ * Anyway, unless more sophisticated solutions are used -+ * (where possible), the sum of the sizes of the requests dispatched -+ * during the service slot of a bfq_queue is probably the only -+ * approximation available for the service received by the bfq_queue -+ * during its service slot. And this sum is the quantity used in this -+ * function to evaluate the I/O speed of a process. 
- */ --static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- bool compensate, enum bfqq_expiration reason) -+static bool bfq_bfqq_is_slow(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ bool compensate, enum bfqq_expiration reason, -+ unsigned long *delta_ms) - { -- u64 bw, usecs, expected, timeout; -- ktime_t delta; -- int update = 0; -+ ktime_t delta_ktime; -+ u32 delta_usecs; -+ bool slow = BFQQ_SEEKY(bfqq); /* if delta too short, use seekyness */ - -- if (!bfq_bfqq_sync(bfqq) || bfq_bfqq_budget_new(bfqq)) -+ if (!bfq_bfqq_sync(bfqq)) - return false; - - if (compensate) -- delta = bfqd->last_idling_start; -+ delta_ktime = bfqd->last_idling_start; - else -- delta = ktime_get(); -- delta = ktime_sub(delta, bfqd->last_budget_start); -- usecs = ktime_to_us(delta); -- -- /* Don't trust short/unrealistic values. */ -- if (usecs < 100 || usecs >= LONG_MAX) -- return false; -- -- /* -- * Calculate the bandwidth for the last slice. We use a 64 bit -- * value to store the peak rate, in sectors per usec in fixed -- * point math. We do so to have enough precision in the estimate -- * and to avoid overflows. -- */ -- bw = (u64)bfqq->entity.service << BFQ_RATE_SHIFT; -- do_div(bw, (unsigned long)usecs); -+ delta_ktime = ktime_get(); -+ delta_ktime = ktime_sub(delta_ktime, bfqd->last_budget_start); -+ delta_usecs = ktime_to_us(delta_ktime); -+ -+ /* don't trust short/unrealistic values. 
*/ -+ if (delta_usecs < 1000 || delta_usecs >= LONG_MAX) { -+ if (blk_queue_nonrot(bfqd->queue)) -+ /* -+ * give same worst-case guarantees as idling -+ * for seeky -+ */ -+ *delta_ms = BFQ_MIN_TT / NSEC_PER_MSEC; -+ else /* charge at least one seek */ -+ *delta_ms = bfq_slice_idle / NSEC_PER_MSEC; -+ -+ bfq_log(bfqd, "bfq_bfqq_is_slow: unrealistic %u", delta_usecs); -+ -+ return slow; -+ } - -- timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -+ *delta_ms = delta_usecs / USEC_PER_MSEC; - - /* -- * Use only long (> 20ms) intervals to filter out spikes for -- * the peak rate estimation. -+ * Use only long (> 20ms) intervals to filter out excessive -+ * spikes in service rate estimation. - */ -- if (usecs > 20000) { -- if (bw > bfqd->peak_rate || -- (!BFQQ_SEEKY(bfqq) && -- reason == BFQ_BFQQ_BUDGET_TIMEOUT)) { -- bfq_log(bfqd, "measured bw =%llu", bw); -- /* -- * To smooth oscillations use a low-pass filter with -- * alpha=7/8, i.e., -- * new_rate = (7/8) * old_rate + (1/8) * bw -- */ -- do_div(bw, 8); -- if (bw == 0) -- return 0; -- bfqd->peak_rate *= 7; -- do_div(bfqd->peak_rate, 8); -- bfqd->peak_rate += bw; -- update = 1; -- bfq_log(bfqd, "new peak_rate=%llu", bfqd->peak_rate); -- } -- -- update |= bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES - 1; -- -- if (bfqd->peak_rate_samples < BFQ_PEAK_RATE_SAMPLES) -- bfqd->peak_rate_samples++; -- -- if (bfqd->peak_rate_samples == BFQ_PEAK_RATE_SAMPLES && -- update) { -- int dev_type = blk_queue_nonrot(bfqd->queue); -- -- if (bfqd->bfq_user_max_budget == 0) { -- bfqd->bfq_max_budget = -- bfq_calc_max_budget(bfqd->peak_rate, -- timeout); -- bfq_log(bfqd, "new max_budget=%d", -- bfqd->bfq_max_budget); -- } -- if (bfqd->device_speed == BFQ_BFQD_FAST && -- bfqd->peak_rate < device_speed_thresh[dev_type]) { -- bfqd->device_speed = BFQ_BFQD_SLOW; -- bfqd->RT_prod = R_slow[dev_type] * -- T_slow[dev_type]; -- } else if (bfqd->device_speed == BFQ_BFQD_SLOW && -- bfqd->peak_rate > device_speed_thresh[dev_type]) { -- 
bfqd->device_speed = BFQ_BFQD_FAST; -- bfqd->RT_prod = R_fast[dev_type] * -- T_fast[dev_type]; -- } -- } -+ if (delta_usecs > 20000) { -+ /* -+ * Caveat for rotational devices: processes doing I/O -+ * in the slower disk zones tend to be slow(er) even -+ * if not seeky. In this respect, the estimated peak -+ * rate is likely to be an average over the disk -+ * surface. Accordingly, to not be too harsh with -+ * unlucky processes, a process is deemed slow only if -+ * its rate has been lower than half of the estimated -+ * peak rate. -+ */ -+ slow = bfqq->entity.service < bfqd->bfq_max_budget / 2; -+ bfq_log(bfqd, "bfq_bfqq_is_slow: relative rate %d/%d", -+ bfqq->entity.service, bfqd->bfq_max_budget); - } - -- /* -- * If the process has been served for a too short time -- * interval to let its possible sequential accesses prevail on -- * the initial seek time needed to move the disk head on the -- * first sector it requested, then give the process a chance -- * and for the moment return false. -- */ -- if (bfqq->entity.budget <= bfq_max_budget(bfqd) / 8) -- return false; -- -- /* -- * A process is considered ``slow'' (i.e., seeky, so that we -- * cannot treat it fairly in the service domain, as it would -- * slow down too much the other processes) if, when a slice -- * ends for whatever reason, it has received service at a -- * rate that would not be high enough to complete the budget -- * before the budget timeout expiration. -- */ -- expected = bw * 1000 * timeout >> BFQ_RATE_SHIFT; -+ bfq_log_bfqq(bfqd, bfqq, "bfq_bfqq_is_slow: slow %d", slow); - -- /* -- * Caveat: processes doing IO in the slower disk zones will -- * tend to be slow(er) even if not seeky. And the estimated -- * peak rate will actually be an average over the disk -- * surface. Hence, to not be too harsh with unlucky processes, -- * we keep a budget/3 margin of safety before declaring a -- * process slow. 
-- */ -- return expected > (4 * bfqq->entity.budget) / 3; -+ return slow; - } - - /* -@@ -2193,20 +3000,35 @@ static bool bfq_update_peak_rate(struct bfq_data *bfqd, struct bfq_queue *bfqq, - static unsigned long bfq_bfqq_softrt_next_start(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { -+ bfq_log_bfqq(bfqd, bfqq, -+"softrt_next_start: service_blkg %lu soft_rate %u sects/sec interval %u", -+ bfqq->service_from_backlogged, -+ bfqd->bfq_wr_max_softrt_rate, -+ jiffies_to_msecs(HZ * bfqq->service_from_backlogged / -+ bfqd->bfq_wr_max_softrt_rate)); -+ - return max(bfqq->last_idle_bklogged + - HZ * bfqq->service_from_backlogged / - bfqd->bfq_wr_max_softrt_rate, -- jiffies + bfqq->bfqd->bfq_slice_idle + 4); -+ jiffies + nsecs_to_jiffies(bfqq->bfqd->bfq_slice_idle) + 4); - } - - /* -- * Return the largest-possible time instant such that, for as long as possible, -- * the current time will be lower than this time instant according to the macro -- * time_is_before_jiffies(). -+ * Return the farthest future time instant according to jiffies -+ * macros. - */ --static unsigned long bfq_infinity_from_now(unsigned long now) -+static unsigned long bfq_greatest_from_now(void) - { -- return now + ULONG_MAX / 2; -+ return jiffies + MAX_JIFFY_OFFSET; -+} -+ -+/* -+ * Return the farthest past time instant according to jiffies -+ * macros. -+ */ -+static unsigned long bfq_smallest_from_now(void) -+{ -+ return jiffies - MAX_JIFFY_OFFSET; - } - - /** -@@ -2216,28 +3038,24 @@ static unsigned long bfq_infinity_from_now(unsigned long now) - * @compensate: if true, compensate for the time spent idling. - * @reason: the reason causing the expiration. - * -+ * If the process associated with bfqq does slow I/O (e.g., because it -+ * issues random requests), we charge bfqq with the time it has been -+ * in service instead of the service it has received (see -+ * bfq_bfqq_charge_time for details on how this goal is achieved). 
As -+ * a consequence, bfqq will typically get higher timestamps upon -+ * reactivation, and hence it will be rescheduled as if it had -+ * received more service than what it has actually received. In the -+ * end, bfqq receives less service in proportion to how slowly its -+ * associated process consumes its budgets (and hence how seriously it -+ * tends to lower the throughput). In addition, this time-charging -+ * strategy guarantees time fairness among slow processes. In -+ * contrast, if the process associated with bfqq is not slow, we -+ * charge bfqq exactly with the service it has received. - * -- * If the process associated to the queue is slow (i.e., seeky), or in -- * case of budget timeout, or, finally, if it is async, we -- * artificially charge it an entire budget (independently of the -- * actual service it received). As a consequence, the queue will get -- * higher timestamps than the correct ones upon reactivation, and -- * hence it will be rescheduled as if it had received more service -- * than what it actually received. In the end, this class of processes -- * will receive less service in proportion to how slowly they consume -- * their budgets (and hence how seriously they tend to lower the -- * throughput). -- * -- * In contrast, when a queue expires because it has been idling for -- * too much or because it exhausted its budget, we do not touch the -- * amount of service it has received. Hence when the queue will be -- * reactivated and its timestamps updated, the latter will be in sync -- * with the actual service received by the queue until expiration. -- * -- * Charging a full budget to the first type of queues and the exact -- * service to the others has the effect of using the WF2Q+ policy to -- * schedule the former on a timeslice basis, without violating the -- * service domain guarantees of the latter. 
-+ * Charging time to the first type of queues and the exact service to -+ * the other has the effect of using the WF2Q+ policy to schedule the -+ * former on a timeslice basis, without violating service domain -+ * guarantees among the latter. - */ - static void bfq_bfqq_expire(struct bfq_data *bfqd, - struct bfq_queue *bfqq, -@@ -2245,41 +3063,52 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - enum bfqq_expiration reason) - { - bool slow; -+ unsigned long delta = 0; -+ struct bfq_entity *entity = &bfqq->entity; - - BUG_ON(bfqq != bfqd->in_service_queue); - - /* -- * Update disk peak rate for autotuning and check whether the -- * process is slow (see bfq_update_peak_rate). -+ * Check whether the process is slow (see bfq_bfqq_is_slow). -+ */ -+ slow = bfq_bfqq_is_slow(bfqd, bfqq, compensate, reason, &delta); -+ -+ /* -+ * Increase service_from_backlogged before next statement, -+ * because the possible next invocation of -+ * bfq_bfqq_charge_time would likely inflate -+ * entity->service. In contrast, service_from_backlogged must -+ * contain real service, to enable the soft real-time -+ * heuristic to correctly compute the bandwidth consumed by -+ * bfqq. - */ -- slow = bfq_update_peak_rate(bfqd, bfqq, compensate, reason); -+ bfqq->service_from_backlogged += entity->service; - - /* -- * As above explained, 'punish' slow (i.e., seeky), timed-out -- * and async queues, to favor sequential sync workloads. -+ * As above explained, charge slow (typically seeky) and -+ * timed-out queues with the time and not the service -+ * received, to favor sequential workloads. - * -- * Processes doing I/O in the slower disk zones will tend to be -- * slow(er) even if not seeky. Hence, since the estimated peak -- * rate is actually an average over the disk surface, these -- * processes may timeout just for bad luck. To avoid punishing -- * them we do not charge a full budget to a process that -- * succeeded in consuming at least 2/3 of its budget. 
-+ * Processes doing I/O in the slower disk zones will tend to -+ * be slow(er) even if not seeky. Therefore, since the -+ * estimated peak rate is actually an average over the disk -+ * surface, these processes may timeout just for bad luck. To -+ * avoid punishing them, do not charge time to processes that -+ * succeeded in consuming at least 2/3 of their budget. This -+ * allows BFQ to preserve enough elasticity to still perform -+ * bandwidth, and not time, distribution with little unlucky -+ * or quasi-sequential processes. - */ -- if (slow || (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -- bfq_bfqq_budget_left(bfqq) >= bfqq->entity.budget / 3)) -- bfq_bfqq_charge_full_budget(bfqq); -- -- bfqq->service_from_backlogged += bfqq->entity.service; -+ if (bfqq->wr_coeff == 1 && -+ (slow || -+ (reason == BFQ_BFQQ_BUDGET_TIMEOUT && -+ bfq_bfqq_budget_left(bfqq) >= entity->budget / 3))) -+ bfq_bfqq_charge_time(bfqd, bfqq, delta); - -- if (BFQQ_SEEKY(bfqq) && reason == BFQ_BFQQ_BUDGET_TIMEOUT && -- !bfq_bfqq_constantly_seeky(bfqq)) { -- bfq_mark_bfqq_constantly_seeky(bfqq); -- if (!blk_queue_nonrot(bfqd->queue)) -- bfqd->const_seeky_busy_in_flight_queues++; -- } -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - - if (reason == BFQ_BFQQ_TOO_IDLE && -- bfqq->entity.service <= 2 * bfqq->entity.budget / 10) -+ entity->service <= 2 * entity->budget / 10) - bfq_clear_bfqq_IO_bound(bfqq); - - if (bfqd->low_latency && bfqq->wr_coeff == 1) -@@ -2288,19 +3117,23 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - if (bfqd->low_latency && bfqd->bfq_wr_max_softrt_rate > 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) { - /* -- * If we get here, and there are no outstanding requests, -- * then the request pattern is isochronous (see the comments -- * to the function bfq_bfqq_softrt_next_start()). Hence we -- * can compute soft_rt_next_start. 
If, instead, the queue -- * still has outstanding requests, then we have to wait -- * for the completion of all the outstanding requests to -+ * If we get here, and there are no outstanding -+ * requests, then the request pattern is isochronous -+ * (see the comments on the function -+ * bfq_bfqq_softrt_next_start()). Thus we can compute -+ * soft_rt_next_start. If, instead, the queue still -+ * has outstanding requests, then we have to wait for -+ * the completion of all the outstanding requests to - * discover whether the request pattern is actually - * isochronous. - */ -- if (bfqq->dispatched == 0) -+ BUG_ON(bfqd->busy_queues < 1); -+ if (bfqq->dispatched == 0) { - bfqq->soft_rt_next_start = - bfq_bfqq_softrt_next_start(bfqd, bfqq); -- else { -+ bfq_log_bfqq(bfqd, bfqq, "new soft_rt_next %lu", -+ bfqq->soft_rt_next_start); -+ } else { - /* - * The application is still waiting for the - * completion of one or more requests: -@@ -2317,7 +3150,7 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - * happened to be in the past. - */ - bfqq->soft_rt_next_start = -- bfq_infinity_from_now(jiffies); -+ bfq_greatest_from_now(); - /* - * Schedule an update of soft_rt_next_start to when - * the task may be discovered to be isochronous. -@@ -2327,15 +3160,27 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - } - - bfq_log_bfqq(bfqd, bfqq, -- "expire (%d, slow %d, num_disp %d, idle_win %d)", reason, -- slow, bfqq->dispatched, bfq_bfqq_idle_window(bfqq)); -+ "expire (%d, slow %d, num_disp %d, idle_win %d, weight %d)", -+ reason, slow, bfqq->dispatched, -+ bfq_bfqq_idle_window(bfqq), entity->weight); - - /* - * Increase, decrease or leave budget unchanged according to - * reason. 
- */ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - __bfq_bfqq_recalc_budget(bfqd, bfqq, reason); -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); - __bfq_bfqq_expire(bfqd, bfqq); -+ -+ BUG_ON(!bfq_bfqq_busy(bfqq) && reason == BFQ_BFQQ_BUDGET_EXHAUSTED && -+ !bfq_class_idle(bfqq)); -+ -+ if (!bfq_bfqq_busy(bfqq) && -+ reason != BFQ_BFQQ_BUDGET_TIMEOUT && -+ reason != BFQ_BFQQ_BUDGET_EXHAUSTED) -+ bfq_mark_bfqq_non_blocking_wait_rq(bfqq); - } - - /* -@@ -2345,20 +3190,17 @@ static void bfq_bfqq_expire(struct bfq_data *bfqd, - */ - static bool bfq_bfqq_budget_timeout(struct bfq_queue *bfqq) - { -- if (bfq_bfqq_budget_new(bfqq) || -- time_before(jiffies, bfqq->budget_timeout)) -- return false; -- return true; -+ return time_is_before_eq_jiffies(bfqq->budget_timeout); - } - - /* -- * If we expire a queue that is waiting for the arrival of a new -- * request, we may prevent the fictitious timestamp back-shifting that -- * allows the guarantees of the queue to be preserved (see [1] for -- * this tricky aspect). Hence we return true only if this condition -- * does not hold, or if the queue is slow enough to deserve only to be -- * kicked off for preserving a high throughput. --*/ -+ * If we expire a queue that is actively waiting (i.e., with the -+ * device idled) for the arrival of a new request, then we may incur -+ * the timestamp misalignment problem described in the body of the -+ * function __bfq_activate_entity. Hence we return true only if this -+ * condition does not hold, or if the queue is slow enough to deserve -+ * only to be kicked off for preserving a high throughput. 
-+ */ - static bool bfq_may_expire_for_budg_timeout(struct bfq_queue *bfqq) - { - bfq_log_bfqq(bfqq->bfqd, bfqq, -@@ -2400,10 +3242,12 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - { - struct bfq_data *bfqd = bfqq->bfqd; - bool idling_boosts_thr, idling_boosts_thr_without_issues, -- all_queues_seeky, on_hdd_and_not_all_queues_seeky, - idling_needed_for_service_guarantees, - asymmetric_scenario; - -+ if (bfqd->strict_guarantees) -+ return true; -+ - /* - * The next variable takes into account the cases where idling - * boosts the throughput. -@@ -2466,74 +3310,27 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - bfqd->wr_busy_queues == 0; - - /* -- * There are then two cases where idling must be performed not -+ * There is then a case where idling must be performed not - * for throughput concerns, but to preserve service -- * guarantees. In the description of these cases, we say, for -- * short, that a queue is sequential/random if the process -- * associated to the queue issues sequential/random requests -- * (in the second case the queue may be tagged as seeky or -- * even constantly_seeky). -- * -- * To introduce the first case, we note that, since -- * bfq_bfqq_idle_window(bfqq) is false if the device is -- * NCQ-capable and bfqq is random (see -- * bfq_update_idle_window()), then, from the above two -- * assignments it follows that -- * idling_boosts_thr_without_issues is false if the device is -- * NCQ-capable and bfqq is random. Therefore, for this case, -- * device idling would never be allowed if we used just -- * idling_boosts_thr_without_issues to decide whether to allow -- * it. And, beneficially, this would imply that throughput -- * would always be boosted also with random I/O on NCQ-capable -- * HDDs. -+ * guarantees. - * -- * But we must be careful on this point, to avoid an unfair -- * treatment for bfqq. 
In fact, because of the same above -- * assignments, idling_boosts_thr_without_issues is, on the -- * other hand, true if 1) the device is an HDD and bfqq is -- * sequential, and 2) there are no busy weight-raised -- * queues. As a consequence, if we used just -- * idling_boosts_thr_without_issues to decide whether to idle -- * the device, then with an HDD we might easily bump into a -- * scenario where queues that are sequential and I/O-bound -- * would enjoy idling, whereas random queues would not. The -- * latter might then get a low share of the device throughput, -- * simply because the former would get many requests served -- * after being set as in service, while the latter would not. -- * -- * To address this issue, we start by setting to true a -- * sentinel variable, on_hdd_and_not_all_queues_seeky, if the -- * device is rotational and not all queues with pending or -- * in-flight requests are constantly seeky (i.e., there are -- * active sequential queues, and bfqq might then be mistreated -- * if it does not enjoy idling because it is random). -- */ -- all_queues_seeky = bfq_bfqq_constantly_seeky(bfqq) && -- bfqd->busy_in_flight_queues == -- bfqd->const_seeky_busy_in_flight_queues; -- -- on_hdd_and_not_all_queues_seeky = -- !blk_queue_nonrot(bfqd->queue) && !all_queues_seeky; -- -- /* -- * To introduce the second case where idling needs to be -- * performed to preserve service guarantees, we can note that -- * allowing the drive to enqueue more than one request at a -- * time, and hence delegating de facto final scheduling -- * decisions to the drive's internal scheduler, causes loss of -- * control on the actual request service order. In particular, -- * the critical situation is when requests from different -- * processes happens to be present, at the same time, in the -- * internal queue(s) of the drive. 
In such a situation, the -- * drive, by deciding the service order of the -- * internally-queued requests, does determine also the actual -- * throughput distribution among these processes. But the -- * drive typically has no notion or concern about per-process -- * throughput distribution, and makes its decisions only on a -- * per-request basis. Therefore, the service distribution -- * enforced by the drive's internal scheduler is likely to -- * coincide with the desired device-throughput distribution -- * only in a completely symmetric scenario where: -+ * To introduce this case, we can note that allowing the drive -+ * to enqueue more than one request at a time, and hence -+ * delegating de facto final scheduling decisions to the -+ * drive's internal scheduler, entails loss of control on the -+ * actual request service order. In particular, the critical -+ * situation is when requests from different processes happen -+ * to be present, at the same time, in the internal queue(s) -+ * of the drive. In such a situation, the drive, by deciding -+ * the service order of the internally-queued requests, does -+ * determine also the actual throughput distribution among -+ * these processes. But the drive typically has no notion or -+ * concern about per-process throughput distribution, and -+ * makes its decisions only on a per-request basis. Therefore, -+ * the service distribution enforced by the drive's internal -+ * scheduler is likely to coincide with the desired -+ * device-throughput distribution only in a completely -+ * symmetric scenario where: - * (i) each of these processes must get the same throughput as - * the others; - * (ii) all these processes have the same I/O pattern -@@ -2555,26 +3352,53 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * words, only if sub-condition (i) holds, then idling is - * allowed, and the device tends to be prevented from queueing - * many requests, possibly of several processes. 
The reason -- * for not controlling also sub-condition (ii) is that, first, -- * in the case of an HDD, the asymmetry in terms of types of -- * I/O patterns is already taken in to account in the above -- * sentinel variable -- * on_hdd_and_not_all_queues_seeky. Secondly, in the case of a -- * flash-based device, we prefer however to privilege -- * throughput (and idling lowers throughput for this type of -- * devices), for the following reasons: -- * 1) differently from HDDs, the service time of random -- * requests is not orders of magnitudes lower than the service -- * time of sequential requests; thus, even if processes doing -- * sequential I/O get a preferential treatment with respect to -- * others doing random I/O, the consequences are not as -- * dramatic as with HDDs; -- * 2) if a process doing random I/O does need strong -- * throughput guarantees, it is hopefully already being -- * weight-raised, or the user is likely to have assigned it a -- * higher weight than the other processes (and thus -- * sub-condition (i) is likely to be false, which triggers -- * idling). -+ * for not controlling also sub-condition (ii) is that we -+ * exploit preemption to preserve guarantees in case of -+ * symmetric scenarios, even if (ii) does not hold, as -+ * explained in the next two paragraphs. -+ * -+ * Even if a queue, say Q, is expired when it remains idle, Q -+ * can still preempt the new in-service queue if the next -+ * request of Q arrives soon (see the comments on -+ * bfq_bfqq_update_budg_for_activation). If all queues and -+ * groups have the same weight, this form of preemption, -+ * combined with the hole-recovery heuristic described in the -+ * comments on function bfq_bfqq_update_budg_for_activation, -+ * are enough to preserve a correct bandwidth distribution in -+ * the mid term, even without idling. 
In fact, even if not -+ * idling allows the internal queues of the device to contain -+ * many requests, and thus to reorder requests, we can rather -+ * safely assume that the internal scheduler still preserves a -+ * minimum of mid-term fairness. The motivation for using -+ * preemption instead of idling is that, by not idling, -+ * service guarantees are preserved without minimally -+ * sacrificing throughput. In other words, both a high -+ * throughput and its desired distribution are obtained. -+ * -+ * More precisely, this preemption-based, idleless approach -+ * provides fairness in terms of IOPS, and not sectors per -+ * second. This can be seen with a simple example. Suppose -+ * that there are two queues with the same weight, but that -+ * the first queue receives requests of 8 sectors, while the -+ * second queue receives requests of 1024 sectors. In -+ * addition, suppose that each of the two queues contains at -+ * most one request at a time, which implies that each queue -+ * always remains idle after it is served. Finally, after -+ * remaining idle, each queue receives very quickly a new -+ * request. It follows that the two queues are served -+ * alternatively, preempting each other if needed. This -+ * implies that, although both queues have the same weight, -+ * the queue with large requests receives a service that is -+ * 1024/8 times as high as the service received by the other -+ * queue. -+ * -+ * On the other hand, device idling is performed, and thus -+ * pure sector-domain guarantees are provided, for the -+ * following queues, which are likely to need stronger -+ * throughput guarantees: weight-raised queues, and queues -+ * with a higher weight than other queues. When such queues -+ * are active, sub-condition (i) is false, which triggers -+ * device idling. - * - * According to the above considerations, the next variable is - * true (only) if sub-condition (i) holds. 
To compute the -@@ -2582,7 +3406,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * the function bfq_symmetric_scenario(), but also check - * whether bfqq is being weight-raised, because - * bfq_symmetric_scenario() does not take into account also -- * weight-raised queues (see comments to -+ * weight-raised queues (see comments on - * bfq_weights_tree_add()). - * - * As a side note, it is worth considering that the above -@@ -2604,17 +3428,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * bfqq. Such a case is when bfqq became active in a burst of - * queue activations. Queues that became active during a large - * burst benefit only from throughput, as discussed in the -- * comments to bfq_handle_burst. Thus, if bfqq became active -+ * comments on bfq_handle_burst. Thus, if bfqq became active - * in a burst and not idling the device maximizes throughput, - * then the device must no be idled, because not idling the - * device provides bfqq and all other queues in the burst with -- * maximum benefit. Combining this and the two cases above, we -- * can now establish when idling is actually needed to -- * preserve service guarantees. -+ * maximum benefit. Combining this and the above case, we can -+ * now establish when idling is actually needed to preserve -+ * service guarantees. - */ - idling_needed_for_service_guarantees = -- (on_hdd_and_not_all_queues_seeky || asymmetric_scenario) && -- !bfq_bfqq_in_large_burst(bfqq); -+ asymmetric_scenario && !bfq_bfqq_in_large_burst(bfqq); - - /* - * We have now all the components we need to compute the return -@@ -2624,6 +3447,16 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * 2) idling either boosts the throughput (without issues), or - * is necessary to preserve service guarantees. 
- */ -+ bfq_log_bfqq(bfqd, bfqq, "may_idle: sync %d idling_boosts_thr %d", -+ bfq_bfqq_sync(bfqq), idling_boosts_thr); -+ -+ bfq_log_bfqq(bfqd, bfqq, -+ "may_idle: wr_busy %d boosts %d IO-bound %d guar %d", -+ bfqd->wr_busy_queues, -+ idling_boosts_thr_without_issues, -+ bfq_bfqq_IO_bound(bfqq), -+ idling_needed_for_service_guarantees); -+ - return bfq_bfqq_sync(bfqq) && - (idling_boosts_thr_without_issues || - idling_needed_for_service_guarantees); -@@ -2635,7 +3468,7 @@ static bool bfq_bfqq_may_idle(struct bfq_queue *bfqq) - * 1) the queue must remain in service and cannot be expired, and - * 2) the device must be idled to wait for the possible arrival of a new - * request for the queue. -- * See the comments to the function bfq_bfqq_may_idle for the reasons -+ * See the comments on the function bfq_bfqq_may_idle for the reasons - * why performing device idling is the best choice to boost the throughput - * and preserve service guarantees when bfq_bfqq_may_idle itself - * returns true. -@@ -2665,18 +3498,33 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_log_bfqq(bfqd, bfqq, "select_queue: already in-service queue"); - - if (bfq_may_expire_for_budg_timeout(bfqq) && -- !timer_pending(&bfqd->idle_slice_timer) && -+ !hrtimer_active(&bfqd->idle_slice_timer) && - !bfq_bfqq_must_idle(bfqq)) - goto expire; - -+check_queue: -+ /* -+ * This loop is rarely executed more than once. Even when it -+ * happens, it is much more convenient to re-execute this loop -+ * than to return NULL and trigger a new dispatch to get a -+ * request served. -+ */ - next_rq = bfqq->next_rq; - /* - * If bfqq has requests queued and it has enough budget left to - * serve them, keep the queue, otherwise expire it. 
- */ - if (next_rq) { -+ BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -+ - if (bfq_serv_to_charge(next_rq, bfqq) > - bfq_bfqq_budget_left(bfqq)) { -+ /* -+ * Expire the queue for budget exhaustion, -+ * which makes sure that the next budget is -+ * enough to serve the next request, even if -+ * it comes from the fifo expired path. -+ */ - reason = BFQ_BFQQ_BUDGET_EXHAUSTED; - goto expire; - } else { -@@ -2685,7 +3533,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * not disable disk idling even when a new request - * arrives. - */ -- if (timer_pending(&bfqd->idle_slice_timer)) { -+ if (bfq_bfqq_wait_request(bfqq)) { -+ BUG_ON(!hrtimer_active(&bfqd->idle_slice_timer)); - /* - * If we get here: 1) at least a new request - * has arrived but we have not disabled the -@@ -2700,10 +3549,8 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * So we disable idling. - */ - bfq_clear_bfqq_wait_request(bfqq); -- del_timer(&bfqd->idle_slice_timer); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); --#endif - } - goto keep_queue; - } -@@ -2714,7 +3561,7 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - * for a new request, or has requests waiting for a completion and - * may idle after their completion, then keep it anyway. - */ -- if (timer_pending(&bfqd->idle_slice_timer) || -+ if (hrtimer_active(&bfqd->idle_slice_timer) || - (bfqq->dispatched != 0 && bfq_bfqq_may_idle(bfqq))) { - bfqq = NULL; - goto keep_queue; -@@ -2725,9 +3572,16 @@ static struct bfq_queue *bfq_select_queue(struct bfq_data *bfqd) - bfq_bfqq_expire(bfqd, bfqq, false, reason); - new_queue: - bfqq = bfq_set_in_service_queue(bfqd); -- bfq_log(bfqd, "select_queue: new queue %d returned", -- bfqq ? 
bfqq->pid : 0); -+ if (bfqq) { -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: checking new queue"); -+ goto check_queue; -+ } - keep_queue: -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, "select_queue: returned this queue"); -+ else -+ bfq_log(bfqd, "select_queue: no queue returned"); -+ - return bfqq; - } - -@@ -2736,6 +3590,9 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - struct bfq_entity *entity = &bfqq->entity; - - if (bfqq->wr_coeff > 1) { /* queue is being weight-raised */ -+ BUG_ON(bfqq->wr_cur_max_time == bfqd->bfq_wr_rt_max_time && -+ time_is_after_jiffies(bfqq->last_wr_start_finish)); -+ - bfq_log_bfqq(bfqd, bfqq, - "raising period dur %u/%u msec, old coeff %u, w %d(%d)", - jiffies_to_msecs(jiffies - bfqq->last_wr_start_finish), -@@ -2749,22 +3606,30 @@ static void bfq_update_wr_data(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_log_bfqq(bfqd, bfqq, "WARN: pending prio change"); - - /* -- * If the queue was activated in a burst, or -- * too much time has elapsed from the beginning -- * of this weight-raising period, or the queue has -- * exceeded the acceptable number of cooperations, -- * then end weight raising. -+ * If the queue was activated in a burst, or too much -+ * time has elapsed from the beginning of this -+ * weight-raising period, then end weight raising. 
- */ -- if (bfq_bfqq_in_large_burst(bfqq) || -- bfq_bfqq_cooperations(bfqq) >= bfqd->bfq_coop_thresh || -- time_is_before_jiffies(bfqq->last_wr_start_finish + -- bfqq->wr_cur_max_time)) { -- bfqq->last_wr_start_finish = jiffies; -- bfq_log_bfqq(bfqd, bfqq, -- "wrais ending at %lu, rais_max_time %u", -- bfqq->last_wr_start_finish, -- jiffies_to_msecs(bfqq->wr_cur_max_time)); -+ if (bfq_bfqq_in_large_burst(bfqq)) - bfq_bfqq_end_wr(bfqq); -+ else if (time_is_before_jiffies(bfqq->last_wr_start_finish + -+ bfqq->wr_cur_max_time)) { -+ if (bfqq->wr_cur_max_time != bfqd->bfq_wr_rt_max_time || -+ time_is_before_jiffies(bfqq->wr_start_at_switch_to_srt + -+ bfq_wr_duration(bfqd))) -+ bfq_bfqq_end_wr(bfqq); -+ else { -+ /* switch back to interactive wr */ -+ bfqq->wr_coeff = bfqd->bfq_wr_coeff; -+ bfqq->wr_cur_max_time = bfq_wr_duration(bfqd); -+ bfqq->last_wr_start_finish = -+ bfqq->wr_start_at_switch_to_srt; -+ BUG_ON(time_is_after_jiffies( -+ bfqq->last_wr_start_finish)); -+ bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqd, bfqq, -+ "back to interactive wr"); -+ } - } - } - /* Update weight both if it must be raised and if it must be lowered */ -@@ -2782,46 +3647,34 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - struct bfq_queue *bfqq) - { - int dispatched = 0; -- struct request *rq; -+ struct request *rq = bfqq->next_rq; - unsigned long service_to_charge; - - BUG_ON(RB_EMPTY_ROOT(&bfqq->sort_list)); -- -- /* Follow expired path, else get first next available. */ -- rq = bfq_check_fifo(bfqq); -- if (!rq) -- rq = bfqq->next_rq; -+ BUG_ON(!rq); - service_to_charge = bfq_serv_to_charge(rq, bfqq); - -- if (service_to_charge > bfq_bfqq_budget_left(bfqq)) { -- /* -- * This may happen if the next rq is chosen in fifo order -- * instead of sector order. The budget is properly -- * dimensioned to be always sufficient to serve the next -- * request only if it is chosen in sector order. 
The reason -- * is that it would be quite inefficient and little useful -- * to always make sure that the budget is large enough to -- * serve even the possible next rq in fifo order. -- * In fact, requests are seldom served in fifo order. -- * -- * Expire the queue for budget exhaustion, and make sure -- * that the next act_budget is enough to serve the next -- * request, even if it comes from the fifo expired path. -- */ -- bfqq->next_rq = rq; -- /* -- * Since this dispatch is failed, make sure that -- * a new one will be performed -- */ -- if (!bfqd->rq_in_driver) -- bfq_schedule_dispatch(bfqd); -- goto expire; -- } -+ BUG_ON(service_to_charge > bfq_bfqq_budget_left(bfqq)); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- /* Finally, insert request into driver dispatch list. */ - bfq_bfqq_served(bfqq, service_to_charge); -+ -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); -+ - bfq_dispatch_insert(bfqd->queue, rq); - -+ /* -+ * If weight raising has to terminate for bfqq, then next -+ * function causes an immediate update of bfqq's weight, -+ * without waiting for next activation. As a consequence, on -+ * expiration, bfqq will be timestamped as if has never been -+ * weight-raised during this service slot, even if it has -+ * received part or even most of the service as a -+ * weight-raised queue. This inflates bfqq's timestamps, which -+ * is beneficial, as bfqq is then more willing to leave the -+ * device immediately to possible other weight-raised queues. 
-+ */ - bfq_update_wr_data(bfqd, bfqq); - - bfq_log_bfqq(bfqd, bfqq, -@@ -2837,9 +3690,7 @@ static int bfq_dispatch_request(struct bfq_data *bfqd, - bfqd->in_service_bic = RQ_BIC(rq); - } - -- if (bfqd->busy_queues > 1 && ((!bfq_bfqq_sync(bfqq) && -- dispatched >= bfqd->bfq_max_budget_async_rq) || -- bfq_class_idle(bfqq))) -+ if (bfqd->busy_queues > 1 && bfq_class_idle(bfqq)) - goto expire; - - return dispatched; -@@ -2885,8 +3736,8 @@ static int bfq_forced_dispatch(struct bfq_data *bfqd) - st = bfq_entity_service_tree(&bfqq->entity); - - dispatched += __bfq_forced_dispatch_bfqq(bfqq); -- bfqq->max_budget = bfq_max_budget(bfqd); - -+ bfqq->max_budget = bfq_max_budget(bfqd); - bfq_forget_idle(st); - } - -@@ -2899,37 +3750,37 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; -- int max_dispatch; - - bfq_log(bfqd, "dispatch requests: %d busy queues", bfqd->busy_queues); -+ - if (bfqd->busy_queues == 0) - return 0; - - if (unlikely(force)) - return bfq_forced_dispatch(bfqd); - -+ /* -+ * Force device to serve one request at a time if -+ * strict_guarantees is true. Forcing this service scheme is -+ * currently the ONLY way to guarantee that the request -+ * service order enforced by the scheduler is respected by a -+ * queueing device. Otherwise the device is free even to make -+ * some unlucky request wait for as long as the device -+ * wishes. -+ * -+ * Of course, serving one request at at time may cause loss of -+ * throughput. 
-+ */ -+ if (bfqd->strict_guarantees && bfqd->rq_in_driver > 0) -+ return 0; -+ - bfqq = bfq_select_queue(bfqd); - if (!bfqq) - return 0; - -- if (bfq_class_idle(bfqq)) -- max_dispatch = 1; -- -- if (!bfq_bfqq_sync(bfqq)) -- max_dispatch = bfqd->bfq_max_budget_async_rq; -- -- if (!bfq_bfqq_sync(bfqq) && bfqq->dispatched >= max_dispatch) { -- if (bfqd->busy_queues > 1) -- return 0; -- if (bfqq->dispatched >= 4 * max_dispatch) -- return 0; -- } -- -- if (bfqd->sync_flight != 0 && !bfq_bfqq_sync(bfqq)) -- return 0; -+ BUG_ON(bfqq->entity.budget < bfqq->entity.service); - -- bfq_clear_bfqq_wait_request(bfqq); -- BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ BUG_ON(bfq_bfqq_wait_request(bfqq)); - - if (!bfq_dispatch_request(bfqd, bfqq)) - return 0; -@@ -2937,6 +3788,8 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - bfq_log_bfqq(bfqd, bfqq, "dispatched %s request", - bfq_bfqq_sync(bfqq) ? "sync" : "async"); - -+ BUG_ON(bfqq->next_rq == NULL && -+ bfqq->entity.budget < bfqq->entity.service); - return 1; - } - -@@ -2948,23 +3801,21 @@ static int bfq_dispatch_requests(struct request_queue *q, int force) - */ - static void bfq_put_queue(struct bfq_queue *bfqq) - { -- struct bfq_data *bfqd = bfqq->bfqd; - #ifdef CONFIG_BFQ_GROUP_IOSCHED - struct bfq_group *bfqg = bfqq_group(bfqq); - #endif - -- BUG_ON(atomic_read(&bfqq->ref) <= 0); -+ BUG_ON(bfqq->ref <= 0); - -- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p %d", bfqq, -- atomic_read(&bfqq->ref)); -- if (!atomic_dec_and_test(&bfqq->ref)) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p %d", bfqq, bfqq->ref); -+ bfqq->ref--; -+ if (bfqq->ref) - return; - - BUG_ON(rb_first(&bfqq->sort_list)); - BUG_ON(bfqq->allocated[READ] + bfqq->allocated[WRITE] != 0); - BUG_ON(bfqq->entity.tree); - BUG_ON(bfq_bfqq_busy(bfqq)); -- BUG_ON(bfqd->in_service_queue == bfqq); - - if (bfq_bfqq_sync(bfqq)) - /* -@@ -2977,7 +3828,7 @@ static void bfq_put_queue(struct bfq_queue *bfqq) - */ - 
hlist_del_init(&bfqq->burst_list_node); - -- bfq_log_bfqq(bfqd, bfqq, "put_queue: %p freed", bfqq); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "put_queue: %p freed", bfqq); - - kmem_cache_free(bfq_pool, bfqq); - #ifdef CONFIG_BFQ_GROUP_IOSCHED -@@ -3011,8 +3862,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_schedule_dispatch(bfqd); - } - -- bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, -- atomic_read(&bfqq->ref)); -+ bfq_log_bfqq(bfqd, bfqq, "exit_bfqq: %p, %d", bfqq, bfqq->ref); - - bfq_put_cooperator(bfqq); - -@@ -3021,28 +3871,7 @@ static void bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - - static void bfq_init_icq(struct io_cq *icq) - { -- struct bfq_io_cq *bic = icq_to_bic(icq); -- -- bic->ttime.last_end_request = jiffies; -- /* -- * A newly created bic indicates that the process has just -- * started doing I/O, and is probably mapping into memory its -- * executable and libraries: it definitely needs weight raising. -- * There is however the possibility that the process performs, -- * for a while, I/O close to some other process. EQM intercepts -- * this behavior and may merge the queue corresponding to the -- * process with some other queue, BEFORE the weight of the queue -- * is raised. Merged queues are not weight-raised (they are assumed -- * to belong to processes that benefit only from high throughput). -- * If the merge is basically the consequence of an accident, then -- * the queue will be split soon and will get back its old weight. -- * It is then important to write down somewhere that this queue -- * does need weight raising, even if it did not make it to get its -- * weight raised before being merged. To this purpose, we overload -- * the field raising_time_left and assign 1 to it, to mark the queue -- * as needing weight raising. 
-- */ -- bic->wr_time_left = 1; -+ icq_to_bic(icq)->ttime.last_end_request = ktime_get_ns() - (1ULL<<32); - } - - static void bfq_exit_icq(struct io_cq *icq) -@@ -3050,21 +3879,21 @@ static void bfq_exit_icq(struct io_cq *icq) - struct bfq_io_cq *bic = icq_to_bic(icq); - struct bfq_data *bfqd = bic_to_bfqd(bic); - -- if (bic->bfqq[BLK_RW_ASYNC]) { -- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_ASYNC]); -- bic->bfqq[BLK_RW_ASYNC] = NULL; -+ if (bic_to_bfqq(bic, false)) { -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, false)); -+ bic_set_bfqq(bic, NULL, false); - } - -- if (bic->bfqq[BLK_RW_SYNC]) { -+ if (bic_to_bfqq(bic, true)) { - /* - * If the bic is using a shared queue, put the reference - * taken on the io_context when the bic started using a - * shared bfq_queue. - */ -- if (bfq_bfqq_coop(bic->bfqq[BLK_RW_SYNC])) -+ if (bfq_bfqq_coop(bic_to_bfqq(bic, true))) - put_io_context(icq->ioc); -- bfq_exit_bfqq(bfqd, bic->bfqq[BLK_RW_SYNC]); -- bic->bfqq[BLK_RW_SYNC] = NULL; -+ bfq_exit_bfqq(bfqd, bic_to_bfqq(bic, true)); -+ bic_set_bfqq(bic, NULL, true); - } - } - -@@ -3072,8 +3901,8 @@ static void bfq_exit_icq(struct io_cq *icq) - * Update the entity prio values; note that the new values will not - * be used until the next (re)activation. 
- */ --static void --bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) -+static void bfq_set_next_ioprio_data(struct bfq_queue *bfqq, -+ struct bfq_io_cq *bic) - { - struct task_struct *tsk = current; - int ioprio_class; -@@ -3105,7 +3934,7 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - break; - } - -- if (bfqq->new_ioprio < 0 || bfqq->new_ioprio >= IOPRIO_BE_NR) { -+ if (bfqq->new_ioprio >= IOPRIO_BE_NR) { - pr_crit("bfq_set_next_ioprio_data: new_ioprio %d\n", - bfqq->new_ioprio); - BUG(); -@@ -3113,45 +3942,40 @@ bfq_set_next_ioprio_data(struct bfq_queue *bfqq, struct bfq_io_cq *bic) - - bfqq->entity.new_weight = bfq_ioprio_to_weight(bfqq->new_ioprio); - bfqq->entity.prio_changed = 1; -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "set_next_ioprio_data: bic_class %d prio %d class %d", -+ ioprio_class, bfqq->new_ioprio, bfqq->new_ioprio_class); - } - - static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio) - { -- struct bfq_data *bfqd; -- struct bfq_queue *bfqq, *new_bfqq; -+ struct bfq_data *bfqd = bic_to_bfqd(bic); -+ struct bfq_queue *bfqq; - unsigned long uninitialized_var(flags); - int ioprio = bic->icq.ioc->ioprio; - -- bfqd = bfq_get_bfqd_locked(&(bic->icq.q->elevator->elevator_data), -- &flags); - /* - * This condition may trigger on a newly created bic, be sure to - * drop the lock before returning. 
- */ - if (unlikely(!bfqd) || likely(bic->ioprio == ioprio)) -- goto out; -+ return; - - bic->ioprio = ioprio; - -- bfqq = bic->bfqq[BLK_RW_ASYNC]; -+ bfqq = bic_to_bfqq(bic, false); - if (bfqq) { -- new_bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic, -- GFP_ATOMIC); -- if (new_bfqq) { -- bic->bfqq[BLK_RW_ASYNC] = new_bfqq; -- bfq_log_bfqq(bfqd, bfqq, -- "check_ioprio_change: bfqq %p %d", -- bfqq, atomic_read(&bfqq->ref)); -- bfq_put_queue(bfqq); -- } -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, BLK_RW_ASYNC, bic); -+ bic_set_bfqq(bic, bfqq, false); -+ bfq_log_bfqq(bfqd, bfqq, -+ "check_ioprio_change: bfqq %p %d", -+ bfqq, bfqq->ref); - } - -- bfqq = bic->bfqq[BLK_RW_SYNC]; -+ bfqq = bic_to_bfqq(bic, true); - if (bfqq) - bfq_set_next_ioprio_data(bfqq, bic); -- --out: -- bfq_put_bfqd_unlock(bfqd, &flags); - } - - static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -@@ -3160,8 +3984,9 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - RB_CLEAR_NODE(&bfqq->entity.rb_node); - INIT_LIST_HEAD(&bfqq->fifo); - INIT_HLIST_NODE(&bfqq->burst_list_node); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); - -- atomic_set(&bfqq->ref, 0); -+ bfqq->ref = 0; - bfqq->bfqd = bfqd; - - if (bic) -@@ -3171,6 +3996,7 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - if (!bfq_class_idle(bfqq)) - bfq_mark_bfqq_idle_window(bfqq); - bfq_mark_bfqq_sync(bfqq); -+ bfq_mark_bfqq_just_created(bfqq); - } else - bfq_clear_bfqq_sync(bfqq); - bfq_mark_bfqq_IO_bound(bfqq); -@@ -3180,72 +4006,19 @@ static void bfq_init_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, - bfqq->pid = pid; - - bfqq->wr_coeff = 1; -- bfqq->last_wr_start_finish = 0; -+ bfqq->last_wr_start_finish = jiffies; -+ bfqq->wr_start_at_switch_to_srt = bfq_smallest_from_now(); -+ bfqq->budget_timeout = bfq_smallest_from_now(); -+ bfqq->split_time = bfq_smallest_from_now(); -+ - /* - * Set to the value for which bfqq will not be deemed as - 
* soft rt when it becomes backlogged. - */ -- bfqq->soft_rt_next_start = bfq_infinity_from_now(jiffies); --} -- --static struct bfq_queue *bfq_find_alloc_queue(struct bfq_data *bfqd, -- struct bio *bio, int is_sync, -- struct bfq_io_cq *bic, -- gfp_t gfp_mask) --{ -- struct bfq_group *bfqg; -- struct bfq_queue *bfqq, *new_bfqq = NULL; -- struct blkcg *blkcg; -- --retry: -- rcu_read_lock(); -- -- blkcg = bio_blkcg(bio); -- bfqg = bfq_find_alloc_group(bfqd, blkcg); -- /* bic always exists here */ -- bfqq = bic_to_bfqq(bic, is_sync); -- -- /* -- * Always try a new alloc if we fall back to the OOM bfqq -- * originally, since it should just be a temporary situation. -- */ -- if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- bfqq = NULL; -- if (new_bfqq) { -- bfqq = new_bfqq; -- new_bfqq = NULL; -- } else if (gfpflags_allow_blocking(gfp_mask)) { -- rcu_read_unlock(); -- spin_unlock_irq(bfqd->queue->queue_lock); -- new_bfqq = kmem_cache_alloc_node(bfq_pool, -- gfp_mask | __GFP_ZERO, -- bfqd->queue->node); -- spin_lock_irq(bfqd->queue->queue_lock); -- if (new_bfqq) -- goto retry; -- } else { -- bfqq = kmem_cache_alloc_node(bfq_pool, -- gfp_mask | __GFP_ZERO, -- bfqd->queue->node); -- } -- -- if (bfqq) { -- bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -- is_sync); -- bfq_init_entity(&bfqq->entity, bfqg); -- bfq_log_bfqq(bfqd, bfqq, "allocated"); -- } else { -- bfqq = &bfqd->oom_bfqq; -- bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -- } -- } -- -- if (new_bfqq) -- kmem_cache_free(bfq_pool, new_bfqq); -- -- rcu_read_unlock(); -+ bfqq->soft_rt_next_start = bfq_greatest_from_now(); - -- return bfqq; -+ /* first request is almost certainly seeky */ -+ bfqq->seek_history = 1; - } - - static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, -@@ -3268,90 +4041,93 @@ static struct bfq_queue **bfq_async_queue_prio(struct bfq_data *bfqd, - } - - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -- struct bio *bio, int is_sync, -- struct bfq_io_cq *bic, gfp_t gfp_mask) 
-+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic) - { - const int ioprio = IOPRIO_PRIO_DATA(bic->ioprio); - const int ioprio_class = IOPRIO_PRIO_CLASS(bic->ioprio); - struct bfq_queue **async_bfqq = NULL; -- struct bfq_queue *bfqq = NULL; -+ struct bfq_queue *bfqq; -+ struct bfq_group *bfqg; - -- if (!is_sync) { -- struct blkcg *blkcg; -- struct bfq_group *bfqg; -+ rcu_read_lock(); -+ -+ bfqg = bfq_find_set_group(bfqd, bio_blkcg(bio)); -+ if (!bfqg) { -+ bfqq = &bfqd->oom_bfqq; -+ goto out; -+ } - -- rcu_read_lock(); -- blkcg = bio_blkcg(bio); -- rcu_read_unlock(); -- bfqg = bfq_find_alloc_group(bfqd, blkcg); -+ if (!is_sync) { - async_bfqq = bfq_async_queue_prio(bfqd, bfqg, ioprio_class, - ioprio); - bfqq = *async_bfqq; -+ if (bfqq) -+ goto out; - } - -- if (!bfqq) -- bfqq = bfq_find_alloc_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ bfqq = kmem_cache_alloc_node(bfq_pool, -+ GFP_NOWAIT | __GFP_ZERO | __GFP_NOWARN, -+ bfqd->queue->node); -+ -+ if (bfqq) { -+ bfq_init_bfqq(bfqd, bfqq, bic, current->pid, -+ is_sync); -+ bfq_init_entity(&bfqq->entity, bfqg); -+ bfq_log_bfqq(bfqd, bfqq, "allocated"); -+ } else { -+ bfqq = &bfqd->oom_bfqq; -+ bfq_log_bfqq(bfqd, bfqq, "using oom bfqq"); -+ goto out; -+ } - - /* - * Pin the queue now that it's allocated, scheduler exit will - * prune it. - */ -- if (!is_sync && !(*async_bfqq)) { -- atomic_inc(&bfqq->ref); -+ if (async_bfqq) { -+ bfqq->ref++; /* -+ * Extra group reference, w.r.t. sync -+ * queue. This extra reference is removed -+ * only if bfqq->bfqg disappears, to -+ * guarantee that this queue is not freed -+ * until its group goes away. 
-+ */ - bfq_log_bfqq(bfqd, bfqq, "get_queue, bfqq not in async: %p, %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - *async_bfqq = bfqq; - } - -- atomic_inc(&bfqq->ref); -- bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, -- atomic_read(&bfqq->ref)); -+out: -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "get_queue, at end: %p, %d", bfqq, bfqq->ref); -+ rcu_read_unlock(); - return bfqq; - } - - static void bfq_update_io_thinktime(struct bfq_data *bfqd, - struct bfq_io_cq *bic) - { -- unsigned long elapsed = jiffies - bic->ttime.last_end_request; -- unsigned long ttime = min(elapsed, 2UL * bfqd->bfq_slice_idle); -+ struct bfq_ttime *ttime = &bic->ttime; -+ u64 elapsed = ktime_get_ns() - bic->ttime.last_end_request; - -- bic->ttime.ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -- bic->ttime.ttime_total = (7*bic->ttime.ttime_total + 256*ttime) / 8; -- bic->ttime.ttime_mean = (bic->ttime.ttime_total + 128) / -- bic->ttime.ttime_samples; -+ elapsed = min_t(u64, elapsed, 2 * bfqd->bfq_slice_idle); -+ -+ ttime->ttime_samples = (7*bic->ttime.ttime_samples + 256) / 8; -+ ttime->ttime_total = div_u64(7*ttime->ttime_total + 256*elapsed, 8); -+ ttime->ttime_mean = div64_ul(ttime->ttime_total + 128, -+ ttime->ttime_samples); - } - --static void bfq_update_io_seektime(struct bfq_data *bfqd, -- struct bfq_queue *bfqq, -- struct request *rq) -+static void -+bfq_update_io_seektime(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ struct request *rq) - { -- sector_t sdist; -- u64 total; -- -- if (bfqq->last_request_pos < blk_rq_pos(rq)) -- sdist = blk_rq_pos(rq) - bfqq->last_request_pos; -- else -- sdist = bfqq->last_request_pos - blk_rq_pos(rq); -- -- /* -- * Don't allow the seek distance to get too large from the -- * odd fragment, pagein, etc. 
-- */ -- if (bfqq->seek_samples == 0) /* first request, not really a seek */ -- sdist = 0; -- else if (bfqq->seek_samples <= 60) /* second & third seek */ -- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*1024); -- else -- sdist = min(sdist, (bfqq->seek_mean * 4) + 2*1024*64); -- -- bfqq->seek_samples = (7*bfqq->seek_samples + 256) / 8; -- bfqq->seek_total = (7*bfqq->seek_total + (u64)256*sdist) / 8; -- total = bfqq->seek_total + (bfqq->seek_samples/2); -- do_div(total, bfqq->seek_samples); -- bfqq->seek_mean = (sector_t)total; -- -- bfq_log_bfqq(bfqd, bfqq, "dist=%llu mean=%llu", (u64)sdist, -- (u64)bfqq->seek_mean); -+ bfqq->seek_history <<= 1; -+ bfqq->seek_history |= -+ get_sdist(bfqq->last_request_pos, rq) > BFQQ_SEEK_THR && -+ (!blk_queue_nonrot(bfqd->queue) || -+ blk_rq_sectors(rq) < BFQQ_SECT_THR_NONROT); - } - - /* -@@ -3369,7 +4145,8 @@ static void bfq_update_idle_window(struct bfq_data *bfqd, - return; - - /* Idle window just restored, statistics are meaningless. */ -- if (bfq_bfqq_just_split(bfqq)) -+ if (time_is_after_eq_jiffies(bfqq->split_time + -+ bfqd->bfq_wr_min_idle_time)) - return; - - enable_idle = bfq_bfqq_idle_window(bfqq); -@@ -3409,22 +4186,13 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - - bfq_update_io_thinktime(bfqd, bic); - bfq_update_io_seektime(bfqd, bfqq, rq); -- if (!BFQQ_SEEKY(bfqq) && bfq_bfqq_constantly_seeky(bfqq)) { -- bfq_clear_bfqq_constantly_seeky(bfqq); -- if (!blk_queue_nonrot(bfqd->queue)) { -- BUG_ON(!bfqd->const_seeky_busy_in_flight_queues); -- bfqd->const_seeky_busy_in_flight_queues--; -- } -- } - if (bfqq->entity.service > bfq_max_budget(bfqd) / 8 || - !BFQQ_SEEKY(bfqq)) - bfq_update_idle_window(bfqd, bfqq, bic); -- bfq_clear_bfqq_just_split(bfqq); - - bfq_log_bfqq(bfqd, bfqq, -- "rq_enqueued: idle_window=%d (seeky %d, mean %llu)", -- bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq), -- (unsigned long long) bfqq->seek_mean); -+ "rq_enqueued: idle_window=%d (seeky %d)", -+ 
bfq_bfqq_idle_window(bfqq), BFQQ_SEEKY(bfqq)); - - bfqq->last_request_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); - -@@ -3438,14 +4206,15 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * is small and the queue is not to be expired, then - * just exit. - * -- * In this way, if the disk is being idled to wait for -- * a new request from the in-service queue, we avoid -- * unplugging the device and committing the disk to serve -- * just a small request. On the contrary, we wait for -- * the block layer to decide when to unplug the device: -- * hopefully, new requests will be merged to this one -- * quickly, then the device will be unplugged and -- * larger requests will be dispatched. -+ * In this way, if the device is being idled to wait -+ * for a new request from the in-service queue, we -+ * avoid unplugging the device and committing the -+ * device to serve just a small request. On the -+ * contrary, we wait for the block layer to decide -+ * when to unplug the device: hopefully, new requests -+ * will be merged to this one quickly, then the device -+ * will be unplugged and larger requests will be -+ * dispatched. - */ - if (small_req && !budget_timeout) - return; -@@ -3457,10 +4226,8 @@ static void bfq_rq_enqueued(struct bfq_data *bfqd, struct bfq_queue *bfqq, - * timer. 
- */ - bfq_clear_bfqq_wait_request(bfqq); -- del_timer(&bfqd->idle_slice_timer); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqg_stats_update_idle_time(bfqq_group(bfqq)); --#endif - - /* - * The queue is not empty, because a new request just -@@ -3504,28 +4271,20 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - */ - new_bfqq->allocated[rq_data_dir(rq)]++; - bfqq->allocated[rq_data_dir(rq)]--; -- atomic_inc(&new_bfqq->ref); -+ new_bfqq->ref++; -+ bfq_clear_bfqq_just_created(bfqq); - bfq_put_queue(bfqq); - if (bic_to_bfqq(RQ_BIC(rq), 1) == bfqq) - bfq_merge_bfqqs(bfqd, RQ_BIC(rq), - bfqq, new_bfqq); - rq->elv.priv[1] = new_bfqq; - bfqq = new_bfqq; -- } else -- bfq_bfqq_increase_failed_cooperations(bfqq); -+ } - } - - bfq_add_request(rq); - -- /* -- * Here a newly-created bfq_queue has already started a weight-raising -- * period: clear raising_time_left to prevent bfq_bfqq_save_state() -- * from assigning it a full weight-raising period. See the detailed -- * comments about this field in bfq_init_icq(). 
-- */ -- if (bfqq->bic) -- bfqq->bic->wr_time_left = 0; -- rq->fifo_time = jiffies + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; -+ rq->fifo_time = ktime_get_ns() + bfqd->bfq_fifo_expire[rq_is_sync(rq)]; - list_add_tail(&rq->queuelist, &bfqq->fifo); - - bfq_rq_enqueued(bfqd, bfqq, rq); -@@ -3533,8 +4292,8 @@ static void bfq_insert_request(struct request_queue *q, struct request *rq) - - static void bfq_update_hw_tag(struct bfq_data *bfqd) - { -- bfqd->max_rq_in_driver = max(bfqd->max_rq_in_driver, -- bfqd->rq_in_driver); -+ bfqd->max_rq_in_driver = max_t(int, bfqd->max_rq_in_driver, -+ bfqd->rq_in_driver); - - if (bfqd->hw_tag == 1) - return; -@@ -3560,48 +4319,85 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - { - struct bfq_queue *bfqq = RQ_BFQQ(rq); - struct bfq_data *bfqd = bfqq->bfqd; -- bool sync = bfq_bfqq_sync(bfqq); -+ u64 now_ns; -+ u32 delta_us; - -- bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left (%d)", -- blk_rq_sectors(rq), sync); -+ bfq_log_bfqq(bfqd, bfqq, "completed one req with %u sects left", -+ blk_rq_sectors(rq)); - -+ assert_spin_locked(bfqd->queue->queue_lock); - bfq_update_hw_tag(bfqd); - - BUG_ON(!bfqd->rq_in_driver); - BUG_ON(!bfqq->dispatched); - bfqd->rq_in_driver--; - bfqq->dispatched--; --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_completion(bfqq_group(bfqq), - rq_start_time_ns(rq), -- rq_io_start_time_ns(rq), rq->cmd_flags); --#endif -+ rq_io_start_time_ns(rq), -+ rq->cmd_flags); - - if (!bfqq->dispatched && !bfq_bfqq_busy(bfqq)) { -+ BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -+ /* -+ * Set budget_timeout (which we overload to store the -+ * time at which the queue remains with no backlog and -+ * no outstanding request; used by the weight-raising -+ * mechanism). 
-+ */ -+ bfqq->budget_timeout = jiffies; -+ - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); -- if (!blk_queue_nonrot(bfqd->queue)) { -- BUG_ON(!bfqd->busy_in_flight_queues); -- bfqd->busy_in_flight_queues--; -- if (bfq_bfqq_constantly_seeky(bfqq)) { -- BUG_ON(!bfqd-> -- const_seeky_busy_in_flight_queues); -- bfqd->const_seeky_busy_in_flight_queues--; -- } -- } - } - -- if (sync) { -- bfqd->sync_flight--; -- RQ_BIC(rq)->ttime.last_end_request = jiffies; -- } -+ now_ns = ktime_get_ns(); -+ -+ RQ_BIC(rq)->ttime.last_end_request = now_ns; -+ -+ /* -+ * Using us instead of ns, to get a reasonable precision in -+ * computing rate in next check. -+ */ -+ delta_us = div_u64(now_ns - bfqd->last_completion, NSEC_PER_USEC); -+ -+ bfq_log(bfqd, "rq_completed: delta %uus/%luus max_size %u rate %llu/%llu", -+ delta_us, BFQ_MIN_TT/NSEC_PER_USEC, bfqd->last_rq_max_size, -+ (USEC_PER_SEC* -+ (u64)((bfqd->last_rq_max_size<>BFQ_RATE_SHIFT, -+ (USEC_PER_SEC*(u64)(1UL<<(BFQ_RATE_SHIFT-10)))>>BFQ_RATE_SHIFT); -+ -+ /* -+ * If the request took rather long to complete, and, according -+ * to the maximum request size recorded, this completion latency -+ * implies that the request was certainly served at a very low -+ * rate (less than 1M sectors/sec), then the whole observation -+ * interval that lasts up to this time instant cannot be a -+ * valid time interval for computing a new peak rate. 
Invoke -+ * bfq_update_rate_reset to have the following three steps -+ * taken: -+ * - close the observation interval at the last (previous) -+ * request dispatch or completion -+ * - compute rate, if possible, for that observation interval -+ * - reset to zero samples, which will trigger a proper -+ * re-initialization of the observation interval on next -+ * dispatch -+ */ -+ if (delta_us > BFQ_MIN_TT/NSEC_PER_USEC && -+ (bfqd->last_rq_max_size<last_completion = now_ns; - - /* -- * If we are waiting to discover whether the request pattern of the -- * task associated with the queue is actually isochronous, and -- * both requisites for this condition to hold are satisfied, then -- * compute soft_rt_next_start (see the comments to the function -- * bfq_bfqq_softrt_next_start()). -+ * If we are waiting to discover whether the request pattern -+ * of the task associated with the queue is actually -+ * isochronous, and both requisites for this condition to hold -+ * are now satisfied, then compute soft_rt_next_start (see the -+ * comments on the function bfq_bfqq_softrt_next_start()). We -+ * schedule this delayed check when bfqq expires, if it still -+ * has in-flight requests. - */ - if (bfq_bfqq_softrt_update(bfqq) && bfqq->dispatched == 0 && - RB_EMPTY_ROOT(&bfqq->sort_list)) -@@ -3613,10 +4409,7 @@ static void bfq_completed_request(struct request_queue *q, struct request *rq) - * or if we want to idle in case it has no pending requests. 
- */ - if (bfqd->in_service_queue == bfqq) { -- if (bfq_bfqq_budget_new(bfqq)) -- bfq_set_budget_timeout(bfqd); -- -- if (bfq_bfqq_must_idle(bfqq)) { -+ if (bfqq->dispatched == 0 && bfq_bfqq_must_idle(bfqq)) { - bfq_arm_slice_timer(bfqd); - goto out; - } else if (bfq_may_expire_for_budg_timeout(bfqq)) -@@ -3646,7 +4439,7 @@ static int __bfq_may_queue(struct bfq_queue *bfqq) - return ELV_MQUEUE_MAY; - } - --static int bfq_may_queue(struct request_queue *q, int rw) -+static int bfq_may_queue(struct request_queue *q, unsigned int op) - { - struct bfq_data *bfqd = q->elevator->elevator_data; - struct task_struct *tsk = current; -@@ -3663,7 +4456,7 @@ static int bfq_may_queue(struct request_queue *q, int rw) - if (!bic) - return ELV_MQUEUE_MAY; - -- bfqq = bic_to_bfqq(bic, rw_is_sync(rw)); -+ bfqq = bic_to_bfqq(bic, op_is_sync(op)); - if (bfqq) - return __bfq_may_queue(bfqq); - -@@ -3687,14 +4480,14 @@ static void bfq_put_request(struct request *rq) - rq->elv.priv[1] = NULL; - - bfq_log_bfqq(bfqq->bfqd, bfqq, "put_request %p, %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } - } - - /* - * Returns NULL if a new bfqq should be allocated, or the old bfqq if this -- * was the last process referring to said bfqq. -+ * was the last process referring to that bfqq. 
- */ - static struct bfq_queue * - bfq_split_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq) -@@ -3732,11 +4525,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - unsigned long flags; - bool split = false; - -- might_sleep_if(gfpflags_allow_blocking(gfp_mask)); -- -- bfq_check_ioprio_change(bic, bio); -- - spin_lock_irqsave(q->queue_lock, flags); -+ bfq_check_ioprio_change(bic, bio); - - if (!bic) - goto queue_fail; -@@ -3746,23 +4536,47 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - new_queue: - bfqq = bic_to_bfqq(bic, is_sync); - if (!bfqq || bfqq == &bfqd->oom_bfqq) { -- bfqq = bfq_get_queue(bfqd, bio, is_sync, bic, gfp_mask); -+ if (bfqq) -+ bfq_put_queue(bfqq); -+ bfqq = bfq_get_queue(bfqd, bio, is_sync, bic); -+ BUG_ON(!hlist_unhashed(&bfqq->burst_list_node)); -+ - bic_set_bfqq(bic, bfqq, is_sync); - if (split && is_sync) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: was_in_list %d " -+ "was_in_large_burst %d " -+ "large burst in progress %d", -+ bic->was_in_burst_list, -+ bic->saved_in_large_burst, -+ bfqd->large_burst); -+ - if ((bic->was_in_burst_list && bfqd->large_burst) || -- bic->saved_in_large_burst) -+ bic->saved_in_large_burst) { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: marking in " -+ "large burst"); - bfq_mark_bfqq_in_large_burst(bfqq); -- else { -+ } else { -+ bfq_log_bfqq(bfqd, bfqq, -+ "set_request: clearing in " -+ "large burst"); - bfq_clear_bfqq_in_large_burst(bfqq); - if (bic->was_in_burst_list) - hlist_add_head(&bfqq->burst_list_node, - &bfqd->burst_list); - } -+ bfqq->split_time = jiffies; - } - } else { - /* If the queue was seeky for too long, break it apart. 
*/ - if (bfq_bfqq_coop(bfqq) && bfq_bfqq_split_coop(bfqq)) { - bfq_log_bfqq(bfqd, bfqq, "breaking apart bfqq"); -+ -+ /* Update bic before losing reference to bfqq */ -+ if (bfq_bfqq_in_large_burst(bfqq)) -+ bic->saved_in_large_burst = true; -+ - bfqq = bfq_split_bfqq(bic, bfqq); - split = true; - if (!bfqq) -@@ -3771,9 +4585,8 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - - bfqq->allocated[rw]++; -- atomic_inc(&bfqq->ref); -- bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, -- atomic_read(&bfqq->ref)); -+ bfqq->ref++; -+ bfq_log_bfqq(bfqd, bfqq, "set_request: bfqq %p, %d", bfqq, bfqq->ref); - - rq->elv.priv[0] = bic; - rq->elv.priv[1] = bfqq; -@@ -3788,7 +4601,6 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - if (likely(bfqq != &bfqd->oom_bfqq) && bfqq_process_refs(bfqq) == 1) { - bfqq->bic = bic; - if (split) { -- bfq_mark_bfqq_just_split(bfqq); - /* - * If the queue has just been split from a shared - * queue, restore the idle window and the possible -@@ -3798,6 +4610,9 @@ static int bfq_set_request(struct request_queue *q, struct request *rq, - } - } - -+ if (unlikely(bfq_bfqq_just_created(bfqq))) -+ bfq_handle_burst(bfqd, bfqq); -+ - spin_unlock_irqrestore(q->queue_lock, flags); - - return 0; -@@ -3824,9 +4639,10 @@ static void bfq_kick_queue(struct work_struct *work) - * Handler of the expiration of the timer running if the in-service queue - * is idling inside its time slice. 
- */ --static void bfq_idle_slice_timer(unsigned long data) -+static enum hrtimer_restart bfq_idle_slice_timer(struct hrtimer *timer) - { -- struct bfq_data *bfqd = (struct bfq_data *)data; -+ struct bfq_data *bfqd = container_of(timer, struct bfq_data, -+ idle_slice_timer); - struct bfq_queue *bfqq; - unsigned long flags; - enum bfqq_expiration reason; -@@ -3844,6 +4660,8 @@ static void bfq_idle_slice_timer(unsigned long data) - */ - if (bfqq) { - bfq_log_bfqq(bfqd, bfqq, "slice_timer expired"); -+ bfq_clear_bfqq_wait_request(bfqq); -+ - if (bfq_bfqq_budget_timeout(bfqq)) - /* - * Also here the queue can be safely expired -@@ -3869,11 +4687,12 @@ static void bfq_idle_slice_timer(unsigned long data) - bfq_schedule_dispatch(bfqd); - - spin_unlock_irqrestore(bfqd->queue->queue_lock, flags); -+ return HRTIMER_NORESTART; - } - - static void bfq_shutdown_timer_wq(struct bfq_data *bfqd) - { -- del_timer_sync(&bfqd->idle_slice_timer); -+ hrtimer_cancel(&bfqd->idle_slice_timer); - cancel_work_sync(&bfqd->unplug_work); - } - -@@ -3885,9 +4704,9 @@ static void __bfq_put_async_bfqq(struct bfq_data *bfqd, - - bfq_log(bfqd, "put_async_bfqq: %p", bfqq); - if (bfqq) { -- bfq_bfqq_move(bfqd, bfqq, &bfqq->entity, root_group); -+ bfq_bfqq_move(bfqd, bfqq, root_group); - bfq_log_bfqq(bfqd, bfqq, "put_async_bfqq: putting %p, %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - bfq_put_queue(bfqq); - *bfqq_ptr = NULL; - } -@@ -3922,19 +4741,18 @@ static void bfq_exit_queue(struct elevator_queue *e) - - BUG_ON(bfqd->in_service_queue); - list_for_each_entry_safe(bfqq, n, &bfqd->idle_list, bfqq_list) -- bfq_deactivate_bfqq(bfqd, bfqq, 0); -+ bfq_deactivate_bfqq(bfqd, bfqq, false, false); - - spin_unlock_irq(q->queue_lock); - - bfq_shutdown_timer_wq(bfqd); - -- synchronize_rcu(); -- -- BUG_ON(timer_pending(&bfqd->idle_slice_timer)); -+ BUG_ON(hrtimer_active(&bfqd->idle_slice_timer)); - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - blkcg_deactivate_policy(q, &blkcg_policy_bfq); - #else -+ 
bfq_put_async_queues(bfqd, bfqd->root_group); - kfree(bfqd->root_group); - #endif - -@@ -3954,6 +4772,7 @@ static void bfq_init_root_group(struct bfq_group *root_group, - root_group->rq_pos_tree = RB_ROOT; - for (i = 0; i < BFQ_IOPRIO_CLASSES; i++) - root_group->sched_data.service_tree[i] = BFQ_SERVICE_TREE_INIT; -+ root_group->sched_data.bfq_class_idle_last_service = jiffies; - } - - static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) -@@ -3978,11 +4797,14 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - * will not attempt to free it. - */ - bfq_init_bfqq(bfqd, &bfqd->oom_bfqq, NULL, 1, 0); -- atomic_inc(&bfqd->oom_bfqq.ref); -+ bfqd->oom_bfqq.ref++; - bfqd->oom_bfqq.new_ioprio = BFQ_DEFAULT_QUEUE_IOPRIO; - bfqd->oom_bfqq.new_ioprio_class = IOPRIO_CLASS_BE; - bfqd->oom_bfqq.entity.new_weight = - bfq_ioprio_to_weight(bfqd->oom_bfqq.new_ioprio); -+ -+ /* oom_bfqq does not participate to bursts */ -+ bfq_clear_bfqq_just_created(&bfqd->oom_bfqq); - /* - * Trigger weight initialization, according to ioprio, at the - * oom_bfqq's first activation. 
The oom_bfqq's ioprio and ioprio -@@ -4001,13 +4823,10 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - goto out_free; - bfq_init_root_group(bfqd->root_group, bfqd); - bfq_init_entity(&bfqd->oom_bfqq.entity, bfqd->root_group); --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- bfqd->active_numerous_groups = 0; --#endif - -- init_timer(&bfqd->idle_slice_timer); -+ hrtimer_init(&bfqd->idle_slice_timer, CLOCK_MONOTONIC, -+ HRTIMER_MODE_REL); - bfqd->idle_slice_timer.function = bfq_idle_slice_timer; -- bfqd->idle_slice_timer.data = (unsigned long)bfqd; - - bfqd->queue_weights_tree = RB_ROOT; - bfqd->group_weights_tree = RB_ROOT; -@@ -4027,21 +4846,19 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - bfqd->bfq_back_max = bfq_back_max; - bfqd->bfq_back_penalty = bfq_back_penalty; - bfqd->bfq_slice_idle = bfq_slice_idle; -- bfqd->bfq_class_idle_last_service = 0; -- bfqd->bfq_max_budget_async_rq = bfq_max_budget_async_rq; -- bfqd->bfq_timeout[BLK_RW_ASYNC] = bfq_timeout_async; -- bfqd->bfq_timeout[BLK_RW_SYNC] = bfq_timeout_sync; -+ bfqd->bfq_timeout = bfq_timeout; - -- bfqd->bfq_coop_thresh = 2; -- bfqd->bfq_failed_cooperations = 7000; - bfqd->bfq_requests_within_timer = 120; - -- bfqd->bfq_large_burst_thresh = 11; -- bfqd->bfq_burst_interval = msecs_to_jiffies(500); -+ bfqd->bfq_large_burst_thresh = 8; -+ bfqd->bfq_burst_interval = msecs_to_jiffies(180); - - bfqd->low_latency = true; - -- bfqd->bfq_wr_coeff = 20; -+ /* -+ * Trade-off between responsiveness and fairness. -+ */ -+ bfqd->bfq_wr_coeff = 30; - bfqd->bfq_wr_rt_max_time = msecs_to_jiffies(300); - bfqd->bfq_wr_max_time = 0; - bfqd->bfq_wr_min_idle_time = msecs_to_jiffies(2000); -@@ -4053,16 +4870,15 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - * video. 
- */ - bfqd->wr_busy_queues = 0; -- bfqd->busy_in_flight_queues = 0; -- bfqd->const_seeky_busy_in_flight_queues = 0; - - /* -- * Begin by assuming, optimistically, that the device peak rate is -- * equal to the highest reference rate. -+ * Begin by assuming, optimistically, that the device is a -+ * high-speed one, and that its peak rate is equal to 2/3 of -+ * the highest reference rate. - */ - bfqd->RT_prod = R_fast[blk_queue_nonrot(bfqd->queue)] * - T_fast[blk_queue_nonrot(bfqd->queue)]; -- bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)]; -+ bfqd->peak_rate = R_fast[blk_queue_nonrot(bfqd->queue)] * 2 / 3; - bfqd->device_speed = BFQ_BFQD_FAST; - - return 0; -@@ -4088,7 +4904,7 @@ static int __init bfq_slab_setup(void) - - static ssize_t bfq_var_show(unsigned int var, char *page) - { -- return sprintf(page, "%d\n", var); -+ return sprintf(page, "%u\n", var); - } - - static ssize_t bfq_var_store(unsigned long *var, const char *page, -@@ -4159,21 +4975,21 @@ static ssize_t bfq_weights_show(struct elevator_queue *e, char *page) - static ssize_t __FUNC(struct elevator_queue *e, char *page) \ - { \ - struct bfq_data *bfqd = e->elevator_data; \ -- unsigned int __data = __VAR; \ -- if (__CONV) \ -+ u64 __data = __VAR; \ -+ if (__CONV == 1) \ - __data = jiffies_to_msecs(__data); \ -+ else if (__CONV == 2) \ -+ __data = div_u64(__data, NSEC_PER_MSEC); \ - return bfq_var_show(__data, (page)); \ - } --SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 1); --SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 1); -+SHOW_FUNCTION(bfq_fifo_expire_sync_show, bfqd->bfq_fifo_expire[1], 2); -+SHOW_FUNCTION(bfq_fifo_expire_async_show, bfqd->bfq_fifo_expire[0], 2); - SHOW_FUNCTION(bfq_back_seek_max_show, bfqd->bfq_back_max, 0); - SHOW_FUNCTION(bfq_back_seek_penalty_show, bfqd->bfq_back_penalty, 0); --SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 1); -+SHOW_FUNCTION(bfq_slice_idle_show, bfqd->bfq_slice_idle, 2); - 
SHOW_FUNCTION(bfq_max_budget_show, bfqd->bfq_user_max_budget, 0); --SHOW_FUNCTION(bfq_max_budget_async_rq_show, -- bfqd->bfq_max_budget_async_rq, 0); --SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout[BLK_RW_SYNC], 1); --SHOW_FUNCTION(bfq_timeout_async_show, bfqd->bfq_timeout[BLK_RW_ASYNC], 1); -+SHOW_FUNCTION(bfq_timeout_sync_show, bfqd->bfq_timeout, 1); -+SHOW_FUNCTION(bfq_strict_guarantees_show, bfqd->strict_guarantees, 0); - SHOW_FUNCTION(bfq_low_latency_show, bfqd->low_latency, 0); - SHOW_FUNCTION(bfq_wr_coeff_show, bfqd->bfq_wr_coeff, 0); - SHOW_FUNCTION(bfq_wr_rt_max_time_show, bfqd->bfq_wr_rt_max_time, 1); -@@ -4183,6 +4999,17 @@ SHOW_FUNCTION(bfq_wr_min_inter_arr_async_show, bfqd->bfq_wr_min_inter_arr_async, - SHOW_FUNCTION(bfq_wr_max_softrt_rate_show, bfqd->bfq_wr_max_softrt_rate, 0); - #undef SHOW_FUNCTION - -+#define USEC_SHOW_FUNCTION(__FUNC, __VAR) \ -+static ssize_t __FUNC(struct elevator_queue *e, char *page) \ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ u64 __data = __VAR; \ -+ __data = div_u64(__data, NSEC_PER_USEC); \ -+ return bfq_var_show(__data, (page)); \ -+} -+USEC_SHOW_FUNCTION(bfq_slice_idle_us_show, bfqd->bfq_slice_idle); -+#undef USEC_SHOW_FUNCTION -+ - #define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \ - static ssize_t \ - __FUNC(struct elevator_queue *e, const char *page, size_t count) \ -@@ -4194,24 +5021,22 @@ __FUNC(struct elevator_queue *e, const char *page, size_t count) \ - __data = (MIN); \ - else if (__data > (MAX)) \ - __data = (MAX); \ -- if (__CONV) \ -+ if (__CONV == 1) \ - *(__PTR) = msecs_to_jiffies(__data); \ -+ else if (__CONV == 2) \ -+ *(__PTR) = (u64)__data * NSEC_PER_MSEC; \ - else \ - *(__PTR) = __data; \ - return ret; \ - } - STORE_FUNCTION(bfq_fifo_expire_sync_store, &bfqd->bfq_fifo_expire[1], 1, -- INT_MAX, 1); -+ INT_MAX, 2); - STORE_FUNCTION(bfq_fifo_expire_async_store, &bfqd->bfq_fifo_expire[0], 1, -- INT_MAX, 1); -+ INT_MAX, 2); - STORE_FUNCTION(bfq_back_seek_max_store, 
&bfqd->bfq_back_max, 0, INT_MAX, 0); - STORE_FUNCTION(bfq_back_seek_penalty_store, &bfqd->bfq_back_penalty, 1, - INT_MAX, 0); --STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 1); --STORE_FUNCTION(bfq_max_budget_async_rq_store, &bfqd->bfq_max_budget_async_rq, -- 1, INT_MAX, 0); --STORE_FUNCTION(bfq_timeout_async_store, &bfqd->bfq_timeout[BLK_RW_ASYNC], 0, -- INT_MAX, 1); -+STORE_FUNCTION(bfq_slice_idle_store, &bfqd->bfq_slice_idle, 0, INT_MAX, 2); - STORE_FUNCTION(bfq_wr_coeff_store, &bfqd->bfq_wr_coeff, 1, INT_MAX, 0); - STORE_FUNCTION(bfq_wr_max_time_store, &bfqd->bfq_wr_max_time, 0, INT_MAX, 1); - STORE_FUNCTION(bfq_wr_rt_max_time_store, &bfqd->bfq_wr_rt_max_time, 0, INT_MAX, -@@ -4224,6 +5049,23 @@ STORE_FUNCTION(bfq_wr_max_softrt_rate_store, &bfqd->bfq_wr_max_softrt_rate, 0, - INT_MAX, 0); - #undef STORE_FUNCTION - -+#define USEC_STORE_FUNCTION(__FUNC, __PTR, MIN, MAX) \ -+static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count)\ -+{ \ -+ struct bfq_data *bfqd = e->elevator_data; \ -+ unsigned long uninitialized_var(__data); \ -+ int ret = bfq_var_store(&__data, (page), count); \ -+ if (__data < (MIN)) \ -+ __data = (MIN); \ -+ else if (__data > (MAX)) \ -+ __data = (MAX); \ -+ *(__PTR) = (u64)__data * NSEC_PER_USEC; \ -+ return ret; \ -+} -+USEC_STORE_FUNCTION(bfq_slice_idle_us_store, &bfqd->bfq_slice_idle, 0, -+ UINT_MAX); -+#undef USEC_STORE_FUNCTION -+ - /* do nothing for the moment */ - static ssize_t bfq_weights_store(struct elevator_queue *e, - const char *page, size_t count) -@@ -4231,16 +5073,6 @@ static ssize_t bfq_weights_store(struct elevator_queue *e, - return count; - } - --static unsigned long bfq_estimated_max_budget(struct bfq_data *bfqd) --{ -- u64 timeout = jiffies_to_msecs(bfqd->bfq_timeout[BLK_RW_SYNC]); -- -- if (bfqd->peak_rate_samples >= BFQ_PEAK_RATE_SAMPLES) -- return bfq_calc_max_budget(bfqd->peak_rate, timeout); -- else -- return bfq_default_max_budget; --} -- - static ssize_t 
bfq_max_budget_store(struct elevator_queue *e, - const char *page, size_t count) - { -@@ -4249,7 +5081,7 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, - int ret = bfq_var_store(&__data, (page), count); - - if (__data == 0) -- bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); - else { - if (__data > INT_MAX) - __data = INT_MAX; -@@ -4261,6 +5093,10 @@ static ssize_t bfq_max_budget_store(struct elevator_queue *e, - return ret; - } - -+/* -+ * Leaving this name to preserve name compatibility with cfq -+ * parameters, but this timeout is used for both sync and async. -+ */ - static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - const char *page, size_t count) - { -@@ -4273,9 +5109,27 @@ static ssize_t bfq_timeout_sync_store(struct elevator_queue *e, - else if (__data > INT_MAX) - __data = INT_MAX; - -- bfqd->bfq_timeout[BLK_RW_SYNC] = msecs_to_jiffies(__data); -+ bfqd->bfq_timeout = msecs_to_jiffies(__data); - if (bfqd->bfq_user_max_budget == 0) -- bfqd->bfq_max_budget = bfq_estimated_max_budget(bfqd); -+ bfqd->bfq_max_budget = bfq_calc_max_budget(bfqd); -+ -+ return ret; -+} -+ -+static ssize_t bfq_strict_guarantees_store(struct elevator_queue *e, -+ const char *page, size_t count) -+{ -+ struct bfq_data *bfqd = e->elevator_data; -+ unsigned long uninitialized_var(__data); -+ int ret = bfq_var_store(&__data, (page), count); -+ -+ if (__data > 1) -+ __data = 1; -+ if (!bfqd->strict_guarantees && __data == 1 -+ && bfqd->bfq_slice_idle < 8 * NSEC_PER_MSEC) -+ bfqd->bfq_slice_idle = 8 * NSEC_PER_MSEC; -+ -+ bfqd->strict_guarantees = __data; - - return ret; - } -@@ -4305,10 +5159,10 @@ static struct elv_fs_entry bfq_attrs[] = { - BFQ_ATTR(back_seek_max), - BFQ_ATTR(back_seek_penalty), - BFQ_ATTR(slice_idle), -+ BFQ_ATTR(slice_idle_us), - BFQ_ATTR(max_budget), -- BFQ_ATTR(max_budget_async_rq), - BFQ_ATTR(timeout_sync), -- BFQ_ATTR(timeout_async), -+ BFQ_ATTR(strict_guarantees), - 
BFQ_ATTR(low_latency), - BFQ_ATTR(wr_coeff), - BFQ_ATTR(wr_max_time), -@@ -4328,7 +5182,8 @@ static struct elevator_type iosched_bfq = { - #ifdef CONFIG_BFQ_GROUP_IOSCHED - .elevator_bio_merged_fn = bfq_bio_merged, - #endif -- .elevator_allow_merge_fn = bfq_allow_merge, -+ .elevator_allow_bio_merge_fn = bfq_allow_bio_merge, -+ .elevator_allow_rq_merge_fn = bfq_allow_rq_merge, - .elevator_dispatch_fn = bfq_dispatch_requests, - .elevator_add_req_fn = bfq_insert_request, - .elevator_activate_req_fn = bfq_activate_request, -@@ -4351,18 +5206,28 @@ static struct elevator_type iosched_bfq = { - .elevator_owner = THIS_MODULE, - }; - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static struct blkcg_policy blkcg_policy_bfq = { -+ .dfl_cftypes = bfq_blkg_files, -+ .legacy_cftypes = bfq_blkcg_legacy_files, -+ -+ .cpd_alloc_fn = bfq_cpd_alloc, -+ .cpd_init_fn = bfq_cpd_init, -+ .cpd_bind_fn = bfq_cpd_init, -+ .cpd_free_fn = bfq_cpd_free, -+ -+ .pd_alloc_fn = bfq_pd_alloc, -+ .pd_init_fn = bfq_pd_init, -+ .pd_offline_fn = bfq_pd_offline, -+ .pd_free_fn = bfq_pd_free, -+ .pd_reset_stats_fn = bfq_pd_reset_stats, -+}; -+#endif -+ - static int __init bfq_init(void) - { - int ret; -- -- /* -- * Can be 0 on HZ < 1000 setups. -- */ -- if (bfq_slice_idle == 0) -- bfq_slice_idle = 1; -- -- if (bfq_timeout_async == 0) -- bfq_timeout_async = 1; -+ char msg[60] = "BFQ I/O-scheduler: v8r8"; - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - ret = blkcg_policy_register(&blkcg_policy_bfq); -@@ -4375,27 +5240,46 @@ static int __init bfq_init(void) - goto err_pol_unreg; - - /* -- * Times to load large popular applications for the typical systems -- * installed on the reference devices (see the comments before the -- * definitions of the two arrays). -+ * Times to load large popular applications for the typical -+ * systems installed on the reference devices (see the -+ * comments before the definitions of the next two -+ * arrays). 
Actually, we use slightly slower values, as the -+ * estimated peak rate tends to be smaller than the actual -+ * peak rate. The reason for this last fact is that estimates -+ * are computed over much shorter time intervals than the long -+ * intervals typically used for benchmarking. Why? First, to -+ * adapt more quickly to variations. Second, because an I/O -+ * scheduler cannot rely on a peak-rate-evaluation workload to -+ * be run for a long time. - */ -- T_slow[0] = msecs_to_jiffies(2600); -- T_slow[1] = msecs_to_jiffies(1000); -- T_fast[0] = msecs_to_jiffies(5500); -- T_fast[1] = msecs_to_jiffies(2000); -+ T_slow[0] = msecs_to_jiffies(3500); /* actually 4 sec */ -+ T_slow[1] = msecs_to_jiffies(6000); /* actually 6.5 sec */ -+ T_fast[0] = msecs_to_jiffies(7000); /* actually 8 sec */ -+ T_fast[1] = msecs_to_jiffies(2500); /* actually 3 sec */ - - /* -- * Thresholds that determine the switch between speed classes (see -- * the comments before the definition of the array). -+ * Thresholds that determine the switch between speed classes -+ * (see the comments before the definition of the array -+ * device_speed_thresh). These thresholds are biased towards -+ * transitions to the fast class. This is safer than the -+ * opposite bias. In fact, a wrong transition to the slow -+ * class results in short weight-raising periods, because the -+ * speed of the device then tends to be higher that the -+ * reference peak rate. On the opposite end, a wrong -+ * transition to the fast class tends to increase -+ * weight-raising periods, because of the opposite reason. 
- */ -- device_speed_thresh[0] = (R_fast[0] + R_slow[0]) / 2; -- device_speed_thresh[1] = (R_fast[1] + R_slow[1]) / 2; -+ device_speed_thresh[0] = (4 * R_slow[0]) / 3; -+ device_speed_thresh[1] = (4 * R_slow[1]) / 3; - - ret = elv_register(&iosched_bfq); - if (ret) - goto err_pol_unreg; - -- pr_info("BFQ I/O-scheduler: v7r11"); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ strcat(msg, " (with cgroups support)"); -+#endif -+ pr_info("%s", msg); - - return 0; - -diff --git a/block/bfq-sched.c b/block/bfq-sched.c -index a5ed694..2e9dc59 100644 ---- a/block/bfq-sched.c -+++ b/block/bfq-sched.c -@@ -7,28 +7,166 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2016 Paolo Valente -+ */ -+ -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+ -+/** -+ * bfq_gt - compare two timestamps. -+ * @a: first ts. -+ * @b: second ts. -+ * -+ * Return @a > @b, dealing with wrapping correctly. -+ */ -+static int bfq_gt(u64 a, u64 b) -+{ -+ return (s64)(a - b) > 0; -+} -+ -+static struct bfq_entity *bfq_root_active_entity(struct rb_root *tree) -+{ -+ struct rb_node *node = tree->rb_node; -+ -+ return rb_entry(node, struct bfq_entity, rb_node); -+} -+ -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd); -+ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service); -+ -+/** -+ * bfq_update_next_in_service - update sd->next_in_service -+ * @sd: sched_data for which to perform the update. -+ * @new_entity: if not NULL, pointer to the entity whose activation, -+ * requeueing or repositionig triggered the invocation of -+ * this function. -+ * -+ * This function is called to update sd->next_in_service, which, in -+ * its turn, may change as a consequence of the insertion or -+ * extraction of an entity into/from one of the active trees of -+ * sd. 
These insertions/extractions occur as a consequence of -+ * activations/deactivations of entities, with some activations being -+ * 'true' activations, and other activations being requeueings (i.e., -+ * implementing the second, requeueing phase of the mechanism used to -+ * reposition an entity in its active tree; see comments on -+ * __bfq_activate_entity and __bfq_requeue_entity for details). In -+ * both the last two activation sub-cases, new_entity points to the -+ * just activated or requeued entity. -+ * -+ * Returns true if sd->next_in_service changes in such a way that -+ * entity->parent may become the next_in_service for its parent -+ * entity. - */ -+static bool bfq_update_next_in_service(struct bfq_sched_data *sd, -+ struct bfq_entity *new_entity) -+{ -+ struct bfq_entity *next_in_service = sd->next_in_service; -+ struct bfq_queue *bfqq; -+ bool parent_sched_may_change = false; -+ -+ /* -+ * If this update is triggered by the activation, requeueing -+ * or repositiong of an entity that does not coincide with -+ * sd->next_in_service, then a full lookup in the active tree -+ * can be avoided. In fact, it is enough to check whether the -+ * just-modified entity has a higher priority than -+ * sd->next_in_service, or, even if it has the same priority -+ * as sd->next_in_service, is eligible and has a lower virtual -+ * finish time than sd->next_in_service. If this compound -+ * condition holds, then the new entity becomes the new -+ * next_in_service. Otherwise no change is needed. -+ */ -+ if (new_entity && new_entity != sd->next_in_service) { -+ /* -+ * Flag used to decide whether to replace -+ * sd->next_in_service with new_entity. Tentatively -+ * set to true, and left as true if -+ * sd->next_in_service is NULL. -+ */ -+ bool replace_next = true; -+ -+ /* -+ * If there is already a next_in_service candidate -+ * entity, then compare class priorities or timestamps -+ * to decide whether to replace sd->service_tree with -+ * new_entity. 
-+ */ -+ if (next_in_service) { -+ unsigned int new_entity_class_idx = -+ bfq_class_idx(new_entity); -+ struct bfq_service_tree *st = -+ sd->service_tree + new_entity_class_idx; -+ -+ /* -+ * For efficiency, evaluate the most likely -+ * sub-condition first. -+ */ -+ replace_next = -+ (new_entity_class_idx == -+ bfq_class_idx(next_in_service) -+ && -+ !bfq_gt(new_entity->start, st->vtime) -+ && -+ bfq_gt(next_in_service->finish, -+ new_entity->finish)) -+ || -+ new_entity_class_idx < -+ bfq_class_idx(next_in_service); -+ } -+ -+ if (replace_next) -+ next_in_service = new_entity; -+ } else /* invoked because of a deactivation: lookup needed */ -+ next_in_service = bfq_lookup_next_entity(sd); -+ -+ if (next_in_service) { -+ parent_sched_may_change = !sd->next_in_service || -+ bfq_update_parent_budget(next_in_service); -+ } -+ -+ sd->next_in_service = next_in_service; -+ -+ if (!next_in_service) -+ return parent_sched_may_change; - -+ bfqq = bfq_entity_to_bfqq(next_in_service); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_next_in_service: chosen this queue"); - #ifdef CONFIG_BFQ_GROUP_IOSCHED --#define for_each_entity(entity) \ -+ else { -+ struct bfq_group *bfqg = -+ container_of(next_in_service, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_next_in_service: chosen this entity"); -+ } -+#endif -+ return parent_sched_may_change; -+} -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+/* both next loops stop at one of the child entities of the root group */ -+#define for_each_entity(entity) \ - for (; entity ; entity = entity->parent) - - #define for_each_entity_safe(entity, parent) \ - for (; entity && ({ parent = entity->parent; 1; }); entity = parent) - -- --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -- int extract, -- struct bfq_data *bfqd); -- --static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -- --static void bfq_update_budget(struct bfq_entity *next_in_service) -+/* 
-+ * Returns true if this budget changes may let next_in_service->parent -+ * become the next_in_service entity for its parent entity. -+ */ -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - { - struct bfq_entity *bfqg_entity; - struct bfq_group *bfqg; - struct bfq_sched_data *group_sd; -+ bool ret = false; - - BUG_ON(!next_in_service); - -@@ -41,60 +179,68 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) - * as it must never become an in-service entity. - */ - bfqg_entity = bfqg->my_entity; -- if (bfqg_entity) -+ if (bfqg_entity) { -+ if (bfqg_entity->budget > next_in_service->budget) -+ ret = true; - bfqg_entity->budget = next_in_service->budget; -+ } -+ -+ return ret; - } - --static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+/* -+ * This function tells whether entity stops being a candidate for next -+ * service, according to the following logic. -+ * -+ * This function is invoked for an entity that is about to be set in -+ * service. If such an entity is a queue, then the entity is no longer -+ * a candidate for next service (i.e, a candidate entity to serve -+ * after the in-service entity is expired). The function then returns -+ * true. -+ * -+ * In contrast, the entity could stil be a candidate for next service -+ * if it is not a queue, and has more than one child. In fact, even if -+ * one of its children is about to be set in service, other children -+ * may still be the next to serve. As a consequence, a non-queue -+ * entity is not a candidate for next-service only if it has only one -+ * child. And only if this condition holds, then the function returns -+ * true for a non-queue entity. 
-+ */ -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -- struct bfq_entity *next_in_service; -+ struct bfq_group *bfqg; - -- if (sd->in_service_entity) -- /* will update/requeue at the end of service */ -- return 0; -+ if (bfq_entity_to_bfqq(entity)) -+ return true; - -- /* -- * NOTE: this can be improved in many ways, such as returning -- * 1 (and thus propagating upwards the update) only when the -- * budget changes, or caching the bfqq that will be scheduled -- * next from this subtree. By now we worry more about -- * correctness than about performance... -- */ -- next_in_service = bfq_lookup_next_entity(sd, 0, NULL); -- sd->next_in_service = next_in_service; -+ bfqg = container_of(entity, struct bfq_group, entity); - -- if (next_in_service) -- bfq_update_budget(next_in_service); -+ BUG_ON(bfqg == ((struct bfq_data *)(bfqg->bfqd))->root_group); -+ BUG_ON(bfqg->active_entities == 0); -+ if (bfqg->active_entities == 1) -+ return true; - -- return 1; -+ return false; - } - --static void bfq_check_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *entity) --{ -- BUG_ON(sd->next_in_service != entity); --} --#else -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ - #define for_each_entity(entity) \ - for (; entity ; entity = NULL) - - #define for_each_entity_safe(entity, parent) \ - for (parent = NULL; entity ; entity = parent) - --static int bfq_update_next_in_service(struct bfq_sched_data *sd) -+static bool bfq_update_parent_budget(struct bfq_entity *next_in_service) - { -- return 0; -+ return false; - } - --static void bfq_check_next_in_service(struct bfq_sched_data *sd, -- struct bfq_entity *entity) -+static bool bfq_no_longer_next_in_service(struct bfq_entity *entity) - { -+ return true; - } - --static void bfq_update_budget(struct bfq_entity *next_in_service) --{ --} --#endif -+#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - - /* - * Shift for timestamp calculations. 
This actually limits the maximum -@@ -105,18 +251,6 @@ static void bfq_update_budget(struct bfq_entity *next_in_service) - */ - #define WFQ_SERVICE_SHIFT 22 - --/** -- * bfq_gt - compare two timestamps. -- * @a: first ts. -- * @b: second ts. -- * -- * Return @a > @b, dealing with wrapping correctly. -- */ --static int bfq_gt(u64 a, u64 b) --{ -- return (s64)(a - b) > 0; --} -- - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity) - { - struct bfq_queue *bfqq = NULL; -@@ -151,20 +285,36 @@ static u64 bfq_delta(unsigned long service, unsigned long weight) - static void bfq_calc_finish(struct bfq_entity *entity, unsigned long service) - { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ unsigned long long start, finish, delta; - - BUG_ON(entity->weight == 0); - - entity->finish = entity->start + - bfq_delta(service, entity->weight); - -+ start = ((entity->start>>10)*1000)>>12; -+ finish = ((entity->finish>>10)*1000)>>12; -+ delta = ((bfq_delta(service, entity->weight)>>10)*1000)>>12; -+ - if (bfqq) { - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: serv %lu, w %d", - service, entity->weight); - bfq_log_bfqq(bfqq->bfqd, bfqq, - "calc_finish: start %llu, finish %llu, delta %llu", -- entity->start, entity->finish, -- bfq_delta(service, entity->weight)); -+ start, finish, delta); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: serv %lu, w %d", -+ service, entity->weight); -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_finish group: start %llu, finish %llu, delta %llu", -+ start, finish, delta); -+#endif - } - } - -@@ -293,10 +443,26 @@ static void bfq_update_min(struct bfq_entity *entity, struct rb_node *node) - static void bfq_update_active_node(struct rb_node *node) - { - struct bfq_entity *entity = rb_entry(node, struct bfq_entity, rb_node); -+ struct bfq_queue *bfqq = 
bfq_entity_to_bfqq(entity); - - entity->min_start = entity->start; - bfq_update_min(entity, node->rb_right); - bfq_update_min(entity, node->rb_left); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "update_active_node: new min_start %llu", -+ ((entity->min_start>>10)*1000)>>12); -+#endif -+ } - } - - /** -@@ -386,8 +552,6 @@ static void bfq_active_insert(struct bfq_service_tree *st, - BUG_ON(!bfqg); - BUG_ON(!bfqd); - bfqg->active_entities++; -- if (bfqg->active_entities == 2) -- bfqd->active_numerous_groups++; - } - #endif - } -@@ -399,7 +563,7 @@ static void bfq_active_insert(struct bfq_service_tree *st, - static unsigned short bfq_ioprio_to_weight(int ioprio) - { - BUG_ON(ioprio < 0 || ioprio >= IOPRIO_BE_NR); -- return IOPRIO_BE_NR * BFQ_WEIGHT_CONVERSION_COEFF - ioprio; -+ return (IOPRIO_BE_NR - ioprio) * BFQ_WEIGHT_CONVERSION_COEFF; - } - - /** -@@ -422,9 +586,9 @@ static void bfq_get_entity(struct bfq_entity *entity) - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); - - if (bfqq) { -- atomic_inc(&bfqq->ref); -+ bfqq->ref++; - bfq_log_bfqq(bfqq->bfqd, bfqq, "get_entity: %p %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - } - } - -@@ -499,10 +663,6 @@ static void bfq_active_extract(struct bfq_service_tree *st, - BUG_ON(!bfqd); - BUG_ON(!bfqg->active_entities); - bfqg->active_entities--; -- if (bfqg->active_entities == 1) { -- BUG_ON(!bfqd->active_numerous_groups); -- bfqd->active_numerous_groups--; -- } - } - #endif - } -@@ -547,12 +707,12 @@ static void bfq_forget_entity(struct bfq_service_tree *st, - - BUG_ON(!entity->on_st); - -- entity->on_st = 0; -+ entity->on_st = false; - st->wsum -= entity->weight; - if (bfqq) { - sd = entity->sched_data; - bfq_log_bfqq(bfqq->bfqd, 
bfqq, "forget_entity: %p %d", -- bfqq, atomic_read(&bfqq->ref)); -+ bfqq, bfqq->ref); - bfq_put_queue(bfqq); - } - } -@@ -602,7 +762,7 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - - if (entity->prio_changed) { - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -- unsigned short prev_weight, new_weight; -+ unsigned int prev_weight, new_weight; - struct bfq_data *bfqd = NULL; - struct rb_root *root; - #ifdef CONFIG_BFQ_GROUP_IOSCHED -@@ -630,7 +790,10 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - entity->new_weight > BFQ_MAX_WEIGHT) { - pr_crit("update_weight_prio: new_weight %d\n", - entity->new_weight); -- BUG(); -+ if (entity->new_weight < BFQ_MIN_WEIGHT) -+ entity->new_weight = BFQ_MIN_WEIGHT; -+ else -+ entity->new_weight = BFQ_MAX_WEIGHT; - } - entity->orig_weight = entity->new_weight; - if (bfqq) -@@ -661,6 +824,13 @@ __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, - * associated with its new weight. - */ - if (prev_weight != new_weight) { -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "weight changed %d %d(%d %d)", -+ prev_weight, new_weight, -+ entity->orig_weight, -+ bfqq->wr_coeff); -+ - root = bfqq ? 
&bfqd->queue_weights_tree : - &bfqd->group_weights_tree; - bfq_weights_tree_remove(bfqd, entity, root); -@@ -707,7 +877,7 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - st = bfq_entity_service_tree(entity); - - entity->service += served; -- BUG_ON(entity->service > entity->budget); -+ - BUG_ON(st->wsum == 0); - - st->vtime += bfq_delta(served, st->wsum); -@@ -716,234 +886,574 @@ static void bfq_bfqq_served(struct bfq_queue *bfqq, int served) - #ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_set_start_empty_time(bfqq_group(bfqq)); - #endif -- bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs", served); -+ st = bfq_entity_service_tree(&bfqq->entity); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "bfqq_served %d secs, vtime %llu on %p", -+ served, ((st->vtime>>10)*1000)>>12, st); - } - - /** -- * bfq_bfqq_charge_full_budget - set the service to the entity budget. -+ * bfq_bfqq_charge_time - charge an amount of service equivalent to the length -+ * of the time interval during which bfqq has been in -+ * service. -+ * @bfqd: the device - * @bfqq: the queue that needs a service update. -+ * @time_ms: the amount of time during which the queue has received service -+ * -+ * If a queue does not consume its budget fast enough, then providing -+ * the queue with service fairness may impair throughput, more or less -+ * severely. For this reason, queues that consume their budget slowly -+ * are provided with time fairness instead of service fairness. This -+ * goal is achieved through the BFQ scheduling engine, even if such an -+ * engine works in the service, and not in the time domain. The trick -+ * is charging these queues with an inflated amount of service, equal -+ * to the amount of service that they would have received during their -+ * service slot if they had been fast, i.e., if their requests had -+ * been dispatched at a rate equal to the estimated peak rate. 
- * -- * When it's not possible to be fair in the service domain, because -- * a queue is not consuming its budget fast enough (the meaning of -- * fast depends on the timeout parameter), we charge it a full -- * budget. In this way we should obtain a sort of time-domain -- * fairness among all the seeky/slow queues. -+ * It is worth noting that time fairness can cause important -+ * distortions in terms of bandwidth distribution, on devices with -+ * internal queueing. The reason is that I/O requests dispatched -+ * during the service slot of a queue may be served after that service -+ * slot is finished, and may have a total processing time loosely -+ * correlated with the duration of the service slot. This is -+ * especially true for short service slots. - */ --static void bfq_bfqq_charge_full_budget(struct bfq_queue *bfqq) -+static void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, -+ unsigned long time_ms) - { - struct bfq_entity *entity = &bfqq->entity; -+ int tot_serv_to_charge = entity->service; -+ unsigned int timeout_ms = jiffies_to_msecs(bfq_timeout); -+ -+ if (time_ms > 0 && time_ms < timeout_ms) -+ tot_serv_to_charge = -+ (bfqd->bfq_max_budget * time_ms) / timeout_ms; - -- bfq_log_bfqq(bfqq->bfqd, bfqq, "charge_full_budget"); -+ if (tot_serv_to_charge < entity->service) -+ tot_serv_to_charge = entity->service; - -- bfq_bfqq_served(bfqq, entity->budget - entity->service); -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "charge_time: %lu/%u ms, %d/%d/%d sectors", -+ time_ms, timeout_ms, entity->service, -+ tot_serv_to_charge, entity->budget); -+ -+ /* Increase budget to avoid inconsistencies */ -+ if (tot_serv_to_charge > entity->budget) -+ entity->budget = tot_serv_to_charge; -+ -+ bfq_bfqq_served(bfqq, -+ max_t(int, 0, tot_serv_to_charge - entity->service)); -+} -+ -+static void bfq_update_fin_time_enqueue(struct bfq_entity *entity, -+ struct bfq_service_tree *st, -+ bool backshifted) -+{ -+ struct bfq_queue *bfqq = 
bfq_entity_to_bfqq(entity); -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ st = __bfq_entity_update_weight_prio(st, entity); -+ bfq_calc_finish(entity, entity->budget); -+ -+ /* -+ * If some queues enjoy backshifting for a while, then their -+ * (virtual) finish timestamps may happen to become lower and -+ * lower than the system virtual time. In particular, if -+ * these queues often happen to be idle for short time -+ * periods, and during such time periods other queues with -+ * higher timestamps happen to be busy, then the backshifted -+ * timestamps of the former queues can become much lower than -+ * the system virtual time. In fact, to serve the queues with -+ * higher timestamps while the ones with lower timestamps are -+ * idle, the system virtual time may be pushed-up to much -+ * higher values than the finish timestamps of the idle -+ * queues. As a consequence, the finish timestamps of all new -+ * or newly activated queues may end up being much larger than -+ * those of lucky queues with backshifted timestamps. The -+ * latter queues may then monopolize the device for a lot of -+ * time. This would simply break service guarantees. -+ * -+ * To reduce this problem, push up a little bit the -+ * backshifted timestamps of the queue associated with this -+ * entity (only a queue can happen to have the backshifted -+ * flag set): just enough to let the finish timestamp of the -+ * queue be equal to the current value of the system virtual -+ * time. This may introduce a little unfairness among queues -+ * with backshifted timestamps, but it does not break -+ * worst-case fairness guarantees. -+ * -+ * As a special case, if bfqq is weight-raised, push up -+ * timestamps much less, to keep very low the probability that -+ * this push up causes the backshifted finish timestamps of -+ * weight-raised queues to become higher than the backshifted -+ * finish timestamps of non weight-raised queues. 
-+ */ -+ if (backshifted && bfq_gt(st->vtime, entity->finish)) { -+ unsigned long delta = st->vtime - entity->finish; -+ -+ if (bfqq) -+ delta /= bfqq->wr_coeff; -+ -+ entity->start += delta; -+ entity->finish += delta; -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: new queue finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: new group finish %llu", -+ ((entity->finish>>10)*1000)>>12); -+#endif -+ } -+ } -+ -+ bfq_active_insert(st, entity); -+ -+ if (bfqq) { -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__activate_entity: queue %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ } else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__activate_entity: group %seligible in st %p", -+ entity->start <= st->vtime ? "" : "non ", st); -+#endif -+ } -+ BUG_ON(RB_EMPTY_ROOT(&st->active)); -+ BUG_ON(&st->active != &sd->service_tree->active && -+ &st->active != &(sd->service_tree+1)->active && -+ &st->active != &(sd->service_tree+2)->active); - } - - /** -- * __bfq_activate_entity - activate an entity. -+ * __bfq_activate_entity - handle activation of entity. - * @entity: the entity being activated. -+ * @non_blocking_wait_rq: true if entity was waiting for a request -+ * -+ * Called for a 'true' activation, i.e., if entity is not active and -+ * one of its children receives a new request. - * -- * Called whenever an entity is activated, i.e., it is not active and one -- * of its children receives a new request, or has to be reactivated due to -- * budget exhaustion. It uses the current budget of the entity (and the -- * service received if @entity is active) of the queue to calculate its -- * timestamps. 
-+ * Basically, this function updates the timestamps of entity and -+ * inserts entity into its active tree, ater possible extracting it -+ * from its idle tree. - */ --static void __bfq_activate_entity(struct bfq_entity *entity) -+static void __bfq_activate_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq) - { - struct bfq_sched_data *sd = entity->sched_data; - struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ bool backshifted = false; -+ unsigned long long min_vstart; - -- if (entity == sd->in_service_entity) { -- BUG_ON(entity->tree); -- /* -- * If we are requeueing the current entity we have -- * to take care of not charging to it service it has -- * not received. -- */ -- bfq_calc_finish(entity, entity->service); -- entity->start = entity->finish; -- sd->in_service_entity = NULL; -- } else if (entity->tree == &st->active) { -- /* -- * Requeueing an entity due to a change of some -- * next_in_service entity below it. We reuse the -- * old start time. -- */ -- bfq_active_extract(st, entity); -- } else if (entity->tree == &st->idle) { -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ /* See comments on bfq_fqq_update_budg_for_activation */ -+ if (non_blocking_wait_rq && bfq_gt(st->vtime, entity->finish)) { -+ backshifted = true; -+ min_vstart = entity->finish; -+ } else -+ min_vstart = st->vtime; -+ -+ if (entity->tree == &st->idle) { - /* - * Must be on the idle tree, bfq_idle_extract() will - * check for that. - */ - bfq_idle_extract(st, entity); -- entity->start = bfq_gt(st->vtime, entity->finish) ? -- st->vtime : entity->finish; -+ entity->start = bfq_gt(min_vstart, entity->finish) ? -+ min_vstart : entity->finish; - } else { - /* - * The finish time of the entity may be invalid, and - * it is in the past for sure, otherwise the queue - * would have been on the idle tree. 
- */ -- entity->start = st->vtime; -+ entity->start = min_vstart; - st->wsum += entity->weight; - bfq_get_entity(entity); - -- BUG_ON(entity->on_st); -- entity->on_st = 1; -+ BUG_ON(entity->on_st && bfqq); -+ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ if (entity->on_st && !bfqq) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, -+ bfqg, -+ "activate bug, class %d in_service %p", -+ bfq_class_idx(entity), sd->in_service_entity); -+ } -+#endif -+ BUG_ON(entity->on_st && !bfqq); -+ entity->on_st = true; - } - -- st = __bfq_entity_update_weight_prio(st, entity); -- bfq_calc_finish(entity, entity->budget); -- bfq_active_insert(st, entity); -+ bfq_update_fin_time_enqueue(entity, st, backshifted); - } - - /** -- * bfq_activate_entity - activate an entity and its ancestors if necessary. -- * @entity: the entity to activate. -+ * __bfq_requeue_entity - handle requeueing or repositioning of an entity. -+ * @entity: the entity being requeued or repositioned. -+ * -+ * Requeueing is needed if this entity stops being served, which -+ * happens if a leaf descendant entity has expired. On the other hand, -+ * repositioning is needed if the next_inservice_entity for the child -+ * entity has changed. See the comments inside the function for -+ * details. - * -- * Activate @entity and all the entities on the path from it to the root. -+ * Basically, this function: 1) removes entity from its active tree if -+ * present there, 2) updates the timestamps of entity and 3) inserts -+ * entity back into its active tree (in the new, right position for -+ * the new values of the timestamps). 
- */ --static void bfq_activate_entity(struct bfq_entity *entity) -+static void __bfq_requeue_entity(struct bfq_entity *entity) -+{ -+ struct bfq_sched_data *sd = entity->sched_data; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree != &st->active); -+ -+ if (entity == sd->in_service_entity) { -+ /* -+ * We are requeueing the current in-service entity, -+ * which may have to be done for one of the following -+ * reasons: -+ * - entity represents the in-service queue, and the -+ * in-service queue is being requeued after an -+ * expiration; -+ * - entity represents a group, and its budget has -+ * changed because one of its child entities has -+ * just been either activated or requeued for some -+ * reason; the timestamps of the entity need then to -+ * be updated, and the entity needs to be enqueued -+ * or repositioned accordingly. -+ * -+ * In particular, before requeueing, the start time of -+ * the entity must be moved forward to account for the -+ * service that the entity has received while in -+ * service. This is done by the next instructions. The -+ * finish time will then be updated according to this -+ * new value of the start time, and to the budget of -+ * the entity. -+ */ -+ bfq_calc_finish(entity, entity->service); -+ entity->start = entity->finish; -+ BUG_ON(entity->tree && entity->tree != &st->active); -+ /* -+ * In addition, if the entity had more than one child -+ * when set in service, then was not extracted from -+ * the active tree. This implies that the position of -+ * the entity in the active tree may need to be -+ * changed now, because we have just updated the start -+ * time of the entity, and we will update its finish -+ * time in a moment (the requeueing is then, more -+ * precisely, a repositioning in this case). 
To -+ * implement this repositioning, we: 1) dequeue the -+ * entity here, 2) update the finish time and -+ * requeue the entity according to the new -+ * timestamps below. -+ */ -+ if (entity->tree) -+ bfq_active_extract(st, entity); -+ } else { /* The entity is already active, and not in service */ -+ /* -+ * In this case, this function gets called only if the -+ * next_in_service entity below this entity has -+ * changed, and this change has caused the budget of -+ * this entity to change, which, finally implies that -+ * the finish time of this entity must be -+ * updated. Such an update may cause the scheduling, -+ * i.e., the position in the active tree, of this -+ * entity to change. We handle this change by: 1) -+ * dequeueing the entity here, 2) updating the finish -+ * time and requeueing the entity according to the new -+ * timestamps below. This is the same approach as the -+ * non-extracted-entity sub-case above. -+ */ -+ bfq_active_extract(st, entity); -+ } -+ -+ bfq_update_fin_time_enqueue(entity, st, false); -+} -+ -+static void __bfq_activate_requeue_entity(struct bfq_entity *entity, -+ struct bfq_sched_data *sd, -+ bool non_blocking_wait_rq) -+{ -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ if (sd->in_service_entity == entity || entity->tree == &st->active) -+ /* -+ * in service or already queued on the active tree, -+ * requeue or reposition -+ */ -+ __bfq_requeue_entity(entity); -+ else -+ /* -+ * Not in service and not queued on its active tree: -+ * the activity is idle and this is a true activation. -+ */ -+ __bfq_activate_entity(entity, non_blocking_wait_rq); -+} -+ -+ -+/** -+ * bfq_activate_entity - activate or requeue an entity representing a bfq_queue, -+ * and activate, requeue or reposition all ancestors -+ * for which such an update becomes necessary. -+ * @entity: the entity to activate. 
-+ * @non_blocking_wait_rq: true if this entity was waiting for a request -+ * @requeue: true if this is a requeue, which implies that bfqq is -+ * being expired; thus ALL its ancestors stop being served and must -+ * therefore be requeued -+ */ -+static void bfq_activate_requeue_entity(struct bfq_entity *entity, -+ bool non_blocking_wait_rq, -+ bool requeue) - { - struct bfq_sched_data *sd; - - for_each_entity(entity) { -- __bfq_activate_entity(entity); -- -+ BUG_ON(!entity); - sd = entity->sched_data; -- if (!bfq_update_next_in_service(sd)) -- /* -- * No need to propagate the activation to the -- * upper entities, as they will be updated when -- * the in-service entity is rescheduled. -- */ -+ __bfq_activate_requeue_entity(entity, sd, non_blocking_wait_rq); -+ -+ BUG_ON(RB_EMPTY_ROOT(&sd->service_tree->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+1)->active) && -+ RB_EMPTY_ROOT(&(sd->service_tree+2)->active)); -+ -+ if (!bfq_update_next_in_service(sd, entity) && !requeue) { -+ BUG_ON(!sd->next_in_service); - break; -+ } -+ BUG_ON(!sd->next_in_service); - } - } - - /** - * __bfq_deactivate_entity - deactivate an entity from its service tree. - * @entity: the entity to deactivate. -- * @requeue: if false, the entity will not be put into the idle tree. -- * -- * Deactivate an entity, independently from its previous state. If the -- * entity was not on a service tree just return, otherwise if it is on -- * any scheduler tree, extract it from that tree, and if necessary -- * and if the caller did not specify @requeue, put it on the idle tree. -+ * @ins_into_idle_tree: if false, the entity will not be put into the -+ * idle tree. - * -- * Return %1 if the caller should update the entity hierarchy, i.e., -- * if the entity was in service or if it was the next_in_service for -- * its sched_data; return %0 otherwise. -+ * Deactivates an entity, independently from its previous state. Must -+ * be invoked only if entity is on a service tree. 
Extracts the entity -+ * from that tree, and if necessary and allowed, puts it on the idle -+ * tree. - */ --static int __bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+static bool __bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree) - { - struct bfq_sched_data *sd = entity->sched_data; -- struct bfq_service_tree *st; -- int was_in_service; -- int ret = 0; -- -- if (sd == NULL || !entity->on_st) /* never activated, or inactive */ -- return 0; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ bool was_in_service = entity == sd->in_service_entity; - -- st = bfq_entity_service_tree(entity); -- was_in_service = entity == sd->in_service_entity; -+ if (!entity->on_st) { /* entity never activated, or already inactive */ -+ BUG_ON(entity == entity->sched_data->in_service_entity); -+ return false; -+ } - -- BUG_ON(was_in_service && entity->tree); -+ BUG_ON(was_in_service && entity->tree && entity->tree != &st->active); - -- if (was_in_service) { -+ if (was_in_service) - bfq_calc_finish(entity, entity->service); -- sd->in_service_entity = NULL; -- } else if (entity->tree == &st->active) -+ -+ if (entity->tree == &st->active) - bfq_active_extract(st, entity); -- else if (entity->tree == &st->idle) -+ else if (!was_in_service && entity->tree == &st->idle) - bfq_idle_extract(st, entity); - else if (entity->tree) - BUG(); - -- if (was_in_service || sd->next_in_service == entity) -- ret = bfq_update_next_in_service(sd); -- -- if (!requeue || !bfq_gt(entity->finish, st->vtime)) -+ if (!ins_into_idle_tree || !bfq_gt(entity->finish, st->vtime)) - bfq_forget_entity(st, entity); - else - bfq_idle_insert(st, entity); - -- BUG_ON(sd->in_service_entity == entity); -- BUG_ON(sd->next_in_service == entity); -- -- return ret; -+ return true; - } - - /** -- * bfq_deactivate_entity - deactivate an entity. -+ * bfq_deactivate_entity - deactivate an entity representing a bfq_queue. - * @entity: the entity to deactivate. 
-- * @requeue: true if the entity can be put on the idle tree -+ * @ins_into_idle_tree: true if the entity can be put on the idle tree - */ --static void bfq_deactivate_entity(struct bfq_entity *entity, int requeue) -+static void bfq_deactivate_entity(struct bfq_entity *entity, -+ bool ins_into_idle_tree, -+ bool expiration) - { - struct bfq_sched_data *sd; -- struct bfq_entity *parent; -+ struct bfq_entity *parent = NULL; - - for_each_entity_safe(entity, parent) { - sd = entity->sched_data; - -- if (!__bfq_deactivate_entity(entity, requeue)) -+ BUG_ON(sd == NULL); /* -+ * It would mean that this is the -+ * root group. -+ */ -+ -+ BUG_ON(expiration && entity != sd->in_service_entity); -+ -+ BUG_ON(entity != sd->in_service_entity && -+ entity->tree == -+ &bfq_entity_service_tree(entity)->active && -+ !sd->next_in_service); -+ -+ if (!__bfq_deactivate_entity(entity, ins_into_idle_tree)) { - /* -- * The parent entity is still backlogged, and -- * we don't need to update it as it is still -- * in service. -+ * Entity is not any tree any more, so, this -+ * deactivation is a no-op, and there is -+ * nothing to change for upper-level entities -+ * (in case of expiration, this can never -+ * happen). - */ -- break; -+ BUG_ON(expiration); /* -+ * entity cannot be already out of -+ * any tree -+ */ -+ return; -+ } - -- if (sd->next_in_service) -+ if (sd->next_in_service == entity) - /* -- * The parent entity is still backlogged and -- * the budgets on the path towards the root -- * need to be updated. -+ * entity was the next_in_service entity, -+ * then, since entity has just been -+ * deactivated, a new one must be found. - */ -- goto update; -+ bfq_update_next_in_service(sd, NULL); -+ -+ if (sd->next_in_service) { -+ /* -+ * The parent entity is still backlogged, -+ * because next_in_service is not NULL. So, no -+ * further upwards deactivation must be -+ * performed. Yet, next_in_service has -+ * changed. Then the schedule does need to be -+ * updated upwards. 
-+ */ -+ BUG_ON(sd->next_in_service == entity); -+ break; -+ } - - /* -- * If we reach there the parent is no more backlogged and -- * we want to propagate the dequeue upwards. -+ * If we get here, then the parent is no more -+ * backlogged and we need to propagate the -+ * deactivation upwards. Thus let the loop go on. - */ -- requeue = 1; -- } - -- return; -+ /* -+ * Also let parent be queued into the idle tree on -+ * deactivation, to preserve service guarantees, and -+ * assuming that who invoked this function does not -+ * need parent entities too to be removed completely. -+ */ -+ ins_into_idle_tree = true; -+ } - --update: -+ /* -+ * If the deactivation loop is fully executed, then there are -+ * no more entities to touch and next loop is not executed at -+ * all. Otherwise, requeue remaining entities if they are -+ * about to stop receiving service, or reposition them if this -+ * is not the case. -+ */ - entity = parent; - for_each_entity(entity) { -- __bfq_activate_entity(entity); -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ /* -+ * Invoke __bfq_requeue_entity on entity, even if -+ * already active, to requeue/reposition it in the -+ * active tree (because sd->next_in_service has -+ * changed) -+ */ -+ __bfq_requeue_entity(entity); - - sd = entity->sched_data; -- if (!bfq_update_next_in_service(sd)) -+ BUG_ON(expiration && sd->in_service_entity != entity); -+ -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "invoking udpdate_next for this queue"); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, -+ struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "invoking udpdate_next for this entity"); -+ } -+#endif -+ if (!bfq_update_next_in_service(sd, entity) && -+ !expiration) -+ /* -+ * next_in_service unchanged or not causing -+ * any change in entity->parent->sd, and no -+ * requeueing needed for expiration: stop -+ * here. 
-+ */ - break; - } - } - - /** -- * bfq_update_vtime - update vtime if necessary. -+ * bfq_calc_vtime_jump - compute the value to which the vtime should jump, -+ * if needed, to have at least one entity eligible. - * @st: the service tree to act upon. - * -- * If necessary update the service tree vtime to have at least one -- * eligible entity, skipping to its start time. Assumes that the -- * active tree of the device is not empty. -- * -- * NOTE: this hierarchical implementation updates vtimes quite often, -- * we may end up with reactivated processes getting timestamps after a -- * vtime skip done because we needed a ->first_active entity on some -- * intermediate node. -+ * Assumes that st is not empty. - */ --static void bfq_update_vtime(struct bfq_service_tree *st) -+static u64 bfq_calc_vtime_jump(struct bfq_service_tree *st) - { -- struct bfq_entity *entry; -- struct rb_node *node = st->active.rb_node; -+ struct bfq_entity *root_entity = bfq_root_active_entity(&st->active); -+ -+ if (bfq_gt(root_entity->min_start, st->vtime)) { -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(root_entity); - -- entry = rb_entry(node, struct bfq_entity, rb_node); -- if (bfq_gt(entry->min_start, st->vtime)) { -- st->vtime = entry->min_start; -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(root_entity, struct bfq_group, -+ entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "calc_vtime_jump: new value %llu", -+ root_entity->min_start); -+ } -+#endif -+ return root_entity->min_start; -+ } -+ return st->vtime; -+} -+ -+static void bfq_update_vtime(struct bfq_service_tree *st, u64 new_value) -+{ -+ if (new_value > st->vtime) { -+ st->vtime = new_value; - bfq_forget_idle(st); - } - } -@@ -952,6 +1462,7 @@ static void bfq_update_vtime(struct bfq_service_tree *st) - * bfq_first_active_entity - find the eligible entity 
with - * the smallest finish time - * @st: the service tree to select from. -+ * @vtime: the system virtual to use as a reference for eligibility - * - * This function searches the first schedulable entity, starting from the - * root of the tree and going on the left every time on this side there is -@@ -959,7 +1470,8 @@ static void bfq_update_vtime(struct bfq_service_tree *st) - * the right is followed only if a) the left subtree contains no eligible - * entities and b) no eligible entity has been found yet. - */ --static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) -+static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st, -+ u64 vtime) - { - struct bfq_entity *entry, *first = NULL; - struct rb_node *node = st->active.rb_node; -@@ -967,15 +1479,15 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) - while (node) { - entry = rb_entry(node, struct bfq_entity, rb_node); - left: -- if (!bfq_gt(entry->start, st->vtime)) -+ if (!bfq_gt(entry->start, vtime)) - first = entry; - -- BUG_ON(bfq_gt(entry->min_start, st->vtime)); -+ BUG_ON(bfq_gt(entry->min_start, vtime)); - - if (node->rb_left) { - entry = rb_entry(node->rb_left, - struct bfq_entity, rb_node); -- if (!bfq_gt(entry->min_start, st->vtime)) { -+ if (!bfq_gt(entry->min_start, vtime)) { - node = node->rb_left; - goto left; - } -@@ -993,31 +1505,84 @@ static struct bfq_entity *bfq_first_active_entity(struct bfq_service_tree *st) - * __bfq_lookup_next_entity - return the first eligible entity in @st. - * @st: the service tree. - * -- * Update the virtual time in @st and return the first eligible entity -- * it contains. 
-+ * If there is no in-service entity for the sched_data st belongs to, -+ * then return the entity that will be set in service if: -+ * 1) the parent entity this st belongs to is set in service; -+ * 2) no entity belonging to such parent entity undergoes a state change -+ * that would influence the timestamps of the entity (e.g., becomes idle, -+ * becomes backlogged, changes its budget, ...). -+ * -+ * In this first case, update the virtual time in @st too (see the -+ * comments on this update inside the function). -+ * -+ * In constrast, if there is an in-service entity, then return the -+ * entity that would be set in service if not only the above -+ * conditions, but also the next one held true: the currently -+ * in-service entity, on expiration, -+ * 1) gets a finish time equal to the current one, or -+ * 2) is not eligible any more, or -+ * 3) is idle. - */ --static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, -- bool force) -+static struct bfq_entity * -+__bfq_lookup_next_entity(struct bfq_service_tree *st, bool in_service -+#if 0 -+ , bool force -+#endif -+ ) - { -- struct bfq_entity *entity, *new_next_in_service = NULL; -+ struct bfq_entity *entity -+#if 0 -+ , *new_next_in_service = NULL -+#endif -+ ; -+ u64 new_vtime; -+ struct bfq_queue *bfqq; - - if (RB_EMPTY_ROOT(&st->active)) - return NULL; - -- bfq_update_vtime(st); -- entity = bfq_first_active_entity(st); -- BUG_ON(bfq_gt(entity->start, st->vtime)); -+ /* -+ * Get the value of the system virtual time for which at -+ * least one entity is eligible. -+ */ -+ new_vtime = bfq_calc_vtime_jump(st); - - /* -- * If the chosen entity does not match with the sched_data's -- * next_in_service and we are forcedly serving the IDLE priority -- * class tree, bubble up budget update. -+ * If there is no in-service entity for the sched_data this -+ * active tree belongs to, then push the system virtual time -+ * up to the value that guarantees that at least one entity is -+ * eligible. 
If, instead, there is an in-service entity, then -+ * do not make any such update, because there is already an -+ * eligible entity, namely the in-service one (even if the -+ * entity is not on st, because it was extracted when set in -+ * service). - */ -- if (unlikely(force && entity != entity->sched_data->next_in_service)) { -- new_next_in_service = entity; -- for_each_entity(new_next_in_service) -- bfq_update_budget(new_next_in_service); -+ if (!in_service) -+ bfq_update_vtime(st, new_vtime); -+ -+ entity = bfq_first_active_entity(st, new_vtime); -+ BUG_ON(bfq_gt(entity->start, new_vtime)); -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "__lookup_next: start %llu vtime %llu st %p", -+ ((entity->start>>10)*1000)>>12, -+ ((new_vtime>>10)*1000)>>12, st); - } -+#endif -+ -+ BUG_ON(!entity); - - return entity; - } -@@ -1025,50 +1590,81 @@ static struct bfq_entity *__bfq_lookup_next_entity(struct bfq_service_tree *st, - /** - * bfq_lookup_next_entity - return the first eligible entity in @sd. - * @sd: the sched_data. -- * @extract: if true the returned entity will be also extracted from @sd. - * -- * NOTE: since we cache the next_in_service entity at each level of the -- * hierarchy, the complexity of the lookup can be decreased with -- * absolutely no effort just returning the cached next_in_service value; -- * we prefer to do full lookups to test the consistency of * the data -- * structures. -+ * This function is invoked when there has been a change in the trees -+ * for sd, and we need know what is the new next entity after this -+ * change. 
- */ --static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd, -- int extract, -- struct bfq_data *bfqd) -+static struct bfq_entity *bfq_lookup_next_entity(struct bfq_sched_data *sd) - { - struct bfq_service_tree *st = sd->service_tree; -- struct bfq_entity *entity; -- int i = 0; -- -- BUG_ON(sd->in_service_entity); -+ struct bfq_service_tree *idle_class_st = st + (BFQ_IOPRIO_CLASSES - 1); -+ struct bfq_entity *entity = NULL; -+ struct bfq_queue *bfqq; -+ int class_idx = 0; - -- if (bfqd && -- jiffies - bfqd->bfq_class_idle_last_service > BFQ_CL_IDLE_TIMEOUT) { -- entity = __bfq_lookup_next_entity(st + BFQ_IOPRIO_CLASSES - 1, -- true); -- if (entity) { -- i = BFQ_IOPRIO_CLASSES - 1; -- bfqd->bfq_class_idle_last_service = jiffies; -- sd->next_in_service = entity; -- } -+ BUG_ON(!sd); -+ BUG_ON(!st); -+ /* -+ * Choose from idle class, if needed to guarantee a minimum -+ * bandwidth to this class (and if there is some active entity -+ * in idle class). This should also mitigate -+ * priority-inversion problems in case a low priority task is -+ * holding file system resources. -+ */ -+ if (time_is_before_jiffies(sd->bfq_class_idle_last_service + -+ BFQ_CL_IDLE_TIMEOUT)) { -+ if (!RB_EMPTY_ROOT(&idle_class_st->active)) -+ class_idx = BFQ_IOPRIO_CLASSES - 1; -+ /* About to be served if backlogged, or not yet backlogged */ -+ sd->bfq_class_idle_last_service = jiffies; - } -- for (; i < BFQ_IOPRIO_CLASSES; i++) { -- entity = __bfq_lookup_next_entity(st + i, false); -- if (entity) { -- if (extract) { -- bfq_check_next_in_service(sd, entity); -- bfq_active_extract(st + i, entity); -- sd->in_service_entity = entity; -- sd->next_in_service = NULL; -- } -+ -+ /* -+ * Find the next entity to serve for the highest-priority -+ * class, unless the idle class needs to be served. 
-+ */ -+ for (; class_idx < BFQ_IOPRIO_CLASSES; class_idx++) { -+ entity = __bfq_lookup_next_entity(st + class_idx, -+ sd->in_service_entity); -+ -+ if (entity) - break; -- } - } - -+ BUG_ON(!entity && -+ (!RB_EMPTY_ROOT(&st->active) || !RB_EMPTY_ROOT(&(st+1)->active) || -+ !RB_EMPTY_ROOT(&(st+2)->active))); -+ -+ if (!entity) -+ return NULL; -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, "chosen from st %p %d", -+ st + class_idx, class_idx); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "chosen from st %p %d", -+ st + class_idx, class_idx); -+ } -+#endif -+ - return entity; - } - -+static bool next_queue_may_preempt(struct bfq_data *bfqd) -+{ -+ struct bfq_sched_data *sd = &bfqd->root_group->sched_data; -+ -+ return sd->next_in_service != sd->in_service_entity; -+} -+ - /* - * Get next queue for service. - */ -@@ -1083,58 +1679,208 @@ static struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd) - if (bfqd->busy_queues == 0) - return NULL; - -+ /* -+ * Traverse the path from the root to the leaf entity to -+ * serve. Set in service all the entities visited along the -+ * way. -+ */ - sd = &bfqd->root_group->sched_data; - for (; sd ; sd = entity->my_sched_data) { -- entity = bfq_lookup_next_entity(sd, 1, bfqd); -- BUG_ON(!entity); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ if (entity) { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: lookup in this group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in this group"); -+ } else { -+ bfq_log_bfqg(bfqd, bfqd->root_group, -+ "get_next_queue: lookup in root group"); -+ if (!sd->next_in_service) -+ pr_crit("get_next_queue: lookup in root group"); -+ } -+#endif -+ -+ BUG_ON(!sd->next_in_service); -+ -+ /* -+ * WARNING. 
We are about to set the in-service entity -+ * to sd->next_in_service, i.e., to the (cached) value -+ * returned by bfq_lookup_next_entity(sd) the last -+ * time it was invoked, i.e., the last time when the -+ * service order in sd changed as a consequence of the -+ * activation or deactivation of an entity. In this -+ * respect, if we execute bfq_lookup_next_entity(sd) -+ * in this very moment, it may, although with low -+ * probability, yield a different entity than that -+ * pointed to by sd->next_in_service. This rare event -+ * happens in case there was no CLASS_IDLE entity to -+ * serve for sd when bfq_lookup_next_entity(sd) was -+ * invoked for the last time, while there is now one -+ * such entity. -+ * -+ * If the above event happens, then the scheduling of -+ * such entity in CLASS_IDLE is postponed until the -+ * service of the sd->next_in_service entity -+ * finishes. In fact, when the latter is expired, -+ * bfq_lookup_next_entity(sd) gets called again, -+ * exactly to update sd->next_in_service. -+ */ -+ -+ /* Make next_in_service entity become in_service_entity */ -+ entity = sd->next_in_service; -+ sd->in_service_entity = entity; -+ -+ /* -+ * Reset the accumulator of the amount of service that -+ * the entity is about to receive. -+ */ - entity->service = 0; -+ -+ /* -+ * If entity is no longer a candidate for next -+ * service, then we extract it from its active tree, -+ * for the following reason. To further boost the -+ * throughput in some special case, BFQ needs to know -+ * which is the next candidate entity to serve, while -+ * there is already an entity in service. In this -+ * respect, to make it easy to compute/update the next -+ * candidate entity to serve after the current -+ * candidate has been set in service, there is a case -+ * where it is necessary to extract the current -+ * candidate from its service tree. Such a case is -+ * when the entity just set in service cannot be also -+ * a candidate for next service. 
Details about when -+ * this conditions holds are reported in the comments -+ * on the function bfq_no_longer_next_in_service() -+ * invoked below. -+ */ -+ if (bfq_no_longer_next_in_service(entity)) -+ bfq_active_extract(bfq_entity_service_tree(entity), -+ entity); -+ -+ /* -+ * For the same reason why we may have just extracted -+ * entity from its active tree, we may need to update -+ * next_in_service for the sched_data of entity too, -+ * regardless of whether entity has been extracted. -+ * In fact, even if entity has not been extracted, a -+ * descendant entity may get extracted. Such an event -+ * would cause a change in next_in_service for the -+ * level of the descendant entity, and thus possibly -+ * back to upper levels. -+ * -+ * We cannot perform the resulting needed update -+ * before the end of this loop, because, to know which -+ * is the correct next-to-serve candidate entity for -+ * each level, we need first to find the leaf entity -+ * to set in service. In fact, only after we know -+ * which is the next-to-serve leaf entity, we can -+ * discover whether the parent entity of the leaf -+ * entity becomes the next-to-serve, and so on. -+ */ -+ -+ /* Log some information */ -+ bfqq = bfq_entity_to_bfqq(entity); -+ if (bfqq) -+ bfq_log_bfqq(bfqd, bfqq, -+ "get_next_queue: this queue, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg(bfqd, bfqg, -+ "get_next_queue: this entity, finish %llu", -+ (((entity->finish>>10)*1000)>>10)>>2); -+ } -+#endif -+ - } - -+ BUG_ON(!entity); - bfqq = bfq_entity_to_bfqq(entity); - BUG_ON(!bfqq); - -+ /* -+ * We can finally update all next-to-serve entities along the -+ * path from the leaf entity just set in service to the root. 
-+ */ -+ for_each_entity(entity) { -+ struct bfq_sched_data *sd = entity->sched_data; -+ -+ if(!bfq_update_next_in_service(sd, NULL)) -+ break; -+ } -+ - return bfqq; - } - - static void __bfq_bfqd_reset_in_service(struct bfq_data *bfqd) - { -+ struct bfq_entity *entity = &bfqd->in_service_queue->entity; -+ - if (bfqd->in_service_bic) { - put_io_context(bfqd->in_service_bic->icq.ioc); - bfqd->in_service_bic = NULL; - } - -+ bfq_clear_bfqq_wait_request(bfqd->in_service_queue); -+ hrtimer_try_to_cancel(&bfqd->idle_slice_timer); - bfqd->in_service_queue = NULL; -- del_timer(&bfqd->idle_slice_timer); -+ -+ /* -+ * When this function is called, all in-service entities have -+ * been properly deactivated or requeued, so we can safely -+ * execute the final step: reset in_service_entity along the -+ * path from entity to the root. -+ */ -+ for_each_entity(entity) -+ entity->sched_data->in_service_entity = NULL; - } - - static void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- int requeue) -+ bool ins_into_idle_tree, bool expiration) - { - struct bfq_entity *entity = &bfqq->entity; - -- if (bfqq == bfqd->in_service_queue) -- __bfq_bfqd_reset_in_service(bfqd); -- -- bfq_deactivate_entity(entity, requeue); -+ bfq_deactivate_entity(entity, ins_into_idle_tree, expiration); - } - - static void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) - { - struct bfq_entity *entity = &bfqq->entity; -+ struct bfq_service_tree *st = bfq_entity_service_tree(entity); -+ -+ BUG_ON(bfqq == bfqd->in_service_queue); -+ BUG_ON(entity->tree != &st->active && entity->tree != &st->idle && -+ entity->on_st); - -- bfq_activate_entity(entity); -+ bfq_activate_requeue_entity(entity, bfq_bfqq_non_blocking_wait_rq(bfqq), -+ false); -+ bfq_clear_bfqq_non_blocking_wait_rq(bfqq); -+} -+ -+static void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq) -+{ -+ struct bfq_entity *entity = &bfqq->entity; -+ -+ bfq_activate_requeue_entity(entity, false, -+ bfqq 
== bfqd->in_service_queue); - } - --#ifdef CONFIG_BFQ_GROUP_IOSCHED - static void bfqg_stats_update_dequeue(struct bfq_group *bfqg); --#endif - - /* - * Called when the bfqq no longer has requests pending, remove it from -- * the service tree. -+ * the service tree. As a special case, it can be invoked during an -+ * expiration. - */ - static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, -- int requeue) -+ bool expiration) - { - BUG_ON(!bfq_bfqq_busy(bfqq)); - BUG_ON(!RB_EMPTY_ROOT(&bfqq->sort_list)); -@@ -1146,27 +1892,20 @@ static void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, - BUG_ON(bfqd->busy_queues == 0); - bfqd->busy_queues--; - -- if (!bfqq->dispatched) { -+ if (!bfqq->dispatched) - bfq_weights_tree_remove(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); -- if (!blk_queue_nonrot(bfqd->queue)) { -- BUG_ON(!bfqd->busy_in_flight_queues); -- bfqd->busy_in_flight_queues--; -- if (bfq_bfqq_constantly_seeky(bfqq)) { -- BUG_ON(!bfqd-> -- const_seeky_busy_in_flight_queues); -- bfqd->const_seeky_busy_in_flight_queues--; -- } -- } -- } -+ - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues--; - --#ifdef CONFIG_BFQ_GROUP_IOSCHED - bfqg_stats_update_dequeue(bfqq_group(bfqq)); --#endif - -- bfq_deactivate_bfqq(bfqd, bfqq, requeue); -+ BUG_ON(bfqq->entity.budget < 0); -+ -+ bfq_deactivate_bfqq(bfqd, bfqq, true, expiration); -+ -+ BUG_ON(bfqq->entity.budget < 0); - } - - /* -@@ -1184,16 +1923,11 @@ static void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq) - bfq_mark_bfqq_busy(bfqq); - bfqd->busy_queues++; - -- if (!bfqq->dispatched) { -+ if (!bfqq->dispatched) - if (bfqq->wr_coeff == 1) - bfq_weights_tree_add(bfqd, &bfqq->entity, - &bfqd->queue_weights_tree); -- if (!blk_queue_nonrot(bfqd->queue)) { -- bfqd->busy_in_flight_queues++; -- if (bfq_bfqq_constantly_seeky(bfqq)) -- bfqd->const_seeky_busy_in_flight_queues++; -- } -- } -+ - if (bfqq->wr_coeff > 1) - bfqd->wr_busy_queues++; - } -diff --git 
a/block/bfq.h b/block/bfq.h -index fcce855..2a2bc30 100644 ---- a/block/bfq.h -+++ b/block/bfq.h -@@ -1,5 +1,5 @@ - /* -- * BFQ-v7r11 for 4.5.0: data structures and common functions prototypes. -+ * BFQ v8r8 for 4.10.0: data structures and common functions prototypes. - * - * Based on ideas and code from CFQ: - * Copyright (C) 2003 Jens Axboe -@@ -7,7 +7,9 @@ - * Copyright (C) 2008 Fabio Checconi - * Paolo Valente - * -- * Copyright (C) 2010 Paolo Valente -+ * Copyright (C) 2015 Paolo Valente -+ * -+ * Copyright (C) 2017 Paolo Valente - */ - - #ifndef _BFQ_H -@@ -28,20 +30,21 @@ - - #define BFQ_DEFAULT_QUEUE_IOPRIO 4 - --#define BFQ_DEFAULT_GRP_WEIGHT 10 -+#define BFQ_WEIGHT_LEGACY_DFL 100 - #define BFQ_DEFAULT_GRP_IOPRIO 0 - #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE - -+/* -+ * Soft real-time applications are extremely more latency sensitive -+ * than interactive ones. Over-raise the weight of the former to -+ * privilege them against the latter. -+ */ -+#define BFQ_SOFTRT_WEIGHT_FACTOR 100 -+ - struct bfq_entity; - - /** - * struct bfq_service_tree - per ioprio_class service tree. -- * @active: tree for active entities (i.e., those backlogged). -- * @idle: tree for idle entities (i.e., those not backlogged, with V <= F_i). -- * @first_idle: idle entity with minimum F_i. -- * @last_idle: idle entity with maximum F_i. -- * @vtime: scheduler virtual time. -- * @wsum: scheduler weight sum; active and idle entities contribute to it. - * - * Each service tree represents a B-WF2Q+ scheduler on its own. Each - * ioprio_class has its own independent scheduler, and so its own -@@ -49,27 +52,28 @@ struct bfq_entity; - * of the containing bfqd. 
- */ - struct bfq_service_tree { -+ /* tree for active entities (i.e., those backlogged) */ - struct rb_root active; -+ /* tree for idle entities (i.e., not backlogged, with V <= F_i)*/ - struct rb_root idle; - -- struct bfq_entity *first_idle; -- struct bfq_entity *last_idle; -+ struct bfq_entity *first_idle; /* idle entity with minimum F_i */ -+ struct bfq_entity *last_idle; /* idle entity with maximum F_i */ - -- u64 vtime; -+ u64 vtime; /* scheduler virtual time */ -+ /* scheduler weight sum; active and idle entities contribute to it */ - unsigned long wsum; - }; - - /** - * struct bfq_sched_data - multi-class scheduler. -- * @in_service_entity: entity in service. -- * @next_in_service: head-of-the-line entity in the scheduler. -- * @service_tree: array of service trees, one per ioprio_class. - * - * bfq_sched_data is the basic scheduler queue. It supports three -- * ioprio_classes, and can be used either as a toplevel queue or as -- * an intermediate queue on a hierarchical setup. -- * @next_in_service points to the active entity of the sched_data -- * service trees that will be scheduled next. -+ * ioprio_classes, and can be used either as a toplevel queue or as an -+ * intermediate queue on a hierarchical setup. @next_in_service -+ * points to the active entity of the sched_data service trees that -+ * will be scheduled next. It is used to reduce the number of steps -+ * needed for each hierarchical-schedule update. - * - * The supported ioprio_classes are the same as in CFQ, in descending - * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. -@@ -79,48 +83,32 @@ struct bfq_service_tree { - * All the fields are protected by the queue lock of the containing bfqd. 
- */ - struct bfq_sched_data { -- struct bfq_entity *in_service_entity; -+ struct bfq_entity *in_service_entity; /* entity in service */ -+ /* head-of-the-line entity in the scheduler (see comments above) */ - struct bfq_entity *next_in_service; -+ /* array of service trees, one per ioprio_class */ - struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; -+ /* last time CLASS_IDLE was served */ -+ unsigned long bfq_class_idle_last_service; -+ - }; - - /** - * struct bfq_weight_counter - counter of the number of all active entities - * with a given weight. -- * @weight: weight of the entities that this counter refers to. -- * @num_active: number of active entities with this weight. -- * @weights_node: weights tree member (see bfq_data's @queue_weights_tree -- * and @group_weights_tree). - */ - struct bfq_weight_counter { -- short int weight; -- unsigned int num_active; -+ unsigned int weight; /* weight of the entities this counter refers to */ -+ unsigned int num_active; /* nr of active entities with this weight */ -+ /* -+ * Weights tree member (see bfq_data's @queue_weights_tree and -+ * @group_weights_tree) -+ */ - struct rb_node weights_node; - }; - - /** - * struct bfq_entity - schedulable entity. -- * @rb_node: service_tree member. -- * @weight_counter: pointer to the weight counter associated with this entity. -- * @on_st: flag, true if the entity is on a tree (either the active or -- * the idle one of its service_tree). -- * @finish: B-WF2Q+ finish timestamp (aka F_i). -- * @start: B-WF2Q+ start timestamp (aka S_i). -- * @tree: tree the entity is enqueued into; %NULL if not on a tree. -- * @min_start: minimum start time of the (active) subtree rooted at -- * this entity; used for O(log N) lookups into active trees. -- * @service: service received during the last round of service. -- * @budget: budget used to calculate F_i; F_i = S_i + @budget / @weight. -- * @weight: weight of the queue -- * @parent: parent entity, for hierarchical scheduling. 
-- * @my_sched_data: for non-leaf nodes in the cgroup hierarchy, the -- * associated scheduler queue, %NULL on leaf nodes. -- * @sched_data: the scheduler queue this entity belongs to. -- * @ioprio: the ioprio in use. -- * @new_weight: when a weight change is requested, the new weight value. -- * @orig_weight: original weight, used to implement weight boosting -- * @prio_changed: flag, true when the user requested a weight, ioprio or -- * ioprio_class change. - * - * A bfq_entity is used to represent either a bfq_queue (leaf node in the - * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each -@@ -147,27 +135,52 @@ struct bfq_weight_counter { - * containing bfqd. - */ - struct bfq_entity { -- struct rb_node rb_node; -+ struct rb_node rb_node; /* service_tree member */ -+ /* pointer to the weight counter associated with this entity */ - struct bfq_weight_counter *weight_counter; - -- int on_st; -+ /* -+ * Flag, true if the entity is on a tree (either the active or -+ * the idle one of its service_tree) or is in service. 
-+ */ -+ bool on_st; - -- u64 finish; -- u64 start; -+ u64 finish; /* B-WF2Q+ finish timestamp (aka F_i) */ -+ u64 start; /* B-WF2Q+ start timestamp (aka S_i) */ - -+ /* tree the entity is enqueued into; %NULL if not on a tree */ - struct rb_root *tree; - -+ /* -+ * minimum start time of the (active) subtree rooted at this -+ * entity; used for O(log N) lookups into active trees -+ */ - u64 min_start; - -- int service, budget; -- unsigned short weight, new_weight; -- unsigned short orig_weight; -+ /* amount of service received during the last service slot */ -+ int service; -+ -+ /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ -+ int budget; -+ -+ unsigned int weight; /* weight of the queue */ -+ unsigned int new_weight; /* next weight if a change is in progress */ -+ -+ /* original weight, used to implement weight boosting */ -+ unsigned int orig_weight; - -+ /* parent entity, for hierarchical scheduling */ - struct bfq_entity *parent; - -+ /* -+ * For non-leaf nodes in the hierarchy, the associated -+ * scheduler queue, %NULL on leaf nodes. -+ */ - struct bfq_sched_data *my_sched_data; -+ /* the scheduler queue this entity belongs to */ - struct bfq_sched_data *sched_data; - -+ /* flag, set to request a weight, ioprio or ioprio_class change */ - int prio_changed; - }; - -@@ -175,56 +188,6 @@ struct bfq_group; - - /** - * struct bfq_queue - leaf schedulable entity. -- * @ref: reference counter. -- * @bfqd: parent bfq_data. -- * @new_ioprio: when an ioprio change is requested, the new ioprio value. -- * @ioprio_class: the ioprio_class in use. -- * @new_ioprio_class: when an ioprio_class change is requested, the new -- * ioprio_class value. -- * @new_bfqq: shared bfq_queue if queue is cooperating with -- * one or more other queues. -- * @pos_node: request-position tree member (see bfq_group's @rq_pos_tree). -- * @pos_root: request-position tree root (see bfq_group's @rq_pos_tree). -- * @sort_list: sorted list of pending requests. 
-- * @next_rq: if fifo isn't expired, next request to serve. -- * @queued: nr of requests queued in @sort_list. -- * @allocated: currently allocated requests. -- * @meta_pending: pending metadata requests. -- * @fifo: fifo list of requests in sort_list. -- * @entity: entity representing this queue in the scheduler. -- * @max_budget: maximum budget allowed from the feedback mechanism. -- * @budget_timeout: budget expiration (in jiffies). -- * @dispatched: number of requests on the dispatch list or inside driver. -- * @flags: status flags. -- * @bfqq_list: node for active/idle bfqq list inside our bfqd. -- * @burst_list_node: node for the device's burst list. -- * @seek_samples: number of seeks sampled -- * @seek_total: sum of the distances of the seeks sampled -- * @seek_mean: mean seek distance -- * @last_request_pos: position of the last request enqueued -- * @requests_within_timer: number of consecutive pairs of request completion -- * and arrival, such that the queue becomes idle -- * after the completion, but the next request arrives -- * within an idle time slice; used only if the queue's -- * IO_bound has been cleared. -- * @pid: pid of the process owning the queue, used for logging purposes. 
-- * @last_wr_start_finish: start time of the current weight-raising period if -- * the @bfq-queue is being weight-raised, otherwise -- * finish time of the last weight-raising period -- * @wr_cur_max_time: current max raising time for this queue -- * @soft_rt_next_start: minimum time instant such that, only if a new -- * request is enqueued after this time instant in an -- * idle @bfq_queue with no outstanding requests, then -- * the task associated with the queue it is deemed as -- * soft real-time (see the comments to the function -- * bfq_bfqq_softrt_next_start()) -- * @last_idle_bklogged: time of the last transition of the @bfq_queue from -- * idle to backlogged -- * @service_from_backlogged: cumulative service received from the @bfq_queue -- * since the last transition from idle to -- * backlogged -- * @bic: pointer to the bfq_io_cq owning the bfq_queue, set to %NULL if the -- * queue is shared - * - * A bfq_queue is a leaf request queue; it can be associated with an - * io_context or more, if it is async or shared between cooperating -@@ -235,117 +198,175 @@ struct bfq_group; - * All the fields are protected by the queue lock of the containing bfqd. - */ - struct bfq_queue { -- atomic_t ref; -+ /* reference counter */ -+ int ref; -+ /* parent bfq_data */ - struct bfq_data *bfqd; - -- unsigned short ioprio, new_ioprio; -- unsigned short ioprio_class, new_ioprio_class; -+ /* current ioprio and ioprio class */ -+ unsigned short ioprio, ioprio_class; -+ /* next ioprio and ioprio class if a change is in progress */ -+ unsigned short new_ioprio, new_ioprio_class; - -- /* fields for cooperating queues handling */ -+ /* -+ * Shared bfq_queue if queue is cooperating with one or more -+ * other queues. 
-+ */ - struct bfq_queue *new_bfqq; -+ /* request-position tree member (see bfq_group's @rq_pos_tree) */ - struct rb_node pos_node; -+ /* request-position tree root (see bfq_group's @rq_pos_tree) */ - struct rb_root *pos_root; - -+ /* sorted list of pending requests */ - struct rb_root sort_list; -+ /* if fifo isn't expired, next request to serve */ - struct request *next_rq; -+ /* number of sync and async requests queued */ - int queued[2]; -+ /* number of sync and async requests currently allocated */ - int allocated[2]; -+ /* number of pending metadata requests */ - int meta_pending; -+ /* fifo list of requests in sort_list */ - struct list_head fifo; - -+ /* entity representing this queue in the scheduler */ - struct bfq_entity entity; - -+ /* maximum budget allowed from the feedback mechanism */ - int max_budget; -+ /* budget expiration (in jiffies) */ - unsigned long budget_timeout; - -+ /* number of requests on the dispatch list or inside driver */ - int dispatched; - -- unsigned int flags; -+ unsigned int flags; /* status flags.*/ - -+ /* node for active/idle bfqq list inside parent bfqd */ - struct list_head bfqq_list; - -+ /* bit vector: a 1 for each seeky requests in history */ -+ u32 seek_history; -+ -+ /* node for the device's burst list */ - struct hlist_node burst_list_node; - -- unsigned int seek_samples; -- u64 seek_total; -- sector_t seek_mean; -+ /* position of the last request enqueued */ - sector_t last_request_pos; - -+ /* Number of consecutive pairs of request completion and -+ * arrival, such that the queue becomes idle after the -+ * completion, but the next request arrives within an idle -+ * time slice; used only if the queue's IO_bound flag has been -+ * cleared. -+ */ - unsigned int requests_within_timer; - -+ /* pid of the process owning the queue, used for logging purposes */ - pid_t pid; -+ -+ /* -+ * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL -+ * if the queue is shared. 
-+ */ - struct bfq_io_cq *bic; - -- /* weight-raising fields */ -+ /* current maximum weight-raising time for this queue */ - unsigned long wr_cur_max_time; -+ /* -+ * Minimum time instant such that, only if a new request is -+ * enqueued after this time instant in an idle @bfq_queue with -+ * no outstanding requests, then the task associated with the -+ * queue it is deemed as soft real-time (see the comments on -+ * the function bfq_bfqq_softrt_next_start()) -+ */ - unsigned long soft_rt_next_start; -+ /* -+ * Start time of the current weight-raising period if -+ * the @bfq-queue is being weight-raised, otherwise -+ * finish time of the last weight-raising period. -+ */ - unsigned long last_wr_start_finish; -+ /* factor by which the weight of this queue is multiplied */ - unsigned int wr_coeff; -+ /* -+ * Time of the last transition of the @bfq_queue from idle to -+ * backlogged. -+ */ - unsigned long last_idle_bklogged; -+ /* -+ * Cumulative service received from the @bfq_queue since the -+ * last transition from idle to backlogged. -+ */ - unsigned long service_from_backlogged; -+ /* -+ * Value of wr start time when switching to soft rt -+ */ -+ unsigned long wr_start_at_switch_to_srt; -+ -+ unsigned long split_time; /* time of last split */ - }; - - /** - * struct bfq_ttime - per process thinktime stats. -- * @ttime_total: total process thinktime -- * @ttime_samples: number of thinktime samples -- * @ttime_mean: average process thinktime - */ - struct bfq_ttime { -- unsigned long last_end_request; -+ u64 last_end_request; /* completion time of last request */ -+ -+ u64 ttime_total; /* total process thinktime */ -+ unsigned long ttime_samples; /* number of thinktime samples */ -+ u64 ttime_mean; /* average process thinktime */ - -- unsigned long ttime_total; -- unsigned long ttime_samples; -- unsigned long ttime_mean; - }; - - /** - * struct bfq_io_cq - per (request_queue, io_context) structure. 
-- * @icq: associated io_cq structure -- * @bfqq: array of two process queues, the sync and the async -- * @ttime: associated @bfq_ttime struct -- * @ioprio: per (request_queue, blkcg) ioprio. -- * @blkcg_id: id of the blkcg the related io_cq belongs to. -- * @wr_time_left: snapshot of the time left before weight raising ends -- * for the sync queue associated to this process; this -- * snapshot is taken to remember this value while the weight -- * raising is suspended because the queue is merged with a -- * shared queue, and is used to set @raising_cur_max_time -- * when the queue is split from the shared queue and its -- * weight is raised again -- * @saved_idle_window: same purpose as the previous field for the idle -- * window -- * @saved_IO_bound: same purpose as the previous two fields for the I/O -- * bound classification of a queue -- * @saved_in_large_burst: same purpose as the previous fields for the -- * value of the field keeping the queue's belonging -- * to a large burst -- * @was_in_burst_list: true if the queue belonged to a burst list -- * before its merge with another cooperating queue -- * @cooperations: counter of consecutive successful queue merges underwent -- * by any of the process' @bfq_queues -- * @failed_cooperations: counter of consecutive failed queue merges of any -- * of the process' @bfq_queues - */ - struct bfq_io_cq { -+ /* associated io_cq structure */ - struct io_cq icq; /* must be the first member */ -+ /* array of two process queues, the sync and the async */ - struct bfq_queue *bfqq[2]; -+ /* associated @bfq_ttime struct */ - struct bfq_ttime ttime; -+ /* per (request_queue, blkcg) ioprio */ - int ioprio; -- - #ifdef CONFIG_BFQ_GROUP_IOSCHED -- uint64_t blkcg_id; /* the current blkcg ID */ -+ uint64_t blkcg_serial_nr; /* the current blkcg serial */ - #endif - -- unsigned int wr_time_left; -+ /* -+ * Snapshot of the idle window before merging; taken to -+ * remember this value while the queue is merged, so as to be -+ * able to 
restore it in case of split. -+ */ - bool saved_idle_window; -+ /* -+ * Same purpose as the previous two fields for the I/O bound -+ * classification of a queue. -+ */ - bool saved_IO_bound; - -+ /* -+ * Same purpose as the previous fields for the value of the -+ * field keeping the queue's belonging to a large burst -+ */ - bool saved_in_large_burst; -+ /* -+ * True if the queue belonged to a burst list before its merge -+ * with another cooperating queue. -+ */ - bool was_in_burst_list; - -- unsigned int cooperations; -- unsigned int failed_cooperations; -+ /* -+ * Similar to previous fields: save wr information. -+ */ -+ unsigned long saved_wr_coeff; -+ unsigned long saved_last_wr_start_finish; -+ unsigned long saved_wr_start_at_switch_to_srt; -+ unsigned int saved_wr_cur_max_time; - }; - - enum bfq_device_speed { -@@ -354,224 +375,232 @@ enum bfq_device_speed { - }; - - /** -- * struct bfq_data - per device data structure. -- * @queue: request queue for the managed device. -- * @root_group: root bfq_group for the device. -- * @active_numerous_groups: number of bfq_groups containing more than one -- * active @bfq_entity. -- * @queue_weights_tree: rbtree of weight counters of @bfq_queues, sorted by -- * weight. Used to keep track of whether all @bfq_queues -- * have the same weight. The tree contains one counter -- * for each distinct weight associated to some active -- * and not weight-raised @bfq_queue (see the comments to -- * the functions bfq_weights_tree_[add|remove] for -- * further details). -- * @group_weights_tree: rbtree of non-queue @bfq_entity weight counters, sorted -- * by weight. Used to keep track of whether all -- * @bfq_groups have the same weight. The tree contains -- * one counter for each distinct weight associated to -- * some active @bfq_group (see the comments to the -- * functions bfq_weights_tree_[add|remove] for further -- * details). 
-- * @busy_queues: number of bfq_queues containing requests (including the -- * queue in service, even if it is idling). -- * @busy_in_flight_queues: number of @bfq_queues containing pending or -- * in-flight requests, plus the @bfq_queue in -- * service, even if idle but waiting for the -- * possible arrival of its next sync request. This -- * field is updated only if the device is rotational, -- * but used only if the device is also NCQ-capable. -- * The reason why the field is updated also for non- -- * NCQ-capable rotational devices is related to the -- * fact that the value of @hw_tag may be set also -- * later than when busy_in_flight_queues may need to -- * be incremented for the first time(s). Taking also -- * this possibility into account, to avoid unbalanced -- * increments/decrements, would imply more overhead -- * than just updating busy_in_flight_queues -- * regardless of the value of @hw_tag. -- * @const_seeky_busy_in_flight_queues: number of constantly-seeky @bfq_queues -- * (that is, seeky queues that expired -- * for budget timeout at least once) -- * containing pending or in-flight -- * requests, including the in-service -- * @bfq_queue if constantly seeky. This -- * field is updated only if the device -- * is rotational, but used only if the -- * device is also NCQ-capable (see the -- * comments to @busy_in_flight_queues). -- * @wr_busy_queues: number of weight-raised busy @bfq_queues. -- * @queued: number of queued requests. -- * @rq_in_driver: number of requests dispatched and waiting for completion. -- * @sync_flight: number of sync requests in the driver. -- * @max_rq_in_driver: max number of reqs in driver in the last -- * @hw_tag_samples completed requests. -- * @hw_tag_samples: nr of samples used to calculate hw_tag. -- * @hw_tag: flag set to one if the driver is showing a queueing behavior. -- * @budgets_assigned: number of budgets assigned. 
-- * @idle_slice_timer: timer set when idling for the next sequential request -- * from the queue in service. -- * @unplug_work: delayed work to restart dispatching on the request queue. -- * @in_service_queue: bfq_queue in service. -- * @in_service_bic: bfq_io_cq (bic) associated with the @in_service_queue. -- * @last_position: on-disk position of the last served request. -- * @last_budget_start: beginning of the last budget. -- * @last_idling_start: beginning of the last idle slice. -- * @peak_rate: peak transfer rate observed for a budget. -- * @peak_rate_samples: number of samples used to calculate @peak_rate. -- * @bfq_max_budget: maximum budget allotted to a bfq_queue before -- * rescheduling. -- * @active_list: list of all the bfq_queues active on the device. -- * @idle_list: list of all the bfq_queues idle on the device. -- * @bfq_fifo_expire: timeout for async/sync requests; when it expires -- * requests are served in fifo order. -- * @bfq_back_penalty: weight of backward seeks wrt forward ones. -- * @bfq_back_max: maximum allowed backward seek. -- * @bfq_slice_idle: maximum idling time. -- * @bfq_user_max_budget: user-configured max budget value -- * (0 for auto-tuning). -- * @bfq_max_budget_async_rq: maximum budget (in nr of requests) allotted to -- * async queues. -- * @bfq_timeout: timeout for bfq_queues to consume their budget; used to -- * to prevent seeky queues to impose long latencies to well -- * behaved ones (this also implies that seeky queues cannot -- * receive guarantees in the service domain; after a timeout -- * they are charged for the whole allocated budget, to try -- * to preserve a behavior reasonably fair among them, but -- * without service-domain guarantees). -- * @bfq_coop_thresh: number of queue merges after which a @bfq_queue is -- * no more granted any weight-raising. 
-- * @bfq_failed_cooperations: number of consecutive failed cooperation -- * chances after which weight-raising is restored -- * to a queue subject to more than bfq_coop_thresh -- * queue merges. -- * @bfq_requests_within_timer: number of consecutive requests that must be -- * issued within the idle time slice to set -- * again idling to a queue which was marked as -- * non-I/O-bound (see the definition of the -- * IO_bound flag for further details). -- * @last_ins_in_burst: last time at which a queue entered the current -- * burst of queues being activated shortly after -- * each other; for more details about this and the -- * following parameters related to a burst of -- * activations, see the comments to the function -- * @bfq_handle_burst. -- * @bfq_burst_interval: reference time interval used to decide whether a -- * queue has been activated shortly after -- * @last_ins_in_burst. -- * @burst_size: number of queues in the current burst of queue activations. -- * @bfq_large_burst_thresh: maximum burst size above which the current -- * queue-activation burst is deemed as 'large'. -- * @large_burst: true if a large queue-activation burst is in progress. -- * @burst_list: head of the burst list (as for the above fields, more details -- * in the comments to the function bfq_handle_burst). -- * @low_latency: if set to true, low-latency heuristics are enabled. -- * @bfq_wr_coeff: maximum factor by which the weight of a weight-raised -- * queue is multiplied. -- * @bfq_wr_max_time: maximum duration of a weight-raising period (jiffies). -- * @bfq_wr_rt_max_time: maximum duration for soft real-time processes. -- * @bfq_wr_min_idle_time: minimum idle period after which weight-raising -- * may be reactivated for a queue (in jiffies). -- * @bfq_wr_min_inter_arr_async: minimum period between request arrivals -- * after which weight-raising may be -- * reactivated for an already busy queue -- * (in jiffies). 
-- * @bfq_wr_max_softrt_rate: max service-rate for a soft real-time queue, -- * sectors per seconds. -- * @RT_prod: cached value of the product R*T used for computing the maximum -- * duration of the weight raising automatically. -- * @device_speed: device-speed class for the low-latency heuristic. -- * @oom_bfqq: fallback dummy bfqq for extreme OOM conditions. -+ * struct bfq_data - per-device data structure. - * - * All the fields are protected by the @queue lock. - */ - struct bfq_data { -+ /* request queue for the device */ - struct request_queue *queue; - -+ /* root bfq_group for the device */ - struct bfq_group *root_group; - --#ifdef CONFIG_BFQ_GROUP_IOSCHED -- int active_numerous_groups; --#endif -- -+ /* -+ * rbtree of weight counters of @bfq_queues, sorted by -+ * weight. Used to keep track of whether all @bfq_queues have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active and not -+ * weight-raised @bfq_queue (see the comments to the functions -+ * bfq_weights_tree_[add|remove] for further details). -+ */ - struct rb_root queue_weights_tree; -+ /* -+ * rbtree of non-queue @bfq_entity weight counters, sorted by -+ * weight. Used to keep track of whether all @bfq_groups have -+ * the same weight. The tree contains one counter for each -+ * distinct weight associated to some active @bfq_group (see -+ * the comments to the functions bfq_weights_tree_[add|remove] -+ * for further details). -+ */ - struct rb_root group_weights_tree; - -+ /* -+ * Number of bfq_queues containing requests (including the -+ * queue in service, even if it is idling). 
-+ */ - int busy_queues; -- int busy_in_flight_queues; -- int const_seeky_busy_in_flight_queues; -+ /* number of weight-raised busy @bfq_queues */ - int wr_busy_queues; -+ /* number of queued requests */ - int queued; -+ /* number of requests dispatched and waiting for completion */ - int rq_in_driver; -- int sync_flight; - -+ /* -+ * Maximum number of requests in driver in the last -+ * @hw_tag_samples completed requests. -+ */ - int max_rq_in_driver; -+ /* number of samples used to calculate hw_tag */ - int hw_tag_samples; -+ /* flag set to one if the driver is showing a queueing behavior */ - int hw_tag; - -+ /* number of budgets assigned */ - int budgets_assigned; - -- struct timer_list idle_slice_timer; -+ /* -+ * Timer set when idling (waiting) for the next request from -+ * the queue in service. -+ */ -+ struct hrtimer idle_slice_timer; -+ /* delayed work to restart dispatching on the request queue */ - struct work_struct unplug_work; - -+ /* bfq_queue in service */ - struct bfq_queue *in_service_queue; -+ /* bfq_io_cq (bic) associated with the @in_service_queue */ - struct bfq_io_cq *in_service_bic; - -+ /* on-disk position of the last served request */ - sector_t last_position; - -+ /* time of last request completion (ns) */ -+ u64 last_completion; -+ -+ /* time of first rq dispatch in current observation interval (ns) */ -+ u64 first_dispatch; -+ /* time of last rq dispatch in current observation interval (ns) */ -+ u64 last_dispatch; -+ -+ /* beginning of the last budget */ - ktime_t last_budget_start; -+ /* beginning of the last idle slice */ - ktime_t last_idling_start; -+ -+ /* number of samples in current observation interval */ - int peak_rate_samples; -- u64 peak_rate; -+ /* num of samples of seq dispatches in current observation interval */ -+ u32 sequential_samples; -+ /* total num of sectors transferred in current observation interval */ -+ u64 tot_sectors_dispatched; -+ /* max rq size seen during current observation interval (sectors) */ -+ u32 
last_rq_max_size; -+ /* time elapsed from first dispatch in current observ. interval (us) */ -+ u64 delta_from_first; -+ /* current estimate of device peak rate */ -+ u32 peak_rate; -+ -+ /* maximum budget allotted to a bfq_queue before rescheduling */ - int bfq_max_budget; - -+ /* list of all the bfq_queues active on the device */ - struct list_head active_list; -+ /* list of all the bfq_queues idle on the device */ - struct list_head idle_list; - -- unsigned int bfq_fifo_expire[2]; -+ /* -+ * Timeout for async/sync requests; when it fires, requests -+ * are served in fifo order. -+ */ -+ u64 bfq_fifo_expire[2]; -+ /* weight of backward seeks wrt forward ones */ - unsigned int bfq_back_penalty; -+ /* maximum allowed backward seek */ - unsigned int bfq_back_max; -- unsigned int bfq_slice_idle; -- u64 bfq_class_idle_last_service; -+ /* maximum idling time */ -+ u32 bfq_slice_idle; - -+ /* user-configured max budget value (0 for auto-tuning) */ - int bfq_user_max_budget; -- int bfq_max_budget_async_rq; -- unsigned int bfq_timeout[2]; -- -- unsigned int bfq_coop_thresh; -- unsigned int bfq_failed_cooperations; -+ /* -+ * Timeout for bfq_queues to consume their budget; used to -+ * prevent seeky queues from imposing long latencies to -+ * sequential or quasi-sequential ones (this also implies that -+ * seeky queues cannot receive guarantees in the service -+ * domain; after a timeout they are charged for the time they -+ * have been in service, to preserve fairness among them, but -+ * without service-domain guarantees). -+ */ -+ unsigned int bfq_timeout; -+ -+ /* -+ * Number of consecutive requests that must be issued within -+ * the idle time slice to set again idling to a queue which -+ * was marked as non-I/O-bound (see the definition of the -+ * IO_bound flag for further details). -+ */ - unsigned int bfq_requests_within_timer; - -+ /* -+ * Force device idling whenever needed to provide accurate -+ * service guarantees, without caring about throughput -+ * issues. 
CAVEAT: this may even increase latencies, in case -+ * of useless idling for processes that did stop doing I/O. -+ */ -+ bool strict_guarantees; -+ -+ /* -+ * Last time at which a queue entered the current burst of -+ * queues being activated shortly after each other; for more -+ * details about this and the following parameters related to -+ * a burst of activations, see the comments on the function -+ * bfq_handle_burst. -+ */ - unsigned long last_ins_in_burst; -+ /* -+ * Reference time interval used to decide whether a queue has -+ * been activated shortly after @last_ins_in_burst. -+ */ - unsigned long bfq_burst_interval; -+ /* number of queues in the current burst of queue activations */ - int burst_size; -+ -+ /* common parent entity for the queues in the burst */ -+ struct bfq_entity *burst_parent_entity; -+ /* Maximum burst size above which the current queue-activation -+ * burst is deemed as 'large'. -+ */ - unsigned long bfq_large_burst_thresh; -+ /* true if a large queue-activation burst is in progress */ - bool large_burst; -+ /* -+ * Head of the burst list (as for the above fields, more -+ * details in the comments on the function bfq_handle_burst). -+ */ - struct hlist_head burst_list; - -+ /* if set to true, low-latency heuristics are enabled */ - bool low_latency; -- -- /* parameters of the low_latency heuristics */ -+ /* -+ * Maximum factor by which the weight of a weight-raised queue -+ * is multiplied. -+ */ - unsigned int bfq_wr_coeff; -+ /* maximum duration of a weight-raising period (jiffies) */ - unsigned int bfq_wr_max_time; -+ -+ /* Maximum weight-raising duration for soft real-time processes */ - unsigned int bfq_wr_rt_max_time; -+ /* -+ * Minimum idle period after which weight-raising may be -+ * reactivated for a queue (in jiffies). -+ */ - unsigned int bfq_wr_min_idle_time; -+ /* -+ * Minimum period between request arrivals after which -+ * weight-raising may be reactivated for an already busy async -+ * queue (in jiffies). 
-+ */ - unsigned long bfq_wr_min_inter_arr_async; -+ -+ /* Max service-rate for a soft real-time queue, in sectors/sec */ - unsigned int bfq_wr_max_softrt_rate; -+ /* -+ * Cached value of the product R*T, used for computing the -+ * maximum duration of weight raising automatically. -+ */ - u64 RT_prod; -+ /* device-speed class for the low-latency heuristic */ - enum bfq_device_speed device_speed; - -+ /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; - }; - - enum bfqq_state_flags { -- BFQ_BFQQ_FLAG_busy = 0, /* has requests or is in service */ -+ BFQ_BFQQ_FLAG_just_created = 0, /* queue just allocated */ -+ BFQ_BFQQ_FLAG_busy, /* has requests or is in service */ - BFQ_BFQQ_FLAG_wait_request, /* waiting for a request */ -+ BFQ_BFQQ_FLAG_non_blocking_wait_rq, /* -+ * waiting for a request -+ * without idling the device -+ */ - BFQ_BFQQ_FLAG_must_alloc, /* must be allowed rq alloc */ - BFQ_BFQQ_FLAG_fifo_expire, /* FIFO checked in this slice */ - BFQ_BFQQ_FLAG_idle_window, /* slice idling enabled */ - BFQ_BFQQ_FLAG_sync, /* synchronous queue */ -- BFQ_BFQQ_FLAG_budget_new, /* no completion with this budget */ - BFQ_BFQQ_FLAG_IO_bound, /* - * bfqq has timed-out at least once - * having consumed at most 2/10 of -@@ -581,17 +610,12 @@ enum bfqq_state_flags { - * bfqq activated in a large burst, - * see comments to bfq_handle_burst. 
- */ -- BFQ_BFQQ_FLAG_constantly_seeky, /* -- * bfqq has proved to be slow and -- * seeky until budget timeout -- */ - BFQ_BFQQ_FLAG_softrt_update, /* - * may need softrt-next-start - * update - */ - BFQ_BFQQ_FLAG_coop, /* bfqq is shared */ -- BFQ_BFQQ_FLAG_split_coop, /* shared bfqq will be split */ -- BFQ_BFQQ_FLAG_just_split, /* queue has just been split */ -+ BFQ_BFQQ_FLAG_split_coop /* shared bfqq will be split */ - }; - - #define BFQ_BFQQ_FNS(name) \ -@@ -608,28 +632,94 @@ static int bfq_bfqq_##name(const struct bfq_queue *bfqq) \ - return ((bfqq)->flags & (1 << BFQ_BFQQ_FLAG_##name)) != 0; \ - } - -+BFQ_BFQQ_FNS(just_created); - BFQ_BFQQ_FNS(busy); - BFQ_BFQQ_FNS(wait_request); -+BFQ_BFQQ_FNS(non_blocking_wait_rq); - BFQ_BFQQ_FNS(must_alloc); - BFQ_BFQQ_FNS(fifo_expire); - BFQ_BFQQ_FNS(idle_window); - BFQ_BFQQ_FNS(sync); --BFQ_BFQQ_FNS(budget_new); - BFQ_BFQQ_FNS(IO_bound); - BFQ_BFQQ_FNS(in_large_burst); --BFQ_BFQQ_FNS(constantly_seeky); - BFQ_BFQQ_FNS(coop); - BFQ_BFQQ_FNS(split_coop); --BFQ_BFQQ_FNS(just_split); - BFQ_BFQQ_FNS(softrt_update); - #undef BFQ_BFQQ_FNS - - /* Logging facilities. */ --#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -- blk_add_trace_msg((bfqd)->queue, "bfq%d " fmt, (bfqq)->pid, ##args) -+#ifdef CONFIG_BFQ_REDIRECT_TO_CONSOLE -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("bfq%d%c %s " fmt "\n", \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) 
do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ pr_crit("%s " fmt "\n", __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ pr_crit("bfq%d%c " fmt "\n", (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+#define bfq_log(bfqd, fmt, args...) \ -+ pr_crit("bfq " fmt "\n", ##args) -+ -+#else /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+static struct bfq_group *bfqq_group(struct bfq_queue *bfqq); -+static struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ assert_spin_locked((bfqd)->queue->queue_lock); \ -+ blkg_path(bfqg_to_blkg(bfqq_group(bfqq)), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c %s " fmt, \ -+ (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ __pbuf, ##args); \ -+} while (0) -+ -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ -+ char __pbuf[128]; \ -+ \ -+ blkg_path(bfqg_to_blkg(bfqg), __pbuf, sizeof(__pbuf)); \ -+ blk_add_trace_msg((bfqd)->queue, "%s " fmt, __pbuf, ##args); \ -+} while (0) -+ -+#else /* CONFIG_BFQ_GROUP_IOSCHED */ -+ -+#define bfq_log_bfqq(bfqd, bfqq, fmt, args...) \ -+ blk_add_trace_msg((bfqd)->queue, "bfq%d%c " fmt, (bfqq)->pid, \ -+ bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ -+ ##args) -+#define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) -+ -+#endif /* CONFIG_BFQ_GROUP_IOSCHED */ - - #define bfq_log(bfqd, fmt, args...) \ - blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) -+#endif /* CONFIG_BFQ_REDIRECT_TO_CONSOLE */ - - /* Expiration reasons. 
*/ - enum bfqq_expiration { -@@ -640,15 +730,12 @@ enum bfqq_expiration { - BFQ_BFQQ_BUDGET_TIMEOUT, /* budget took too long to be used */ - BFQ_BFQQ_BUDGET_EXHAUSTED, /* budget consumed */ - BFQ_BFQQ_NO_MORE_REQUESTS, /* the queue has no more requests */ -+ BFQ_BFQQ_PREEMPTED /* preemption in progress */ - }; - --#ifdef CONFIG_BFQ_GROUP_IOSCHED - - struct bfqg_stats { -- /* total bytes transferred */ -- struct blkg_rwstat service_bytes; -- /* total IOs serviced, post merge */ -- struct blkg_rwstat serviced; -+#ifdef CONFIG_BFQ_GROUP_IOSCHED - /* number of ios merged */ - struct blkg_rwstat merged; - /* total time spent on device in ns, may not be accurate w/ queueing */ -@@ -657,12 +744,8 @@ struct bfqg_stats { - struct blkg_rwstat wait_time; - /* number of IOs queued up */ - struct blkg_rwstat queued; -- /* total sectors transferred */ -- struct blkg_stat sectors; - /* total disk time and nr sectors dispatched by this group */ - struct blkg_stat time; -- /* time not charged to this cgroup */ -- struct blkg_stat unaccounted_time; - /* sum of number of ios queued across all samples */ - struct blkg_stat avg_queue_size_sum; - /* count of samples taken for average */ -@@ -680,8 +763,10 @@ struct bfqg_stats { - uint64_t start_idle_time; - uint64_t start_empty_time; - uint16_t flags; -+#endif - }; - -+#ifdef CONFIG_BFQ_GROUP_IOSCHED - /* - * struct bfq_group_data - per-blkcg storage for the blkio subsystem. - * -@@ -692,7 +777,7 @@ struct bfq_group_data { - /* must be the first member */ - struct blkcg_policy_data pd; - -- unsigned short weight; -+ unsigned int weight; - }; - - /** -@@ -712,7 +797,7 @@ struct bfq_group_data { - * unused for the root group. Used to know whether there - * are groups with more than one active @bfq_entity - * (see the comments to the function -- * bfq_bfqq_must_not_expire()). -+ * bfq_bfqq_may_idle()). 
- * @rq_pos_tree: rbtree sorted by next_request position, used when - * determining if two or more queues have interleaving - * requests (see bfq_find_close_cooperator()). -@@ -745,7 +830,6 @@ struct bfq_group { - struct rb_root rq_pos_tree; - - struct bfqg_stats stats; -- struct bfqg_stats dead_stats; /* stats pushed from dead children */ - }; - - #else -@@ -761,17 +845,38 @@ struct bfq_group { - - static struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); - -+static unsigned int bfq_class_idx(struct bfq_entity *entity) -+{ -+ struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -+ -+ return bfqq ? bfqq->ioprio_class - 1 : -+ BFQ_DEFAULT_GRP_CLASS - 1; -+} -+ - static struct bfq_service_tree * - bfq_entity_service_tree(struct bfq_entity *entity) - { - struct bfq_sched_data *sched_data = entity->sched_data; - struct bfq_queue *bfqq = bfq_entity_to_bfqq(entity); -- unsigned int idx = bfqq ? bfqq->ioprio_class - 1 : -- BFQ_DEFAULT_GRP_CLASS; -+ unsigned int idx = bfq_class_idx(entity); - - BUG_ON(idx >= BFQ_IOPRIO_CLASSES); - BUG_ON(sched_data == NULL); - -+ if (bfqq) -+ bfq_log_bfqq(bfqq->bfqd, bfqq, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED -+ else { -+ struct bfq_group *bfqg = -+ container_of(entity, struct bfq_group, entity); -+ -+ bfq_log_bfqg((struct bfq_data *)bfqg->bfqd, bfqg, -+ "entity_service_tree %p %d", -+ sched_data->service_tree + idx, idx); -+ } -+#endif - return sched_data->service_tree + idx; - } - -@@ -791,47 +896,6 @@ static struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic) - return bic->icq.q->elevator->elevator_data; - } - --/** -- * bfq_get_bfqd_locked - get a lock to a bfqd using a RCU protected pointer. -- * @ptr: a pointer to a bfqd. -- * @flags: storage for the flags to be saved. 
-- * -- * This function allows bfqg->bfqd to be protected by the -- * queue lock of the bfqd they reference; the pointer is dereferenced -- * under RCU, so the storage for bfqd is assured to be safe as long -- * as the RCU read side critical section does not end. After the -- * bfqd->queue->queue_lock is taken the pointer is rechecked, to be -- * sure that no other writer accessed it. If we raced with a writer, -- * the function returns NULL, with the queue unlocked, otherwise it -- * returns the dereferenced pointer, with the queue locked. -- */ --static struct bfq_data *bfq_get_bfqd_locked(void **ptr, unsigned long *flags) --{ -- struct bfq_data *bfqd; -- -- rcu_read_lock(); -- bfqd = rcu_dereference(*(struct bfq_data **)ptr); -- -- if (bfqd != NULL) { -- spin_lock_irqsave(bfqd->queue->queue_lock, *flags); -- if (ptr == NULL) -- printk(KERN_CRIT "get_bfqd_locked pointer NULL\n"); -- else if (*ptr == bfqd) -- goto out; -- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); -- } -- -- bfqd = NULL; --out: -- rcu_read_unlock(); -- return bfqd; --} -- --static void bfq_put_bfqd_unlock(struct bfq_data *bfqd, unsigned long *flags) --{ -- spin_unlock_irqrestore(bfqd->queue->queue_lock, *flags); --} -- - #ifdef CONFIG_BFQ_GROUP_IOSCHED - - static struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq) -@@ -857,11 +921,13 @@ static void bfq_check_ioprio_change(struct bfq_io_cq *bic, struct bio *bio); - static void bfq_put_queue(struct bfq_queue *bfqq); - static void bfq_dispatch_insert(struct request_queue *q, struct request *rq); - static struct bfq_queue *bfq_get_queue(struct bfq_data *bfqd, -- struct bio *bio, int is_sync, -- struct bfq_io_cq *bic, gfp_t gfp_mask); -+ struct bio *bio, bool is_sync, -+ struct bfq_io_cq *bic); - static void bfq_end_wr_async_queues(struct bfq_data *bfqd, - struct bfq_group *bfqg); -+#ifdef CONFIG_BFQ_GROUP_IOSCHED - static void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); -+#endif - static void 
bfq_exit_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); - - #endif /* _BFQ_H */ --- -2.10.0 - diff --git a/patches/enable_additional_cpu_optimizations_for_gcc_v4.9+_kernel_v3.15+.patch b/patches/enable_additional_cpu_optimizations_for_gcc_v4.9+_kernel_v3.15+.patch deleted file mode 100644 index 76cbd9d..0000000 --- a/patches/enable_additional_cpu_optimizations_for_gcc_v4.9+_kernel_v3.15+.patch +++ /dev/null @@ -1,533 +0,0 @@ -WARNING -This patch works with gcc versions 4.9+ and with kernel version 3.15+ and should -NOT be applied when compiling on older versions of gcc due to key name changes -of the march flags introduced with the version 4.9 release of gcc.[1] - -Use the older version of this patch hosted on the same github for older -versions of gcc. - -FEATURES -This patch adds additional CPU options to the Linux kernel accessible under: - Processor type and features ---> - Processor family ---> - -The expanded microarchitectures include: -* AMD Improved K8-family -* AMD K10-family -* AMD Family 10h (Barcelona) -* AMD Family 14h (Bobcat) -* AMD Family 16h (Jaguar) -* AMD Family 15h (Bulldozer) -* AMD Family 15h (Piledriver) -* AMD Family 15h (Steamroller) -* AMD Family 15h (Excavator) -* AMD Family 17h (Zen) -* Intel Silvermont low-power processors -* Intel 1st Gen Core i3/i5/i7 (Nehalem) -* Intel 1.5 Gen Core i3/i5/i7 (Westmere) -* Intel 2nd Gen Core i3/i5/i7 (Sandybridge) -* Intel 3rd Gen Core i3/i5/i7 (Ivybridge) -* Intel 4th Gen Core i3/i5/i7 (Haswell) -* Intel 5th Gen Core i3/i5/i7 (Broadwell) -* Intel 6th Gen Core i3/i5.i7 (Skylake) - -It also offers to compile passing the 'native' option which, "selects the CPU -to generate code for at compilation time by determining the processor type of -the compiling machine. 
Using -march=native enables all instruction subsets -supported by the local machine and will produce code optimized for the local -machine under the constraints of the selected instruction set."[3] - -MINOR NOTES -This patch also changes 'atom' to 'bonnell' in accordance with the gcc v4.9 -changes. Note that upstream is using the deprecated 'match=atom' flags when I -believe it should use the newer 'march=bonnell' flag for atom processors.[2] - -It is not recommended to compile on Atom-CPUs with the 'native' option.[4] The -recommendation is use to the 'atom' option instead. - -BENEFITS -Small but real speed increases are measurable using a make endpoint comparing -a generic kernel to one built with one of the respective microarchs. - -See the following experimental evidence supporting this statement: -https://github.com/graysky2/kernel_gcc_patch - -REQUIREMENTS -linux version >=3.15 -gcc version >=4.9 - -ACKNOWLEDGMENTS -This patch builds on the seminal work by Jeroen.[5] - -REFERENCES -1. https://gcc.gnu.org/gcc-4.9/changes.html -2. https://bugzilla.kernel.org/show_bug.cgi?id=77461 -3. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html -4. https://github.com/graysky2/kernel_gcc_patch/issues/15 -5. 
http://www.linuxforge.net/docs/linux/linux-gcc.php - ---- a/arch/x86/include/asm/module.h 2016-12-11 14:17:54.000000000 -0500 -+++ b/arch/x86/include/asm/module.h 2017-01-06 20:44:36.602227264 -0500 -@@ -15,6 +15,24 @@ - #define MODULE_PROC_FAMILY "586MMX " - #elif defined CONFIG_MCORE2 - #define MODULE_PROC_FAMILY "CORE2 " -+#elif defined CONFIG_MNATIVE -+#define MODULE_PROC_FAMILY "NATIVE " -+#elif defined CONFIG_MNEHALEM -+#define MODULE_PROC_FAMILY "NEHALEM " -+#elif defined CONFIG_MWESTMERE -+#define MODULE_PROC_FAMILY "WESTMERE " -+#elif defined CONFIG_MSILVERMONT -+#define MODULE_PROC_FAMILY "SILVERMONT " -+#elif defined CONFIG_MSANDYBRIDGE -+#define MODULE_PROC_FAMILY "SANDYBRIDGE " -+#elif defined CONFIG_MIVYBRIDGE -+#define MODULE_PROC_FAMILY "IVYBRIDGE " -+#elif defined CONFIG_MHASWELL -+#define MODULE_PROC_FAMILY "HASWELL " -+#elif defined CONFIG_MBROADWELL -+#define MODULE_PROC_FAMILY "BROADWELL " -+#elif defined CONFIG_MSKYLAKE -+#define MODULE_PROC_FAMILY "SKYLAKE " - #elif defined CONFIG_MATOM - #define MODULE_PROC_FAMILY "ATOM " - #elif defined CONFIG_M686 -@@ -33,6 +51,26 @@ - #define MODULE_PROC_FAMILY "K7 " - #elif defined CONFIG_MK8 - #define MODULE_PROC_FAMILY "K8 " -+#elif defined CONFIG_MK8SSE3 -+#define MODULE_PROC_FAMILY "K8SSE3 " -+#elif defined CONFIG_MK10 -+#define MODULE_PROC_FAMILY "K10 " -+#elif defined CONFIG_MBARCELONA -+#define MODULE_PROC_FAMILY "BARCELONA " -+#elif defined CONFIG_MBOBCAT -+#define MODULE_PROC_FAMILY "BOBCAT " -+#elif defined CONFIG_MBULLDOZER -+#define MODULE_PROC_FAMILY "BULLDOZER " -+#elif defined CONFIG_MPILEDRIVER -+#define MODULE_PROC_FAMILY "PILEDRIVER " -+#elif defined CONFIG_MSTEAMROLLER -+#define MODULE_PROC_FAMILY "STEAMROLLER " -+#elif defined CONFIG_MJAGUAR -+#define MODULE_PROC_FAMILY "JAGUAR " -+#elif defined CONFIG_MEXCAVATOR -+#define MODULE_PROC_FAMILY "EXCAVATOR " -+#elif defined CONFIG_MZEN -+#define MODULE_PROC_FAMILY "ZEN " - #elif defined CONFIG_MELAN - #define MODULE_PROC_FAMILY "ELAN " - 
#elif defined CONFIG_MCRUSOE ---- a/arch/x86/Kconfig.cpu 2016-12-11 14:17:54.000000000 -0500 -+++ b/arch/x86/Kconfig.cpu 2017-01-06 20:46:14.004109597 -0500 -@@ -115,6 +115,7 @@ config MPENTIUMM - config MPENTIUM4 - bool "Pentium-4/Celeron(P4-based)/Pentium-4 M/older Xeon" - depends on X86_32 -+ select X86_P6_NOP - ---help--- - Select this for Intel Pentium 4 chips. This includes the - Pentium 4, Pentium D, P4-based Celeron and Xeon, and -@@ -147,9 +148,8 @@ config MPENTIUM4 - -Paxville - -Dempsey - -- - config MK6 -- bool "K6/K6-II/K6-III" -+ bool "AMD K6/K6-II/K6-III" - depends on X86_32 - ---help--- - Select this for an AMD K6-family processor. Enables use of -@@ -157,7 +157,7 @@ config MK6 - flags to GCC. - - config MK7 -- bool "Athlon/Duron/K7" -+ bool "AMD Athlon/Duron/K7" - depends on X86_32 - ---help--- - Select this for an AMD Athlon K7-family processor. Enables use of -@@ -165,12 +165,83 @@ config MK7 - flags to GCC. - - config MK8 -- bool "Opteron/Athlon64/Hammer/K8" -+ bool "AMD Opteron/Athlon64/Hammer/K8" - ---help--- - Select this for an AMD Opteron or Athlon64 Hammer-family processor. - Enables use of some extended instructions, and passes appropriate - optimization flags to GCC. - -+config MK8SSE3 -+ bool "AMD Opteron/Athlon64/Hammer/K8 with SSE3" -+ ---help--- -+ Select this for improved AMD Opteron or Athlon64 Hammer-family processors. -+ Enables use of some extended instructions, and passes appropriate -+ optimization flags to GCC. -+ -+config MK10 -+ bool "AMD 61xx/7x50/PhenomX3/X4/II/K10" -+ ---help--- -+ Select this for an AMD 61xx Eight-Core Magny-Cours, Athlon X2 7x50, -+ Phenom X3/X4/II, Athlon II X2/X3/X4, or Turion II-family processor. -+ Enables use of some extended instructions, and passes appropriate -+ optimization flags to GCC. -+ -+config MBARCELONA -+ bool "AMD Barcelona" -+ ---help--- -+ Select this for AMD Family 10h Barcelona processors. 
-+ -+ Enables -march=barcelona -+ -+config MBOBCAT -+ bool "AMD Bobcat" -+ ---help--- -+ Select this for AMD Family 14h Bobcat processors. -+ -+ Enables -march=btver1 -+ -+config MJAGUAR -+ bool "AMD Jaguar" -+ ---help--- -+ Select this for AMD Family 16h Jaguar processors. -+ -+ Enables -march=btver2 -+ -+config MBULLDOZER -+ bool "AMD Bulldozer" -+ ---help--- -+ Select this for AMD Family 15h Bulldozer processors. -+ -+ Enables -march=bdver1 -+ -+config MPILEDRIVER -+ bool "AMD Piledriver" -+ ---help--- -+ Select this for AMD Family 15h Piledriver processors. -+ -+ Enables -march=bdver2 -+ -+config MSTEAMROLLER -+ bool "AMD Steamroller" -+ ---help--- -+ Select this for AMD Family 15h Steamroller processors. -+ -+ Enables -march=bdver3 -+ -+config MEXCAVATOR -+ bool "AMD Excavator" -+ ---help--- -+ Select this for AMD Family 15h Excavator processors. -+ -+ Enables -march=bdver4 -+ -+config MZEN -+ bool "AMD Zen" -+ ---help--- -+ Select this for AMD Family 17h Zen processors. -+ -+ Enables -march=znver1 -+ - config MCRUSOE - bool "Crusoe" - depends on X86_32 -@@ -252,6 +323,7 @@ config MVIAC7 - - config MPSC - bool "Intel P4 / older Netburst based Xeon" -+ select X86_P6_NOP - depends on X86_64 - ---help--- - Optimize for Intel Pentium 4, Pentium D and older Nocona/Dempsey -@@ -261,8 +333,19 @@ config MPSC - using the cpu family field - in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. - -+config MATOM -+ bool "Intel Atom" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for the Intel Atom platform. Intel Atom CPUs have an -+ in-order pipelining architecture and thus can benefit from -+ accordingly optimized code. Use a recent GCC with specific Atom -+ support in order to fully benefit from selecting this option. 
-+ - config MCORE2 -- bool "Core 2/newer Xeon" -+ bool "Intel Core 2" -+ select X86_P6_NOP - ---help--- - - Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and -@@ -270,14 +353,79 @@ config MCORE2 - family in /proc/cpuinfo. Newer ones have 6 and older ones 15 - (not a typo) - --config MATOM -- bool "Intel Atom" -+ Enables -march=core2 -+ -+config MNEHALEM -+ bool "Intel Nehalem" -+ select X86_P6_NOP - ---help--- - -- Select this for the Intel Atom platform. Intel Atom CPUs have an -- in-order pipelining architecture and thus can benefit from -- accordingly optimized code. Use a recent GCC with specific Atom -- support in order to fully benefit from selecting this option. -+ Select this for 1st Gen Core processors in the Nehalem family. -+ -+ Enables -march=nehalem -+ -+config MWESTMERE -+ bool "Intel Westmere" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for the Intel Westmere formerly Nehalem-C family. -+ -+ Enables -march=westmere -+ -+config MSILVERMONT -+ bool "Intel Silvermont" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for the Intel Silvermont platform. -+ -+ Enables -march=silvermont -+ -+config MSANDYBRIDGE -+ bool "Intel Sandy Bridge" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for 2nd Gen Core processors in the Sandy Bridge family. -+ -+ Enables -march=sandybridge -+ -+config MIVYBRIDGE -+ bool "Intel Ivy Bridge" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for 3rd Gen Core processors in the Ivy Bridge family. -+ -+ Enables -march=ivybridge -+ -+config MHASWELL -+ bool "Intel Haswell" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for 4th Gen Core processors in the Haswell family. -+ -+ Enables -march=haswell -+ -+config MBROADWELL -+ bool "Intel Broadwell" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for 5th Gen Core processors in the Broadwell family. 
-+ -+ Enables -march=broadwell -+ -+config MSKYLAKE -+ bool "Intel Skylake" -+ select X86_P6_NOP -+ ---help--- -+ -+ Select this for 6th Gen Core processors in the Skylake family. -+ -+ Enables -march=skylake - - config GENERIC_CPU - bool "Generic-x86-64" -@@ -286,6 +434,19 @@ config GENERIC_CPU - Generic x86-64 CPU. - Run equally well on all x86-64 CPUs. - -+config MNATIVE -+ bool "Native optimizations autodetected by GCC" -+ ---help--- -+ -+ GCC 4.2 and above support -march=native, which automatically detects -+ the optimum settings to use based on your processor. -march=native -+ also detects and applies additional settings beyond -march specific -+ to your CPU, (eg. -msse4). Unless you have a specific reason not to -+ (e.g. distcc cross-compiling), you should probably be using -+ -march=native rather than anything listed below. -+ -+ Enables -march=native -+ - endchoice - - config X86_GENERIC -@@ -310,7 +471,7 @@ config X86_INTERNODE_CACHE_SHIFT - config X86_L1_CACHE_SHIFT - int - default "7" if MPENTIUM4 || MPSC -- default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU -+ default "6" if MK7 || MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MPENTIUMM || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU - default "4" if MELAN || M486 || MGEODEGX1 - default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - -@@ -341,45 +502,46 @@ config X86_ALIGNMENT_16 - - config X86_INTEL_USERCOPY - def_bool y -- depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || 
X86_GENERIC || MK8 || MK8SSE3 || MK7 || MEFFICEON || MCORE2 || MK10 || MBARCELONA || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE - - config X86_USE_PPRO_CHECKSUM - def_bool y -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MK10 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MATOM || MNATIVE - - config X86_USE_3DNOW - def_bool y - depends on (MCYRIXIII || MK7 || MGEODE_LX) && !UML - --# --# P6_NOPs are a relatively minor optimization that require a family >= --# 6 processor, except that it is broken on certain VIA chips. --# Furthermore, AMD chips prefer a totally different sequence of NOPs --# (which work on all CPUs). In addition, it looks like Virtual PC --# does not understand them. --# --# As a result, disallow these if we're not compiling for X86_64 (these --# NOPs do work on all x86-64 capable chips); the list of processors in --# the right-hand clause are the cores that benefit from this optimization. --# - config X86_P6_NOP -- def_bool y -- depends on X86_64 -- depends on (MCORE2 || MPENTIUM4 || MPSC) -+ default n -+ bool "Support for P6_NOPs on Intel chips" -+ depends on (MCORE2 || MPENTIUM4 || MPSC || MATOM || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE) -+ ---help--- -+ P6_NOPs are a relatively minor optimization that require a family >= -+ 6 processor, except that it is broken on certain VIA chips. 
-+ Furthermore, AMD chips prefer a totally different sequence of NOPs -+ (which work on all CPUs). In addition, it looks like Virtual PC -+ does not understand them. -+ -+ As a result, disallow these if we're not compiling for X86_64 (these -+ NOPs do work on all x86-64 capable chips); the list of processors in -+ the right-hand clause are the cores that benefit from this optimization. -+ -+ Say Y if you have Intel CPU newer than Pentium Pro, N otherwise. - - config X86_TSC - def_bool y -- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 -+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MK8SSE3 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MNATIVE || MATOM) || X86_64 - - config X86_CMPXCHG64 - def_bool y -- depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM -+ depends on X86_PAE || X86_64 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MATOM || MNATIVE - - # this should be set for all -march=.. options where the compiler - # generates cmov. 
- config X86_CMOV - def_bool y -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) -+ depends on (MK8 || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MJAGUAR || MK7 || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MNATIVE || MATOM || MGEODE_LX) - - config X86_MINIMUM_CPU_FAMILY - int ---- a/arch/x86/Makefile 2016-12-11 14:17:54.000000000 -0500 -+++ b/arch/x86/Makefile 2017-01-06 20:44:36.603227283 -0500 -@@ -104,13 +104,40 @@ else - KBUILD_CFLAGS += $(call cc-option,-mskip-rax-setup) - - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) -+ cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) - cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8) -+ cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-mtune=k8) -+ cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10) -+ cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona) -+ cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1) -+ cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2) -+ cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1) -+ cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2) -+ cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3) -+ cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4) -+ cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1) - cflags-$(CONFIG_MPSC) += $(call cc-option,-march=nocona) - - cflags-$(CONFIG_MCORE2) += \ -- $(call cc-option,-march=core2,$(call cc-option,-mtune=generic)) -- cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom) \ -- $(call cc-option,-mtune=atom,$(call 
cc-option,-mtune=generic)) -+ $(call cc-option,-march=core2,$(call cc-option,-mtune=core2)) -+ cflags-$(CONFIG_MNEHALEM) += \ -+ $(call cc-option,-march=nehalem,$(call cc-option,-mtune=nehalem)) -+ cflags-$(CONFIG_MWESTMERE) += \ -+ $(call cc-option,-march=westmere,$(call cc-option,-mtune=westmere)) -+ cflags-$(CONFIG_MSILVERMONT) += \ -+ $(call cc-option,-march=silvermont,$(call cc-option,-mtune=silvermont)) -+ cflags-$(CONFIG_MSANDYBRIDGE) += \ -+ $(call cc-option,-march=sandybridge,$(call cc-option,-mtune=sandybridge)) -+ cflags-$(CONFIG_MIVYBRIDGE) += \ -+ $(call cc-option,-march=ivybridge,$(call cc-option,-mtune=ivybridge)) -+ cflags-$(CONFIG_MHASWELL) += \ -+ $(call cc-option,-march=haswell,$(call cc-option,-mtune=haswell)) -+ cflags-$(CONFIG_MBROADWELL) += \ -+ $(call cc-option,-march=broadwell,$(call cc-option,-mtune=broadwell)) -+ cflags-$(CONFIG_MSKYLAKE) += \ -+ $(call cc-option,-march=skylake,$(call cc-option,-mtune=skylake)) -+ cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell) \ -+ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) - cflags-$(CONFIG_GENERIC_CPU) += $(call cc-option,-mtune=generic) - KBUILD_CFLAGS += $(cflags-y) - ---- a/arch/x86/Makefile_32.cpu 2016-12-11 14:17:54.000000000 -0500 -+++ b/arch/x86/Makefile_32.cpu 2017-01-06 20:44:36.603227283 -0500 -@@ -23,7 +23,18 @@ cflags-$(CONFIG_MK6) += -march=k6 - # Please note, that patches that add -march=athlon-xp and friends are pointless. - # They make zero difference whatsosever to performance at this time. 
- cflags-$(CONFIG_MK7) += -march=athlon -+cflags-$(CONFIG_MNATIVE) += $(call cc-option,-march=native) - cflags-$(CONFIG_MK8) += $(call cc-option,-march=k8,-march=athlon) -+cflags-$(CONFIG_MK8SSE3) += $(call cc-option,-march=k8-sse3,-march=athlon) -+cflags-$(CONFIG_MK10) += $(call cc-option,-march=amdfam10,-march=athlon) -+cflags-$(CONFIG_MBARCELONA) += $(call cc-option,-march=barcelona,-march=athlon) -+cflags-$(CONFIG_MBOBCAT) += $(call cc-option,-march=btver1,-march=athlon) -+cflags-$(CONFIG_MJAGUAR) += $(call cc-option,-march=btver2,-march=athlon) -+cflags-$(CONFIG_MBULLDOZER) += $(call cc-option,-march=bdver1,-march=athlon) -+cflags-$(CONFIG_MPILEDRIVER) += $(call cc-option,-march=bdver2,-march=athlon) -+cflags-$(CONFIG_MSTEAMROLLER) += $(call cc-option,-march=bdver3,-march=athlon) -+cflags-$(CONFIG_MEXCAVATOR) += $(call cc-option,-march=bdver4,-march=athlon) -+cflags-$(CONFIG_MZEN) += $(call cc-option,-march=znver1,-march=athlon) - cflags-$(CONFIG_MCRUSOE) += -march=i686 $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 - cflags-$(CONFIG_MEFFICEON) += -march=i686 $(call tune,pentium3) $(align)-functions=0 $(align)-jumps=0 $(align)-loops=0 - cflags-$(CONFIG_MWINCHIPC6) += $(call cc-option,-march=winchip-c6,-march=i586) -@@ -32,8 +43,16 @@ cflags-$(CONFIG_MCYRIXIII) += $(call cc- - cflags-$(CONFIG_MVIAC3_2) += $(call cc-option,-march=c3-2,-march=i686) - cflags-$(CONFIG_MVIAC7) += -march=i686 - cflags-$(CONFIG_MCORE2) += -march=i686 $(call tune,core2) --cflags-$(CONFIG_MATOM) += $(call cc-option,-march=atom,$(call cc-option,-march=core2,-march=i686)) \ -- $(call cc-option,-mtune=atom,$(call cc-option,-mtune=generic)) -+cflags-$(CONFIG_MNEHALEM) += -march=i686 $(call tune,nehalem) -+cflags-$(CONFIG_MWESTMERE) += -march=i686 $(call tune,westmere) -+cflags-$(CONFIG_MSILVERMONT) += -march=i686 $(call tune,silvermont) -+cflags-$(CONFIG_MSANDYBRIDGE) += -march=i686 $(call tune,sandybridge) -+cflags-$(CONFIG_MIVYBRIDGE) += -march=i686 $(call tune,ivybridge) 
-+cflags-$(CONFIG_MHASWELL) += -march=i686 $(call tune,haswell) -+cflags-$(CONFIG_MBROADWELL) += -march=i686 $(call tune,broadwell) -+cflags-$(CONFIG_MSKYLAKE) += -march=i686 $(call tune,skylake) -+cflags-$(CONFIG_MATOM) += $(call cc-option,-march=bonnell,$(call cc-option,-march=core2,-march=i686)) \ -+ $(call cc-option,-mtune=bonnell,$(call cc-option,-mtune=generic)) - - # AMD Elan support - cflags-$(CONFIG_MELAN) += -march=i486 diff --git a/patches/mce_Keep_quiet_in_case_of_broadcasted_mce_after_system_panic.patch b/patches/mce_Keep_quiet_in_case_of_broadcasted_mce_after_system_panic.patch deleted file mode 100644 index f5b955a..0000000 --- a/patches/mce_Keep_quiet_in_case_of_broadcasted_mce_after_system_panic.patch +++ /dev/null @@ -1,43 +0,0 @@ -diff --git a/arch/x86/kernel/cpu/mcheck/mce.c b/arch/x86/kernel/cpu/mcheck/mce.c -index 00ef432..0c2bf77 100644 ---- a/arch/x86/kernel/cpu/mcheck/mce.c -+++ b/arch/x86/kernel/cpu/mcheck/mce.c -@@ -1157,6 +1157,23 @@ void do_machine_check(struct pt_regs *regs, long error_code) - - mce_gather_info(&m, regs); - -+ /* -+ * Check if this MCE is signaled to only this logical processor, -+ * on Intel only. -+ */ -+ if (m.cpuvendor == X86_VENDOR_INTEL) -+ lmce = m.mcgstatus & MCG_STATUS_LMCES; -+ -+ /* -+ * Special treatment for Intel broadcasted machine check: -+ * To avoid panic due to MCE synchronization in case of kdump, -+ * after system panic, clear global status and bail out. -+ */ -+ if (!lmce && atomic_read(&panic_cpu) != PANIC_CPU_INVALID) { -+ wrmsrl(MSR_IA32_MCG_STATUS, 0); -+ goto out; -+ } -+ - final = this_cpu_ptr(&mces_seen); - *final = m; - -@@ -1174,13 +1191,6 @@ void do_machine_check(struct pt_regs *regs, long error_code) - kill_it = 1; - - /* -- * Check if this MCE is signaled to only this logical processor, -- * on Intel only. -- */ -- if (m.cpuvendor == X86_VENDOR_INTEL) -- lmce = m.mcgstatus & MCG_STATUS_LMCES; -- -- /* - * Go through all banks in exclusion of the other CPUs. 
This way we - * don't report duplicated events on shared banks because the first one - * to see it will clear it. If this is a Local MCE, then no need to -