--- linux-azure-4.13.0.orig/Documentation/ABI/testing/debugfs-aufs
+++ linux-azure-4.13.0/Documentation/ABI/testing/debugfs-aufs
@@ -0,0 +1,50 @@
+What:		/debug/aufs/si_<id>/
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		Under /debug/aufs, a directory named si_<id> is created
+		per aufs mount, where <id> is a unique id generated
+		internally.
+
+What:		/debug/aufs/si_<id>/plink
+Date:		Apr 2013
+Contact:	J. R. Okajima
+Description:
+		It has three lines and shows the information about the
+		pseudo-links. The first line is a single number
+		representing the number of buckets. The second line is
+		the number of pseudo-links per bucket (separated by a
+		blank). The last line is a single number representing
+		the total number of pseudo-links.
+		When the aufs mount option 'noplink' is specified, it
+		will show "1\n0\n0\n".
+
+What:		/debug/aufs/si_<id>/xib
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		It shows the consumed blocks by xib (External Inode Number
+		Bitmap), its block size and file size.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
+
+What:		/debug/aufs/si_<id>/xino0, xino1 ... xinoN
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		It shows the consumed blocks by xino (External Inode Number
+		Translation Table), its link count, block size and file
+		size.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
+
+What:		/debug/aufs/si_<id>/xigen
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		It shows the consumed blocks by xigen (External Inode
+		Generation Table), its block size and file size.
+		If CONFIG_AUFS_EXPORT is disabled, this entry will not
+		be created.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
--- linux-azure-4.13.0.orig/Documentation/ABI/testing/sysfs-aufs
+++ linux-azure-4.13.0/Documentation/ABI/testing/sysfs-aufs
@@ -0,0 +1,31 @@
+What:		/sys/fs/aufs/si_<id>/
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		Under /sys/fs/aufs, a directory named si_<id> is created
+		per aufs mount, where <id> is a unique id generated
+		internally.
+
+What:		/sys/fs/aufs/si_<id>/br0, br1 ... brN
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		It shows the absolute path of a member directory (which
+		is called a branch) in aufs, and its permission.
+
+What:		/sys/fs/aufs/si_<id>/brid0, brid1 ... bridN
+Date:		July 2013
+Contact:	J. R. Okajima
+Description:
+		It shows the id of a member directory (which is called a
+		branch) in aufs.
+
+What:		/sys/fs/aufs/si_<id>/xi_path
+Date:		March 2009
+Contact:	J. R. Okajima
+Description:
+		It shows the absolute path of the XINO (External Inode Number
+		Bitmap, Translation Table and Generation Table) file,
+		even if it is the default path.
+		When the aufs mount option 'noxino' is specified, it
+		will be empty. About XINO files, see the aufs manual.
--- linux-azure-4.13.0.orig/Documentation/ABI/testing/sysfs-bus-thunderbolt
+++ linux-azure-4.13.0/Documentation/ABI/testing/sysfs-bus-thunderbolt
@@ -45,6 +45,8 @@
 Description:	When a device supports Thunderbolt secure connect it will
 		have this attribute. Writing a 32 byte hex string changes
 		authorization to use the secure connection method instead.
+		Writing an empty string clears the key and the regular
+		connection method can be used again.
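+
+		For illustration only (the device path "0-1" and the key
+		value are assumptions, not taken from this document), a
+		key might be set and later cleared like this:
+
+		# echo 0123456789abcdef0123456789abcdef0123456789abcdef0123456789abcdef \
+			> /sys/bus/thunderbolt/devices/0-1/key
+		# echo "" > /sys/bus/thunderbolt/devices/0-1/key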
 
 What:		/sys/bus/thunderbolt/devices/.../device
 Date:		Sep 2017
--- linux-azure-4.13.0.orig/Documentation/ABI/testing/sysfs-devices-system-cpu
+++ linux-azure-4.13.0/Documentation/ABI/testing/sysfs-devices-system-cpu
@@ -373,3 +373,19 @@
 Description:	information about CPUs heterogeneity.
 
 		cpu_capacity: capacity of cpu#.
+
+What:		/sys/devices/system/cpu/vulnerabilities
+		/sys/devices/system/cpu/vulnerabilities/meltdown
+		/sys/devices/system/cpu/vulnerabilities/spectre_v1
+		/sys/devices/system/cpu/vulnerabilities/spectre_v2
+Date:		January 2018
+Contact:	Linux kernel mailing list <linux-kernel@vger.kernel.org>
+Description:	Information about CPU vulnerabilities
+
+		The files are named after the code names of CPU
+		vulnerabilities. The output of those files reflects the
+		state of the CPUs in the system. Possible output values:
+
+		"Not affected"	  CPU is not affected by the vulnerability
+		"Vulnerable"	  CPU is affected and no mitigation in effect
+		"Mitigation: $M"  CPU is affected and mitigation $M is in effect
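+
+		A hedged illustration (the exact mitigation strings are
+		hypothetical and vary by CPU, microcode and kernel
+		configuration):
+
+		$ grep . /sys/devices/system/cpu/vulnerabilities/*
+		/sys/devices/system/cpu/vulnerabilities/meltdown:Mitigation: PTI
+		/sys/devices/system/cpu/vulnerabilities/spectre_v1:Vulnerable
+		/sys/devices/system/cpu/vulnerabilities/spectre_v2:Mitigation: Full generic retpoline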
--- linux-azure-4.13.0.orig/Documentation/admin-guide/LSM/index.rst
+++ linux-azure-4.13.0/Documentation/admin-guide/LSM/index.rst
@@ -17,11 +17,16 @@
 specific changes to system operation when these tweaks are not available
 in the core functionality of Linux itself.
 
-Without a specific LSM built into the kernel, the default LSM will be the
-Linux capabilities system. Most LSMs choose to extend the capabilities
-system, building their checks on top of the defined capability hooks.
-For more details on capabilities, see ``capabilities(7)`` in the Linux
-man-pages project.
+The Linux capabilities modules will always be included. For more details
+on capabilities, see ``capabilities(7)`` in the Linux man-pages project.
+
+Security modules that do not use the security data blobs maintained
+by the LSM infrastructure are considered "minor" modules. These may be
+included at compile time and stacked explicitly. Security modules that
+use the LSM maintained security blobs are considered "major" modules.
+These may only be stacked if the CONFIG_LSM_STACKED configuration
+option is used. If this is chosen, all of the security modules selected
+will be used.
 
 A list of the active security modules can be found by reading
 ``/sys/kernel/security/lsm``. This is a comma separated list, and
@@ -30,6 +35,22 @@
 be first, followed by any "minor" modules (e.g. Yama) and then
 the one "major" module (e.g. SELinux) if there is one configured.
 
+Process attributes associated with "major" security modules should
+be accessed and maintained using the special files in ``/proc/.../attr``.
+A security module may maintain a module specific subdirectory there,
+named after the module. ``/proc/.../attr/smack`` is provided by the Smack
+security module and contains all its special files. The files directly
+in ``/proc/.../attr`` remain as legacy interfaces for modules that provide
+subdirectories.
+
+The files named "context" in the attr directories contain the
+same information as the "current" files, but formatted to
+identify the module it comes from.
+
+If SELinux is the active security module:
+	/proc/self/attr/context could contain selinux='unconfined_t'
+	/proc/self/attr/selinux/context could contain selinux='unconfined_t'
+
 .. toctree::
    :maxdepth: 1
--- linux-azure-4.13.0.orig/Documentation/admin-guide/kernel-parameters.rst
+++ linux-azure-4.13.0/Documentation/admin-guide/kernel-parameters.rst
@@ -138,6 +138,7 @@
 	PPT	Parallel port support is enabled.
 	PS2	Appropriate PS/2 support is enabled.
 	RAM	RAM disk support is enabled.
+	RDT	Intel Resource Director Technology.
 	S390	S390 architecture is enabled.
 	SCSI	Appropriate SCSI support is enabled.
 		A lot of drivers have their options described inside
--- linux-azure-4.13.0.orig/Documentation/admin-guide/kernel-parameters.txt
+++ linux-azure-4.13.0/Documentation/admin-guide/kernel-parameters.txt
@@ -656,6 +656,10 @@
 			0: default value, disable debugging
 			1: enable debugging at boot time
 
+	cpufreq_driver=	[X86] Allow only the named cpu frequency scaling
+			driver to register. Example: cpufreq_driver=powernow-k8
+			Format: { none | STRING }
+
 	cpuidle.off=1	[CPU_IDLE]
 			disable the cpuidle sub-system
 
@@ -2562,6 +2566,16 @@
 			noexec=on: enable non-executable mappings (default)
 			noexec=off: disable non-executable mappings
 
+	noibrs		[X86]
+			Don't use the indirect branch restricted speculation
+			(IBRS) feature when running in a secure environment,
+			to avoid its performance overhead.
+
+	noibpb		[X86]
+			Don't use the indirect branch prediction barrier
+			(IBPB) feature when running in a secure environment,
+			to avoid its performance overhead.
+
 	nosmap		[X86]
 			Disable SMAP (Supervisor Mode Access Prevention)
 			even if it is supported by processor.
 
@@ -2588,6 +2602,11 @@
 	nosmt		[KNL,S390] Disable symmetric multithreading (SMT).
 			Equivalent to smt=1.
 
+	nospectre_v2	[X86] Disable all mitigations for the Spectre variant 2
+			(indirect branch prediction) vulnerability. The system
+			may allow data leaks with this option, which is
+			equivalent to spectre_v2=off.
+
 	noxsave		[BUGS=X86] Disables x86 extended register state save
 			and restore using xsave. The kernel will fallback to
 			enabling legacy floating-point and sse state.
 
@@ -2696,6 +2715,8 @@
 	nopat		[X86] Disable PAT (page attribute table extension of
 			pagetables) support.
 
+	nopcid		[X86-64] Disable the PCID cpu feature.
+
 	norandmaps	Don't use address space randomization. Equivalent
			to echo 0 > /proc/sys/kernel/randomize_va_space
 
@@ -3230,6 +3251,21 @@
 	pt.		[PARIDE]
 			See Documentation/blockdev/paride.txt.
 
+	pti=		[X86_64] Control Page Table Isolation of user and
+			kernel address spaces. Disabling this feature
+			removes hardening, but improves performance of
+			system calls and interrupts.
+
+			on   - unconditionally enable
+			off  - unconditionally disable
+			auto - kernel detects whether your CPU model is
+			       vulnerable to issues that PTI mitigates
+
+			Not specifying this option is equivalent to pti=auto.
+
+	nopti		[X86_64]
+			Equivalent to pti=off
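+
+			As an illustration (hypothetical output; the exact
+			string depends on the CPU and kernel), whether PTI is
+			active can be checked via the sysfs file documented in
+			sysfs-devices-system-cpu:
+
+			$ cat /sys/devices/system/cpu/vulnerabilities/meltdown
+			Mitigation: PTI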
 
 	pty.legacy_count=
 			[KNL] Number of legacy pty's. Overwrites compiled-in
 			default number.
 
@@ -3598,6 +3634,12 @@
 			Run specified binary instead of /init from the ramdisk,
 			used for early userspace startup. See initrd.
 
+	rdt=		[HW,X86,RDT]
+			Turn on/off individual RDT features. List is:
+			cmt, mbmtotal, mbmlocal, l3cat, l3cdp, l2cat, mba.
+			E.g. to turn on cmt and turn off mba use:
+				rdt=cmt,!mba
+
 	reboot=		[KNL]
 			Format (x86 or x86_64):
 				[w[arm] | c[old] | h[ard] | s[oft] | g[pio]] \
 
@@ -3864,6 +3906,29 @@
 	sonypi.*=	[HW] Sony Programmable I/O Control Device driver
 			See Documentation/laptops/sonypi.txt
 
+	spectre_v2=	[X86] Control mitigation of Spectre variant 2
+			(indirect branch speculation) vulnerability.
+
+			on   - unconditionally enable
+			off  - unconditionally disable
+			auto - kernel detects whether your CPU model is
+			       vulnerable
+
+			Selecting 'on' will, and 'auto' may, choose a
+			mitigation method at run time according to the
+			CPU, the available microcode, the setting of the
+			CONFIG_RETPOLINE configuration option, and the
+			compiler with which the kernel was built.
+
+			Specific mitigations can also be selected manually:
+
+			retpoline	  - replace indirect branches
+			retpoline,generic - Google's original retpoline
+			retpoline,amd	  - AMD-specific minimal thunk
+
+			Not specifying this option is equivalent to
+			spectre_v2=auto.
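+
+			For example (an illustrative command line, not taken
+			from this document), to force the generic retpoline
+			mitigation one would boot with:
+				spectre_v2=retpoline,generic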
 
 	spia_io_base=	[HW,MTD]
 	spia_fio_base=
 	spia_pedr=
--- linux-azure-4.13.0.orig/Documentation/arm64/cpu-feature-registers.txt
+++ linux-azure-4.13.0/Documentation/arm64/cpu-feature-registers.txt
@@ -179,6 +179,8 @@
      | FCMA                         | [19-16] |    y    |
      |--------------------------------------------------|
      | JSCVT                        | [15-12] |    y    |
+     |--------------------------------------------------|
+     | DPB                          | [3-0]   |    y    |
      x--------------------------------------------------x
 
 Appendix I: Example
--- linux-azure-4.13.0.orig/Documentation/arm64/silicon-errata.txt
+++ linux-azure-4.13.0/Documentation/arm64/silicon-errata.txt
@@ -71,6 +71,7 @@
 | Hisilicon      | Hip0{5,6,7}    | #161010101      | HISILICON_ERRATUM_161010101 |
 | Hisilicon      | Hip0{6,7}      | #161010701      | N/A                         |
 |                |                |                 |                             |
-| Qualcomm Tech. | Falkor v1      | E1003           | QCOM_FALKOR_ERRATUM_1003    |
+| Qualcomm Tech. | Kryo/Falkor v1 | E1003           | QCOM_FALKOR_ERRATUM_1003    |
 | Qualcomm Tech. | Falkor v1      | E1009           | QCOM_FALKOR_ERRATUM_1009    |
 | Qualcomm Tech. | QDF2400 ITS    | E0065           | QCOM_QDF2400_ERRATUM_0065   |
+| Qualcomm Tech. | Falkor v{1,2}  | E1041           | QCOM_FALKOR_ERRATUM_1041    |
--- linux-azure-4.13.0.orig/Documentation/cgroups/namespace.txt
+++ linux-azure-4.13.0/Documentation/cgroups/namespace.txt
@@ -0,0 +1,142 @@
+			CGroup Namespaces
+
+CGroup Namespace provides a mechanism to virtualize the view of the
+/proc/<pid>/cgroup file. The CLONE_NEWCGROUP clone flag can be used with
+the clone() and unshare() syscalls to create a new cgroup namespace.
+The process running inside the cgroup namespace will have its
+/proc/<pid>/cgroup output restricted to cgroupns-root. cgroupns-root is
+the cgroup of the process at the time of creation of the cgroup
+namespace.
+
+Prior to CGroup Namespaces, the /proc/<pid>/cgroup file used to show the
+complete path of the cgroup of a process. In a container setup (where a
+set of cgroups and namespaces are intended to isolate processes), the
+/proc/<pid>/cgroup file may leak potential system level information to
+the isolated processes.
+
+For example:
+  $ cat /proc/self/cgroup
+  0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1
+
+The path '/batchjobs/container_id1' can generally be considered
+system-data and it is desirable not to expose it to the isolated
+processes.
+
+CGroup Namespaces can be used to restrict visibility of this path.
+For example:
+  # Before creating cgroup namespace
+  $ ls -l /proc/self/ns/cgroup
+  lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835]
+  $ cat /proc/self/cgroup
+  0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1
+
+  # unshare(CLONE_NEWCGROUP) and exec /bin/bash
+  $ ~/unshare -c
+  [ns]$ ls -l /proc/self/ns/cgroup
+  lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183]
+  # From within the new cgroupns, the process sees that it is in the root cgroup
+  [ns]$ cat /proc/self/cgroup
+  0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
+
+  # From the global cgroupns:
+  $ cat /proc/<pid>/cgroup
+  0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1
+
+  # Unshare cgroupns along with userns and mountns
+  # The following calls unshare(CLONE_NEWCGROUP|CLONE_NEWUSER|CLONE_NEWNS),
+  # then sets up the uid/gid map and execs /bin/bash
+  $ ~/unshare -c -u -m
+  # Originally, we were in the /batchjobs/container_id1 cgroup. Mount our
+  # own cgroup hierarchy.
+  [ns]$ mount -t cgroup cgroup /tmp/cgroup
+  [ns]$ ls -l /tmp/cgroup
+  total 0
+  -r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.controllers
+  -r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.populated
+  -rw-r--r-- 1 root root 0 2014-10-13 09:25 cgroup.procs
+  -rw-r--r-- 1 root root 0 2014-10-13 09:32 cgroup.subtree_control
+
+The cgroupns-root (/batchjobs/container_id1 in the above example) becomes
+the filesystem root for the namespace specific cgroupfs mount.
+
+The virtualization of the /proc/self/cgroup file combined with restricting
+the view of the cgroup hierarchy by a namespace-private cgroupfs mount
+should provide a completely isolated cgroup view inside the container.
+
+In its current form, the cgroup namespaces patchset provides the
+following behavior:
+
+(1) The 'cgroupns-root' for a cgroup namespace is the cgroup in which
+    the process calling unshare is running.
+    For example, if a process in the /batchjobs/container_id1 cgroup calls
+    unshare, the cgroup /batchjobs/container_id1 becomes the cgroupns-root.
+    For the init_cgroup_ns, this is the real root ('/') cgroup
+    (identified in code as cgrp_dfl_root.cgrp).
+
+(2) The cgroupns-root cgroup does not change even if the namespace
+    creator process later moves to a different cgroup.
+
+    $ ~/unshare -c		# unshare cgroupns in some cgroup
+    [ns]$ cat /proc/self/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/
+    [ns]$ mkdir sub_cgrp_1
+    [ns]$ echo 0 > sub_cgrp_1/cgroup.procs
+    [ns]$ cat /proc/self/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
+
+(3) Each process gets its cgroupns specific view of /proc/<pid>/cgroup.
+
+(a) Processes running inside the cgroup namespace will be able to see
+    cgroup paths (in /proc/self/cgroup) only inside their root cgroup:
+
+    [ns]$ sleep 100000 &	# From within the unshared cgroupns
+    [1] 7353
+    [ns]$ echo 7353 > sub_cgrp_1/cgroup.procs
+    [ns]$ cat /proc/7353/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
+
+(b) From the global cgroupns, the real cgroup path will be visible:
+
+    $ cat /proc/7353/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1/sub_cgrp_1
+
+(c) From a sibling cgroupns (a cgroupns rooted at a different cgroup), the
+    cgroup path relative to its own cgroupns-root will be shown:
+
+    # ns2's cgroupns-root is at '/batchjobs/container_id2'
+    [ns2]$ cat /proc/7353/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/../container_id1/sub_cgrp_1
+
+    Note that the relative path always starts with '/' to indicate that
+    it is relative to the cgroupns-root of the caller.
+
+(4) Processes inside a cgroupns can move in and out of the cgroupns-root
+    (if they have proper access to external cgroups).
+
+    # From inside the cgroupns (with cgroupns-root at
+    # /batchjobs/container_id1), and assuming that the global hierarchy
+    # is still accessible inside the cgroupns:
+    $ cat /proc/7353/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1
+    $ echo 7353 > batchjobs/container_id2/cgroup.procs
+    $ cat /proc/7353/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/../container_id2
+
+    Note that this kind of setup is not encouraged. A task inside a
+    cgroupns should only be exposed to its own cgroupns hierarchy.
+    Otherwise it makes the virtualization of /proc/<pid>/cgroup less
+    useful.
+
+(5) Setns to another cgroup namespace is allowed when:
+    (a) the process has CAP_SYS_ADMIN in its current userns, and
+    (b) the process has CAP_SYS_ADMIN in the target cgroupns' userns.
+    No implicit cgroup changes happen with attaching to another cgroupns.
+    It is expected that someone moves the attaching process under the
+    target cgroupns-root.
+
+(6) When some thread from a multi-threaded process unshares its
+    cgroup namespace, the new cgroupns gets applied to the entire process
+    (all the threads). For the unified hierarchy this is expected, as it
+    only allows process-level containerization. For the legacy hierarchies
+    this may be unexpected. So all the threads in the process will have
+    the same cgroup.
+
+(7) The cgroup namespace is alive as long as there is at least one
+    process inside it. When the last process exits, the cgroup
+    namespace is destroyed. The cgroupns-root and the actual cgroups
+    remain, though.
+
+(8) A namespace specific cgroup hierarchy can be mounted by a process
+    running inside the cgroupns:
+
+    $ mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT
+
+    This will mount the unified cgroup hierarchy with the cgroupns-root
+    as the filesystem root. The process needs CAP_SYS_ADMIN in its userns
+    and mntns.
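+
+A minimal sketch of the same unshare step with the stock util-linux
+unshare(1) instead of the custom ~/unshare helper used above (this
+assumes a util-linux version that supports --cgroup; the output shown
+is illustrative):
+
+    $ sudo unshare --cgroup /bin/bash
+    [ns]$ cat /proc/self/cgroup
+    0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/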
--- linux-azure-4.13.0.orig/Documentation/dev-tools/gdb-kernel-debugging.rst
+++ linux-azure-4.13.0/Documentation/dev-tools/gdb-kernel-debugging.rst
@@ -31,11 +31,13 @@
   CONFIG_DEBUG_INFO_REDUCED off. If your architecture supports
   CONFIG_FRAME_POINTER, keep it enabled.
 
-- Install that kernel on the guest.
+- Install that kernel on the guest, turn off KASLR if necessary by adding
+  "nokaslr" to the kernel command line.
 
   Alternatively, QEMU allows to boot the kernel directly using -kernel,
   -append, -initrd command line switches. This is generally only useful
   if you do not depend on modules. See QEMU documentation for more
-  details on this mode.
+  details on this mode. In this case, you should build the kernel with
+  CONFIG_RANDOMIZE_BASE disabled if the architecture supports KASLR.
 
 - Enable the gdb stub of QEMU/KVM, either
--- linux-azure-4.13.0.orig/Documentation/devicetree/bindings/arm/hisilicon/hisilicon-low-pin-count.txt
+++ linux-azure-4.13.0/Documentation/devicetree/bindings/arm/hisilicon/hisilicon-low-pin-count.txt
@@ -0,0 +1,33 @@
+Hisilicon Hip06 low-pin-count device
+  Hisilicon Hip06 SoCs implement a Low Pin Count (LPC) controller, which
+  provides I/O access to some legacy ISA devices.
+  Hip06 is based on the arm64 architecture, where there is no I/O space. So
+  the I/O ports here are not cpu addresses, and there is no 'ranges'
+  property in the LPC device node.
+
+Required properties:
+- compatible: value should be as follows:
+	(a) "hisilicon,hip06-lpc"
+	(b) "hisilicon,hip07-lpc"
+- #address-cells: must be 2, sticking to the ISA/EISA binding doc.
+- #size-cells: must be 1, sticking to the ISA/EISA binding doc.
+- reg: base memory range where the LPC register set is mapped.
+
+Note:
+  The node name before '@' must be "isa" to indicate that the binding
+  sticks to the ISA/EISA binding specification.
+
+Example:
+
+isa@a01b0000 {
+	compatible = "hisilicon,hip06-lpc";
+	#address-cells = <2>;
+	#size-cells = <1>;
+	reg = <0x0 0xa01b0000 0x0 0x1000>;
+
+	ipmi0: bt@e4 {
+		compatible = "ipmi-bt";
+		device_type = "ipmi";
+		reg = <0x01 0xe4 0x04>;
+	};
+};
--- linux-azure-4.13.0.orig/Documentation/driver-api/firmware/request_firmware.rst
+++ linux-azure-4.13.0/Documentation/driver-api/firmware/request_firmware.rst
@@ -44,17 +44,6 @@
 .. kernel-doc:: drivers/base/firmware_class.c
    :functions: request_firmware_nowait
 
-Considerations for suspend and resume
-=====================================
-
-During suspend and resume only the built-in firmware and the firmware
-cache elements of the firmware API can be used. This is managed by
-fw_pm_notify().
-
-fw_pm_notify
-------------
-.. kernel-doc:: drivers/base/firmware_class.c
-   :functions: fw_pm_notify
-
 request firmware API expected driver use
 ========================================
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/README
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/README
@@ -0,0 +1,393 @@
+
+Aufs4 -- advanced multi layered unification filesystem version 4.x
+http://aufs.sf.net
+Junjiro R. Okajima
+
+
+0. Introduction
+----------------------------------------
+In the early days, aufs was an entirely re-designed and re-implemented
+Unionfs Version 1.x series. Adding many original ideas, approaches,
+improvements and implementations, it became totally different from
+Unionfs while keeping the basic features.
+Later, the Unionfs Version 2.x series began taking some of the same
+approaches as aufs1's.
+Unionfs is being developed by Professor Erez Zadok at Stony Brook
+University and his team.
+
+Aufs4 supports linux-4.0 and later; for the linux-3.x series try aufs3.
+If you want older kernel version support, try the aufs2-2.6.git or
+aufs2-standalone.git repository, or aufs1 from CVS on SourceForge.
+
+Note: it became clear that "Aufs was rejected. Let's give it up."
+      According to Christoph Hellwig, linux rejects all union-type
+      filesystems but UnionMount.
+
+
+PS. Al Viro seems to have a plan to merge aufs as well as overlayfs and
+    UnionMount, and he pointed out an issue around a directory mutex
+    lock, which aufs addressed. But it is still unsure whether aufs will
+    be merged (or any other union solution).
+
+
+1. Features
+----------------------------------------
+- unite several directories into a single virtual filesystem. A member
+  directory is called a branch.
+- you can specify the permission flags for a branch: 'readonly',
+  'readwrite' and 'whiteout-able.'
+- through an upper writable branch, internal copyup and whiteout,
+  files/dirs on a readonly branch are logically modifiable.
+- dynamic branch manipulation, add, del.
+- etc...
+
+Also there are many enhancements in aufs, such as:
+- test only the highest one for the directory permission (dirperm1)
+- copyup on open (coo=)
+- 'move' policy for copy-up between two writable branches, after
+  checking free space.
+- xattr, acl
+- readdir(3) in userspace.
+- keep inode numbers by an external inode number table
+- keep the timestamps of file/dir in internal copyup operation
+- seekable directory, supporting NFS readdir.
+- whiteout is hardlinked in order to reduce the consumption of inodes
+  on a branch
+- do not copyup, nor create a whiteout when it is unnecessary
+- revert a single systemcall when an error occurs in aufs
+- remount interface instead of ioctl
+- maintain /etc/mtab by an external command, /sbin/mount.aufs.
+- loopback mounted filesystem as a branch
+- kernel thread for removing a dir that has plenty of whiteouts
+- support copyup of a sparse file (a file which has a 'hole' in it)
+- default permission flags for branches
+- selectable permission flags for a ro branch, whether a whiteout can
+  exist or not
+- export via NFS.
+- support <sysfs>/fs/aufs and <debugfs>/aufs.
+- support multiple writable branches, some policies to select one
+  among multiple writable branches.
+- a new semantics for link(2) and rename(2) to support multiple
+  writable branches.
+- no glibc changes are required.
+- pseudo hardlink (hardlink over branches)
+- allow direct manual access to a file on a branch, e.g. bypassing aufs,
+  including an NFS or remote filesystem branch.
+- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX.
+- and more...
+
+Currently these features are dropped temporarily from aufs4.
+See design/08plan.txt for details.
+- nested mount, i.e. aufs as a readonly no-whiteout branch of another aufs
+  (robr)
+- statistics of aufs thread (/sys/fs/aufs/stat)
+
+Features or just ideas for the future (see also design/*.txt):
+- reorder the branch index without del/re-add.
+- permanent xino files for NFSD
+- an option for refreshing the opened files after add/del branches
+- light version, without branch manipulation. (unnecessary?)
+- copyup in userspace
+- inotify in userspace
+- readv/writev
+
+
+2. Download
+----------------------------------------
+There are three GIT trees for aufs4: aufs4-linux.git,
+aufs4-standalone.git, and aufs-util.git. Note that there is no "4" in
+"aufs-util.git."
+While aufs-util is always necessary, you need either aufs4-linux or
+aufs4-standalone.
+
+The aufs4-linux tree includes the whole linux mainline GIT tree,
+git://git.kernel.org/.../torvalds/linux.git.
+And you cannot select CONFIG_AUFS_FS=m for this version, i.e. you cannot
+build aufs4 as an external kernel module.
+Several extra patches are not included in this tree. Only the
+aufs4-standalone tree contains them. They are described in the later
+section "Configuration and Compilation."
+
+On the other hand, the aufs4-standalone tree has only the aufs source
+files and necessary patches, and you can select CONFIG_AUFS_FS=m.
+But you need to apply all aufs patches manually.
+
+You will find GIT branches whose names are in the form of "aufs4.x" where
+"x" represents the linux kernel version, "linux-4.x". For instance,
+"aufs4.0" is for linux-4.0. For the latest "linux-4.x-rcN", use the
+"aufs4.x-rcN" branch.
+
+o aufs4-linux tree
+$ git clone --reference /your/linux/git/tree \
+	git://github.com/sfjro/aufs4-linux.git aufs4-linux.git
+- if you don't have a linux GIT tree, then remove "--reference ..."
+$ cd aufs4-linux.git
+$ git checkout origin/aufs4.0
+
+Or you may want to directly git-pull aufs into your linux GIT tree, and
+leave the patch-work to GIT.
+$ cd /your/linux/git/tree
+$ git remote add aufs4 git://github.com/sfjro/aufs4-linux.git
+$ git fetch aufs4
+$ git checkout -b my4.0 v4.0
+$ (add your local change...)
+$ git pull aufs4 aufs4.0
+- now you have v4.0 + your_changes + aufs4.0 in your my4.0 branch.
+- you may need to solve some conflicts between your_changes and
+  aufs4.0. In this case, git-rerere is recommended so that you can
+  solve the similar conflicts automatically when you upgrade to 4.1 or
+  later in the future.
+
+o aufs4-standalone tree
+$ git clone git://github.com/sfjro/aufs4-standalone.git aufs4-standalone.git
+$ cd aufs4-standalone.git
+$ git checkout origin/aufs4.0
+
+o aufs-util tree
+$ git clone git://git.code.sf.net/p/aufs/aufs-util aufs-util.git
+- note that the public aufs-util.git is on SourceForge instead of
+  GitHub.
+$ cd aufs-util.git
+$ git checkout origin/aufs4.0
+
+Note: The 4.x-rcN branch is to be used with 'rc' kernel versions ONLY.
+The minor version number, 'x' in '4.x', of aufs may not always
+follow the minor version number of the kernel.
+This is because changes in the kernel that cause the use of a new
+minor version number do not always require changes to aufs-util.
+
+Since aufs-util has its own minor version number, you may not be
+able to find a GIT branch in aufs-util for your kernel's
+exact minor version number.
+In this case, you should git-checkout the branch for the
+nearest lower number.
+
+For (an unreleased) example:
+If you are using "linux-4.10" and the "aufs4.10" branch
+does not exist in the aufs-util repository, then "aufs4.9", "aufs4.8"
+or something numerically smaller is the branch for your kernel.
+
+Also you can view all branches by
+	$ git branch -a
+
+
+3. Configuration and Compilation
+----------------------------------------
+Make sure you have git-checkout'ed the correct branch.
+
+For the aufs4-linux tree,
+- enable CONFIG_AUFS_FS.
+- set other aufs configurations if necessary.
+
+For the aufs4-standalone tree,
+there are several ways to build.
+
+1.
+- apply ./aufs4-kbuild.patch to your kernel source files.
+- apply ./aufs4-base.patch too.
+- apply ./aufs4-mmap.patch too.
+- apply ./aufs4-standalone.patch too, if you have a plan to set
+  CONFIG_AUFS_FS=m. Otherwise you don't need ./aufs4-standalone.patch.
+- copy ./{Documentation,fs,include/uapi/linux/aufs_type.h} files to your
+  kernel source tree. Never copy $PWD/include/uapi/linux/Kbuild.
+- enable CONFIG_AUFS_FS; you can select either =m or =y.
+- and build your kernel as usual.
+- install the built kernel.
+  Note: Since linux-3.9, every filesystem module requires an alias
+  "fs-<fsname>". You should make sure that "fs-aufs" is listed in your
+  modules.aliases file if you set CONFIG_AUFS_FS=m.
+- install the header files too by "make headers_install" to the
+  directory you specify. By default, it is $PWD/usr.
+  "make help" shows a brief note for headers_install.
+- and reboot your system.
+
+2.
+- module only (CONFIG_AUFS_FS=m).
+- apply ./aufs4-base.patch to your kernel source files.
+- apply ./aufs4-mmap.patch too.
+- apply ./aufs4-standalone.patch too.
+- build your kernel, don't forget "make headers_install", and reboot.
+- edit ./config.mk and set other aufs configurations if necessary.
+  Note: You should read $PWD/fs/aufs/Kconfig carefully, which describes
+  every aufs configuration.
+- build the module by a simple "make".
+  Note: Since linux-3.9, every filesystem module requires an alias
+  "fs-<fsname>". You should make sure that "fs-aufs" is listed in your
+  modules.aliases file.
+- you can specify the ${KDIR} make variable, which points to your kernel
+  source tree.
+- install the files
+  + run "make install" to install the aufs module, or copy the built
+    $PWD/aufs.ko to /lib/modules/... and run depmod -a (or simply reboot).
+  + run "make install_headers" (instead of headers_install) to install
+    the modified aufs header file (you can specify DESTDIR, which is
+    available in the aufs standalone version's Makefile only), or copy
+    $PWD/usr/include/linux/aufs_type.h to /usr/include/linux or wherever
+    you like manually. By default, the target directory is $PWD/usr.
+- no need to apply aufs4-kbuild.patch, nor to copy source files to your
+  kernel source tree.
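+
+Condensed into one shell session, method 2 might look like this (a
+sketch only; the path names are assumptions, and your kernel version,
+branch and install paths will differ):
+
+	$ cd /path/to/linux-4.x		# your kernel source tree
+	$ patch -p1 < /path/to/aufs4-standalone.git/aufs4-base.patch
+	$ patch -p1 < /path/to/aufs4-standalone.git/aufs4-mmap.patch
+	$ patch -p1 < /path/to/aufs4-standalone.git/aufs4-standalone.patch
+	$ make && make headers_install	# build and reboot into this kernel
+	$ cd /path/to/aufs4-standalone.git
+	$ make KDIR=/path/to/linux-4.x
+	# make install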
+
+Note: The header file aufs_type.h is necessary to build aufs-util
+      as well as for "make headers_install" in the kernel source tree.
+      headers_install tends to be forgotten, but it is essentially
+      necessary, not only for building aufs-util.
+      You may not meet problems without headers_install in some older
+      versions though.
+
+And then,
+- read README in aufs-util, build and install it
+- note that your distribution may contain an obsoleted version of
+  aufs_type.h in /usr/include/linux or somewhere. When you build the
+  aufs utilities, make sure that your compiler refers to the correct
+  aufs header file, which is built by "make headers_install."
+- if you want to use readdir(3) in userspace or the pathconf(3) wrapper,
+  then run "make install_ulib" too. And refer to the aufs manual for
+  details.
+
+There are several other patches in aufs4-standalone.git. They are all
+optional. When you meet some problems, they will help you.
+- aufs4-loopback.patch
+  Supports a nested loopback mount in a branch-fs. This patch is
+  unnecessary until aufs produces a message like "you may want to try
+  another patch for loopback file".
+- vfs-ino.patch
+  Modifies a system global kernel internal function get_next_ino() in
+  order to stop assigning 0 as an inode-number. Not directly related to
+  aufs, but recommended generally.
+- tmpfs-idr.patch
+  Keeps the tmpfs inode number as the lowest value. Effective to reduce
+  the size of aufs XINO files for a tmpfs branch. Also it prevents the
+  duplication of inode numbers, which is important for backup tools and
+  other utilities. When you find aufs XINO files for a tmpfs branch
+  growing too much, try this patch.
+- lockdep-debug.patch
+  Because aufs is not only an ordinary filesystem (callee of VFS), but
+  also a caller of VFS functions for branch filesystems, subclassing of
+  the internal locks for LOCKDEP is necessary. LOCKDEP is a debugging
+  feature of the linux kernel. If you enable CONFIG_LOCKDEP, then you
+  will need to apply this debug patch to expand several constant values.
+  If you don't know what LOCKDEP is, then you don't have to apply this
+  patch.
+
+
+4. Usage
+----------------------------------------
+At first, make sure aufs-util is installed, and please read the aufs
+manual, aufs.5 in the aufs-util.git tree.
+$ man -l aufs.5
+
+And then,
+$ mkdir /tmp/rw /tmp/aufs
+# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs
+
+Here is another example. The result is equivalent.
+# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs
+  Or
+# mount -t aufs -o br:/tmp/rw none /tmp/aufs
+# mount -o remount,append:${HOME} /tmp/aufs
+
+Then, you can see the whole tree of your home dir through /tmp/aufs. If
+you modify a file under /tmp/aufs, the one in your home directory is
+not affected; instead the same named file will be newly created under
+/tmp/rw. And all of your modifications to a file will be applied to
+the one under /tmp/rw. This is called the file based Copy on Write
+(COW) method.
+Aufs mount options are described in aufs.5.
+If you run chroot or something and make your aufs a root directory,
+then you need to customize the shutdown script. See the aufs manual for
+details.
+
+Additionally, there are some sample usages of aufs, such as a
+diskless system with network booting, and LiveCD over NFS.
+See the sample dir in the CVS tree on SourceForge.
+
+
+5. Contact
+----------------------------------------
+When you have any problems or strange behaviour in aufs, please let me
+know with:
+- /proc/mounts (instead of the output of mount(8))
+- /sys/module/aufs/*
+- /sys/fs/aufs/* (if you have them)
+- /debug/aufs/* (if you have them)
+- linux kernel version
+  if your kernel is not plain, for example modified by a distributor,
+  the url where I can download its source is necessary too.
+- aufs version which was printed at loading the module or booting the
+  system, instead of the date you downloaded it.
+- configuration (define/undefine CONFIG_AUFS_xxx)
+- kernel configuration or /proc/config.gz (if you have it)
+- behaviour which you think to be incorrect
+- actual operation, a reproducible one is better
+- mailto: aufs-users at lists.sourceforge.net
+
+Usually, I don't watch the Public Areas (Bugs, Support Requests, Patches,
+and Feature Requests) on SourceForge. Please join and write to the
+aufs-users ML.
+
+
+6. Acknowledgements
+----------------------------------------
+Thanks to everyone who has tried and is using aufs, and to whoever has
+reported a bug or given any feedback.
+
+Especially donators:
+Tomas Matejicek (slax.org) made a donation (much more than once).
+	Since Apr 2010, Tomas M (the author of Slax and Linux Live
+	scripts) is making "doubling" donations.
+	Unfortunately I cannot list all of the donators, but I really
+	appreciate it.
+	It ended Aug 2010, but the ordinary donation URL is still
+	available.
+Dai Itasaka made a donation (2007/8).
+Chuck Smith made a donation (2008/4, 10 and 12).
+Henk Schoneveld made a donation (2008/9).
+Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10).
+Francois Dupoux made a donation (2008/11).
+Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public
+	aufs2 GIT tree (2009/2).
+William Grant made a donation (2009/3).
+Patrick Lane made a donation (2009/4).
+The Mail Archive (mail-archive.com) made donations (2009/5).
+Nippy Networks (Ed Wildgoose) made a donation (2009/7).
+New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11).
+Pavel Pronskiy made a donation (2011/2).
+Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy
+	Networks (Ed Wildgoose) made a donation for hardware (2011/3).
+Max Lekomcev (DOM-TV project) made a donation (2011/7, 12, 2012/3, 6 and
+	11).
+Sam Liddicott made a donation (2011/9).
+Era Scarecrow made a donation (2013/4).
+Bor Ratajc made a donation (2013/4).
+Alessandro Gorreta made a donation (2013/4).
+POIRETTE Marc made a donation (2013/4).
+Alessandro Gorreta made a donation (2013/4).
+lauri kasvandik made a donation (2013/5).
+"pemasu from Finland" made a donation (2013/7).
+The Parted Magic Project made a donation (2013/9 and 11).
+Pavel Barta made a donation (2013/10).
+Nikolay Pertsev made a donation (2014/5).
+James B made a donation (2014/7 and 2015/7).
+Stefano Di Biase made a donation (2014/8).
+Daniel Epellei made a donation (2015/1).
+OmegaPhil made a donation (2016/1).
+Tomasz Szewczyk made a donation (2016/4).
+James Burry made a donation (2016/12).
+
+Thank you very much.
+Donations are always, including future donations, very important and
+helpful for me to keep on developing aufs.
+
+
+7.
+----------------------------------------
+If you are an experienced user, no explanation is needed. Aufs is
+just a linux filesystem.
+
+
+Enjoy!
+
+# Local variables: ;
+# mode: text;
+# End: ;
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/01intro.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/01intro.txt
@@ -0,0 +1,171 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Introduction
+----------------------------------------
+
+aufs [ei ju: ef es] | /ey-yoo-ef-es/ | [a u f s]
+1. abbrev. for "advanced multi-layered unification filesystem".
+2. abbrev. for "another unionfs".
+3. abbrev. for "auf das" in German, which means "on the" in English.
+   Ex. "Butter aufs Brot"(G) means "butter onto bread"(E).
+   But "Filesystem aufs Filesystem" is hard to understand.
+4. abbrev. for "African Urban Fashion Show".
+
+AUFS is a filesystem with the following features:
+- multi layered stackable unification filesystem; a member directory
+  is called a branch.
+- branch permissions and attributes: 'readonly', 'real-readonly',
+  'readwrite', 'whiteout-able', 'link-able whiteout', etc. and their
+  combinations.
+- internal "file copy-on-write".
+- logical deletion, whiteout.
+- dynamic branch manipulation, adding, deleting and changing permission.
+- allow bypassing aufs, user's direct branch access.
+- external inode number translation table and bitmap which maintain the
+  persistent aufs inode numbers.
+- seekable directory, including NFS readdir.
+- file mapping, mmap and sharing pages.
+- pseudo-link, hardlink over branches.
+- loopback mounted filesystem as a branch.
+- several policies to select one among multiple writable branches.
+- revert a single systemcall when an error occurs in aufs.
+- and more...
+
+
+Multi Layered Stackable Unification Filesystem
+----------------------------------------------------------------------
+Most people already know what it is.
+It is a filesystem which unifies several directories and provides a
+merged single directory. When users access a file, the access will be
+passed/re-directed/converted (sorry, I am not sure which English word is
+correct) to the real file on the member filesystem. The member
+filesystem is called the 'lower filesystem' or 'branch' and has a mode
+of 'readonly' or 'readwrite.' And the deletion of a file on a lower
+readonly branch is handled by creating a 'whiteout' on the upper
+writable branch.
+
+On LKML, there have been discussions about UnionMount (Jan Blunck,
+Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took
+different approaches to implement the merged-view.
+The former tries putting it into VFS, and the latter implements it as a
+separate filesystem.
+(If I misunderstand these implementations, please let me know and
+I shall correct it, because it is a long time since I last read their
+source files.)
+
+UnionMount's approach may be able to stay small, but it may be hard to
+share branches between several UnionMounts, since the whiteout in it is
+implemented in the inode on the branch filesystem and is always
+shared. According to Bharata's post, readdir does not seem to be
+finished yet.
+There are several missing features known in these implementations,
+such as:
+- for users, the inode number may change silently. eg. copy-up.
+- link(2) may break by copy-up.
+- read(2) may get obsoleted filedata (fstat(2) too).
+- fcntl(F_SETLK) may be broken by copy-up.
+- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after
+  open(O_RDWR).
+
+In linux-3.18, the "overlay" filesystem (formerly known as "overlayfs")
+was merged into mainline. This is another implementation of UnionMount
+as a separate filesystem. All the limitations and known problems of
+UnionMount are equally inherited by the "overlay" filesystem.
+
+Unionfs has a longer history. When I started implementing a stackable
+filesystem (Aug 2005), it already existed. It has virtual super_block,
+inode, dentry and file objects, and they have arrays pointing to lower
+objects of the same kind. After contributing many patches to Unionfs, I
+re-started my own project AUFS (Jun 2006).
+
+In AUFS, the structure of the filesystem resembles Unionfs's, but I
+implemented my own ideas, approaches and enhancements, and it became a
+totally different filesystem.
+
+Comparing a DM snapshot and an fs based implementation:
+- the number of bytes to be copied between devices is much smaller.
+- the type of filesystem must be one and only.
+- the fs must be writable; no readonly fs, even for the lower original
+  device. So a compression fs will not be usable. But if we use a
+  loopback mount, we may address this issue.
+  For instance,
+	mount /cdrom/squashfs.img /sq
+	losetup /sq/ext2.img
+	losetup /somewhere/cow
+	dmsetup "snapshot /dev/loop0 /dev/loop1 ..."
+- it will be difficult (or need more operations) to extract the
+  difference between the original device and the COW.
+- DM snapshot-merge may help a lot when users try merging. In the
+  fs-layer union, users will use rsync(1).
+
+You may want to read my old paper "Filesystems in LiveCD"
+(http://aufs.sourceforge.net/aufs2/report/sq/sq.pdf).
+
+
+Several characters/aspects/persona of aufs
+----------------------------------------------------------------------
+
+Aufs has several characters, aspects or persona.
+1. a filesystem, callee of VFS helpers
+2. sub-VFS, caller of VFS helpers for branches
+3. a virtual filesystem which maintains persistent inode numbers
+4. reader/writer of files on branches, just like an application
+
+1. Callee of VFS Helpers
+As an ordinary linux filesystem, aufs is a callee of VFS. For instance,
+unlink(2) from an application reaches the sys_unlink() kernel function
+and then vfs_unlink() is called. vfs_unlink() is one of the VFS helpers
+and it calls the filesystem specific unlink operation. Actually aufs
+implements the unlink operation, but it behaves like a redirector.
+
+2. Caller of VFS Helpers for Branches
+aufs_unlink() passes the unlink request to the branch filesystem as if
+it were called from VFS. So the called unlink operation of the branch
+filesystem acts as usual. As a caller of VFS helpers, aufs should handle
+every necessary pre/post operation for the branch filesystem.
+- acquire the lock for the parent dir on a branch
+- lookup in a branch
+- revalidate dentry on a branch
+- mnt_want_write() for a branch
+- vfs_unlink() for a branch
+- mnt_drop_write() for a branch
+- release the lock on a branch
+
+3. Persistent Inode Number
+One of the most important issues for a filesystem is to maintain inode
+numbers. This is particularly important to support exporting a
+filesystem via NFS. Aufs is a virtual filesystem which doesn't have a
+backend block device of its own. But some storage is necessary to
+keep and maintain the inode numbers. It may be a large space and may
+not be suitable to keep in memory. Aufs rents some space from its first
+writable branch filesystem (by default) and creates file(s) on it. These
+files are created by aufs internally and removed soon (currently), while
+kept open.
+Note: Because these files are removed, they are totally gone after
+      unmounting aufs. It means the inode numbers are not persistent
+      across unmount or reboot. I have a plan to make them really
+      persistent, which will be important for aufs on an NFS server.
+
+4. Read/Write Files Internally (copy-on-write)
+Because a branch can be readonly, when you write to a file on it, aufs
+will "copy-up" it to the upper writable branch internally, and then
+write the originally requested thing to the file. Generally the kernel
+doesn't open/read/write a file actively. In aufs, even a single write
+may cause an internal "file copy". This behaviour is very similar to the
+cp(1) command.
+
+Some people may think it is better to pass such work to a user space
+helper, instead of doing it in kernel space. Actually I am still
+thinking about it. But currently I have implemented it in kernel space.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/02struct.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/02struct.txt
@@ -0,0 +1,258 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Basic Aufs Internal Structure
+
+Superblock/Inode/Dentry/File Objects
+----------------------------------------------------------------------
+Like an ordinary filesystem, aufs has its own
+superblock/inode/dentry/file objects. All these objects have a
+dynamically allocated array and store the same kind of pointers to the
+lower filesystem, the branch.
+For example, when you build a union with one readwrite branch and one
+readonly branch, mounted on /au, /rw and /ro respectively:
+- /au = /rw + /ro
+- /ro/fileA exists, but not /rw/fileA
+
+The aufs lookup operation finds /ro/fileA and gets a dentry for it.
+These pointers are stored in an aufs dentry. The array in the aufs
+dentry will be:
+- [0] = NULL (because /rw/fileA doesn't exist)
+- [1] = /ro/fileA
+
+This style of array is essentially the same for the aufs
+superblock/inode/dentry/file objects.
+
+Because aufs supports manipulating branches, i.e. add/delete/change
+branches dynamically, these objects have their own generation. When
+branches are changed, the generation in the aufs superblock is
+incremented. And the generation in every other object is compared when
+it is accessed. When the generation in an object is obsoleted, aufs
+refreshes the internal array.
+
+
+Superblock
+----------------------------------------------------------------------
+Additionally the aufs superblock has some data for the policies to
+select one among multiple writable branches, the XIB files, pseudo-links
+and a kobject.
+See below for details.
+About the policies which support copying-down a directory, see
+wbr_policy.txt too.
+
+
+Branch and XINO (External Inode Number Translation Table)
+----------------------------------------------------------------------
+Every branch has its own xino (external inode number translation table)
+file. The xino file is created and unlinked by aufs internally. When two
+members of a union exist on the same filesystem, they share the single
+xino file.
+The structure of a xino file is simple, just a sequence of aufs inode
+numbers which is indexed by the lower inode number.
+In the above sample, assume the inode number of /ro/fileA is i111 and
+aufs assigns the inode number i999 for fileA. Then aufs writes 999 as
+4(8) bytes at the 111 * 4(8) bytes offset in the xino file.
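+
+As a worked example of that offset arithmetic (purely illustrative:
+the xino file is created and unlinked internally, so it cannot actually
+be inspected like this), reading the slot for lower inode 111 with
+4-byte entries would be:
+
+	$ dd if=xino bs=4 skip=111 count=1 2>/dev/null | od -An -tu4
+	        999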
+
+When the inode numbers are not contiguous, the xino file will be sparse,
+having holes in it, and doesn't consume as much disk space as it might
+appear. If your branch filesystem consumes disk space for such holes,
+then you should specify the 'xino=' option at mounting aufs.
+
+Aufs has a mount option to free the disk blocks for such holes in XINO
+files on tmpfs or a ramdisk. But it is not so effective actually. If you
+meet a problem of disk shortage due to XINO files, then you should try
+"tmpfs-ino.patch" (and "vfs-ino.patch" too) in aufs4-standalone.git.
+The patch localizes the assignment of inumbers per tmpfs-mount and
+avoids the holes in XINO files.
+
+Also a writable branch has three kinds of "whiteout bases". All these
+exist when the branch is joined to aufs, and their names are
+whiteout-ed doubly, so that users will never see their names in the
+aufs hierarchy.
+1. a regular file which will be hardlinked to all whiteouts.
+2. a directory to store pseudo-links.
+3. a directory to store "orphan"-ed files temporarily.
+
+1. Whiteout Base
+   When you remove a file on a readonly branch, aufs handles it as a
+   logical deletion and creates a whiteout on the upper writable branch
+   as a hardlink of this file, in order not to consume an inode on the
+   writable branch.
+2. Pseudo-link Dir
+   See below, Pseudo-link.
+3. Step-Parent Dir
+   When "fileC" exists on the lower readonly branch only and it is
+   opened and removed with its parent dir, and then the user writes
+   something into it, then aufs copies-up fileC to this directory,
+   because there is no other dir to store fileC. After creating a
+   file under this dir, the file is unlinked.
+
+Because aufs supports manipulating branches, i.e. add/delete/change
+dynamically, a branch has its own id. When the branch order changes,
+aufs finds the new index by searching the branch id.
+
+
+Pseudo-link
+----------------------------------------------------------------------
+Assume "fileA" exists on the lower readonly branch only and it is
+hardlinked to "fileB" on the branch. When you write something to fileA,
+aufs copies it up to the upper writable branch. Additionally aufs
+creates a hardlink under the Pseudo-link Directory of the writable
+branch. The inode of a pseudo-link is kept in the aufs super_block as a
+simple list. If fileB is read after unlinking fileA, aufs returns
+filedata from the pseudo-link instead of the lower readonly
+branch. Because the pseudo-link is based upon the inode, keeping the
+inode number by xino (see above) is essentially necessary.
+
+All the hardlinks under the Pseudo-link Directory of the writable branch
+should be restored to a proper location later. Aufs provides a utility
+to do this. The userspace helpers are executed at remounting and
+unmounting aufs by default.
+While this utility is running, it puts aufs into the pseudo-link
+maintenance mode. In this mode, only the process which began the
+maintenance mode (and its child processes) is allowed to operate in
+aufs. Some other processes which are not related to the pseudo-link will
+be allowed to run too, but the rest have to return an error or wait
+until the maintenance mode ends. If a process already acquires an inode
+mutex (in VFS), it has to return an error.
+
+
+XIB (external inode number bitmap)
+----------------------------------------------------------------------
+In addition to the xino file per branch, aufs has an external inode
+number bitmap in a superblock object. It is also an internal file, just
+like a xino file.
+It is a simple bitmap to mark whether an aufs inode number is in use or
+not.
+To reduce the file I/O, aufs prepares a single memory page to cache the
+xib.
+
+As well as XINO files, aufs has a feature to truncate/refresh the XIB to
+reduce the number of consumed disk blocks for these files.
+
+
+Virtual or Vertical Dir, and Readdir in Userspace
+----------------------------------------------------------------------
+In order to support multiple layers (branches), the aufs readdir
+operation constructs a virtual dir block in memory. For readdir, aufs
+calls vfs_readdir() internally for each dir on the branches, merges
+their entries while eliminating the whiteout-ed ones, and sets it to the
+file (dir) object. So the file object has its entry list until it is
+closed. The entry list will be updated when the file position is zero
+and has become obsoleted. This decision is made in aufs automatically.
+
+The dynamically allocated memory block for the names of entries has a
+unit of 512 bytes (by default) and stores the names contiguously (no
+padding). Another block for each entry is handled by kmem_cache too.
+During building dir blocks, aufs creates a hash list and judges whether
+the entry is whiteouted by its upper branch or is already listed.
+The merged result is cached in the corresponding inode object and
+maintained by a customizable life-time option.
+
+Some people may say it can be a security hole or invite DoS attacks,
+since the opened and once readdir-ed dir (file object) holds its entry
+list and becomes a pressure on system memory. But I'd say it is similar
+to files under /proc or /sys. The virtual files in them also hold a
+memory page (generally) while they are opened. When an idea to reduce
+memory for them is introduced, it will be applied to aufs too.
+For those who really hate this situation, I've developed a readdir(3)
+library which operates this merging in userspace. You just need to set
+the LD_PRELOAD environment variable, and aufs will not consume memory in
+kernel space for readdir(3).
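+
+For instance (a sketch; the library name and path are assumptions, see
+the aufs-util README and its "make install_ulib" target for the real
+ones):
+
+	$ LD_PRELOAD=/usr/lib/libau.so ls /your/aufs/mountpoint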
+
+
+Whiteout
+----------------------------------------------------------------------
+The whiteout in aufs is very similar to Unionfs's. It is represented by
+its filename. UnionMount takes the approach of using a file mode, but I
+am afraid several utilities (find(1) or something) would have to
+support it.
+
+Basically a whiteout represents "logical deletion", which stops aufs
+from looking up further, but it can also represent "this dir is
+opaque", which likewise stops further lookup.
+
+In aufs, rmdir(2) and rename(2) for a dir make use of whiteouts as
+well.
+In order to make the several steps within a single systemcall
+revertible, aufs adopts an approach of renaming a directory to a
+temporary unique whiteouted name.
+For example, in rename(2) of a dir where the target dir already exists,
+aufs renames the target dir to a temporary unique whiteouted name
+before the actual rename on a branch, and then handles the other
+actions (make it opaque, update the attributes, etc). If an error
+happens in these actions, aufs simply renames the whiteouted name back
+and returns the error. If all of them succeed, aufs registers a
+function which removes the whiteouted unique temporary name completely
+and asynchronously to the system global workqueue.
+
+
+Copy-up
+----------------------------------------------------------------------
+It is a well-known feature or concept.
+When a user modifies a file on a readonly branch, aufs performs
+"copy-up" internally and makes the change to the new file on the upper
+writable branch.
+When the triggering systemcall does not update the timestamps of the
+parent dir, aufs reverts them after the copy-up.
+
+
+Move-down (aufs3.9 and later)
+----------------------------------------------------------------------
+"Copy-up" is one of the essential features in aufs. It copies a file
+from the lower readonly branch to the upper writable branch when a user
+changes something about the file.
+"Move-down" is the opposite action of copy-up. Basically this action is
+run manually instead of automatically and internally.
+For the design and implementation, aufs has to consider these issues.
+- a whiteout for the file may exist on the lower branch.
+- ancestor directories may not exist on the lower branch.
+- diropq for the ancestor directories may exist on the upper branch.
+- the free space on the lower branch will be reduced.
+- another access to the file may happen during the move-down, including
+  UDBA (see "Revalidate Dentry and UDBA").
+- the file should be neither hard-linked nor pseudo-linked. those cases
+  should be handled by the auplink utility later.
+
+Sometimes users want to move a file down from the upper writable branch
+to the lower readonly or writable branch. For instance,
+- the free space of the upper writable branch is going to run out.
+- creating a new intermediate branch between the upper and lower
+  branches.
+- etc.
+
+For this purpose, use the "aumvdown" command in aufs-util.git.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/03atomic_open.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/03atomic_open.txt
@@ -0,0 +1,85 @@
+
+# Copyright (C) 2015-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Support for a branch that has its own ->atomic_open()
+----------------------------------------------------------------------
+The filesystems which implement their own ->atomic_open() are not the
+majority. For example NFSv4 does, and aufs should call NFSv4's
+->atomic_open(), particularly for the open(O_CREAT|O_EXCL, 0400)
+case. If called through anything other than ->atomic_open(), NFSv4
+returns an error for this open(2). While I am not sure whether all
+filesystems which have ->atomic_open() behave like this, NFSv4 surely
+returns the error.
+
+In order to support ->atomic_open() in aufs, there are a few
+approaches.
+
+A. Introduce aufs_atomic_open()
+   - calls one of VFS:do_last(), lookup_open() or atomic_open() for the
+     branch fs.
+B. Introduce aufs_atomic_open() calling create, open and chmod. this is
+   aufs user Pip Cet's approach.
+   - calls aufs_create(), VFS finish_open() and notify_change().
+   - passes a fake mode to finish_open(), and then corrects the mode by
+     notify_change().
+C. Extend aufs_open() to call the branch fs's ->atomic_open()
+   - no aufs_atomic_open().
+   - aufs_lookup() registers the TID to an aufs internal object.
+   - aufs_create() does nothing when the matching TID is registered, but
+     registers the mode.
+   - aufs_open() calls the branch fs's ->atomic_open() when the matching
+     TID is registered.
+D. Extend aufs_open() to re-try the branch fs's ->open() with the
+   superuser's credential
+   - no aufs_atomic_open().
+   - aufs_create() registers the TID to an internal object. this info
+     represents "this process created this file just now."
+   - when aufs gets EACCES from the branch fs's ->open(), it confirms
+     the registered TID and re-tries open() with the superuser's
+     credential.
+
+Pros and cons of each approach.
+
+A.
+   - straightforward but highly depends upon VFS internals.
+   - the atomic behaviour is kept.
+   - some parameters such as nameidata are hard to reproduce for the
+     branch fs.
+   - large overhead.
+B.
+   - easy to implement.
+   - the atomic behaviour is lost.
+C.
+   - the atomic behaviour is kept.
+   - dirty and tricky.
+   - VFS checks whether the file is created correctly after calling
+     ->create(), which means this approach doesn't work.
+D.
+   - easy to implement.
+   - the atomic behaviour is lost.
+   - opening a file with the superuser's credential and giving it to a
+     user process is a bad idea, since the file object keeps the
+     credential in it. It may affect LSM or something. This approach
+     doesn't work either.
+
+Approach A is ideal, but it is hard to implement. So here is a
+variation of A, which is to be implemented (sketched below).
+
+A-1. Introduce aufs_atomic_open()
+   - calls the branch fs's ->atomic_open() if it exists, otherwise
+     calls vfs_create() and finish_open().
+   - the demerit is that the several checks after the branch fs's
+     ->atomic_open() are lost. in the ordinary case, the checks are
+     done by VFS:do_last(), lookup_open() and atomic_open(). some can
+     be implemented in aufs, but not all I am afraid.
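+
+The following condensed sketch shows the shape of A-1 under the Linux
+4.13 prototypes. It is not the real aufs_atomic_open(): branch
+selection, copy-up, locking and the "lost checks" mentioned above are
+all omitted, and the au_h_*() helpers are hypothetical accessors for
+the corresponding objects on the branch.
+
+#include <linux/fs.h>
+#include <linux/namei.h>
+
+static int aufs_atomic_open(struct inode *dir, struct dentry *dentry,
+			    struct file *file, unsigned int open_flag,
+			    umode_t create_mode, int *opened)
+{
+	int err;
+	struct inode *h_dir = au_h_inode(dir);		/* on the branch */
+	struct dentry *h_dentry = au_h_dentry(dentry);	/* hypothetical */
+
+	if (h_dir->i_op->atomic_open)
+		/* let the branch fs (eg. NFSv4) do the atomic work */
+		return h_dir->i_op->atomic_open(h_dir, h_dentry, file,
+						open_flag, create_mode,
+						opened);
+
+	/* fallback: non-atomic create + open */
+	if (open_flag & O_CREAT) {
+		err = vfs_create(h_dir, h_dentry, create_mode,
+				 open_flag & O_EXCL);
+		if (err)
+			return err;
+		*opened |= FILE_CREATED;
+	}
+	return finish_open(file, dentry, NULL, opened);
+}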
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/03lookup.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/03lookup.txt
@@ -0,0 +1,113 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Lookup in a Branch
+----------------------------------------------------------------------
+Since aufs has the character of a sub-VFS (see Introduction), it
+performs lookup on branches as VFS does. It may be heavy work. But
+almost all lookup operations in aufs are the simplest case, ie. looking
+up only an entry directly connected to its parent. Digging down the
+directory hierarchy is unnecessary. VFS has a function lookup_one_len()
+for that use, and aufs calls it.
+
+When a branch is a remote filesystem, aufs basically relies upon its
+->d_revalidate(), and also forces the hardest revalidate tests for such
+branches.
+For d_revalidate, aufs implements three levels of revalidate tests. See
+"Revalidate Dentry and UDBA" for details.
+
+
+Test Only the Highest One for the Directory Permission (dirperm1 option)
+----------------------------------------------------------------------
+Let's try a case study.
+- aufs has two branches, upper readwrite and lower readonly.
+  /au = /rw + /ro
+- "dirA" exists under /ro but not under /rw, and its mode is 0700.
+- a user invokes "chmod a+rx /au/dirA"
+- the internal copy-up is activated, "/rw/dirA" is created, and its
+  permission bits are set to world readable.
+- now, does "/au/dirA" become world readable?
+
+In this case, /ro/dirA is still 0700 since it exists on the readonly
+branch, or that branch may even be a natively readonly filesystem. If
+aufs respects the lower branch, it should not respond to readdir
+requests from other users. But the user allowed it by chmod. Should
+aufs really refuse to show the entries under /ro/dirA?
+
+To be honest, I don't have a good solution for this case. So aufs
+implements the 'dirperm1' and 'nodirperm1' mount options, and leaves it
+to users.
+When dirperm1 is specified, aufs checks only the highest dir for the
+directory permission and shows the entries. Otherwise, as usual, it
+checks every dir existing on all branches and rejects the request if
+any of them does (see the sketch below).
+
+As a side effect, the dirperm1 option improves the performance of aufs
+because the number of permission checks is reduced when the number of
+branches is large.
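+
+A sketch of the two strategies described above; au_nbranches() and
+au_h_inode_on() are hypothetical helpers, and the real aufs code
+differs. inode_permission() is the ordinary in-kernel permission check.
+
+#include <linux/fs.h>
+
+static int au_dir_permission(struct inode *dir, int mask, bool dirperm1)
+{
+	int err, bindex;
+	struct inode *h_dir;
+
+	/* branches are indexed from the top (0) downwards */
+	for (bindex = 0; bindex < au_nbranches(dir->i_sb); bindex++) {
+		h_dir = au_h_inode_on(dir, bindex); /* NULL if absent */
+		if (!h_dir)
+			continue;
+		err = inode_permission(h_dir, mask);
+		if (err || dirperm1)
+			/* dirperm1: the topmost existing dir decides */
+			return err;
+	}
+	return 0;
+}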
+
+
+Revalidate Dentry and UDBA (User's Direct Branch Access)
+----------------------------------------------------------------------
+Generally VFS helpers re-validate a dentry as a part of lookup.
+0. digging down the directory hierarchy.
+1. lock the parent dir by its i_mutex.
+2. lookup the final (child) entry.
+3. revalidate it.
+4. call the actual operation (create, unlink, etc.)
+5. unlock the parent dir
+
+If the filesystem implements its own ->d_revalidate() (step 3), then it
+is called. Aufs actually implements it and checks whether the dentry on
+a branch is still valid.
+But that is not enough, because aufs has to release the lock for the
+parent dir on a branch at the end of ->lookup() (step 2) and
+->d_revalidate() (step 3) while the i_mutex of the aufs dir is still
+held by VFS.
+If a file on a branch is changed directly, eg. bypassing aufs, after
+aufs has released the lock, then the subsequent operation may cause an
+unpleasant result.
+
+This situation is a result of the VFS architecture: ->lookup() and
+->d_revalidate() are separated. But I never say it is wrong. It is a
+good design from VFS's point of view. It is just not suitable for the
+sub-VFS character of aufs.
+
+Aufs supports such cases by three levels of revalidation, selectable by
+the user.
+1. Simple Revalidate
+   In addition to the native flow in VFS, confirm the child-parent
+   relationship on the branch just after locking the parent dir on the
+   branch in the "actual operation" (step 4). When this validation
+   fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still
+   checks the validity of the dentry on the branches.
+2. Monitor Changes Internally by Inotify/Fsnotify
+   In addition to the above, in the "actual operation" (step 4) aufs
+   re-looks-up the dentry on the branch, and returns EBUSY if it finds
+   a different dentry.
+   Additionally, aufs sets an inotify/fsnotify watch for every dir on
+   the branches while it is cached. When an event is notified, aufs
+   registers a function to the kernel 'events' thread by
+   schedule_work(), and that function sets a special status in the
+   cached aufs dentry and inode private data. If they are not cached,
+   then aufs has nothing to do. When the same file is accessed through
+   aufs (steps 0-3) later, aufs detects the status and refreshes all
+   the necessary data.
+   In this mode, aufs has to ignore events which are fired by aufs
+   itself.
+3. No Extra Validation
+   This is the simplest level and doesn't add any additional
+   revalidation test; it skips the revalidation in step 4. It is useful
+   and improves aufs performance when the system surely hides the aufs
+   branches from users, by over-mounting something (or another method).
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/04branch.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/04branch.txt
@@ -0,0 +1,74 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Branch Manipulation
+
+Since aufs supports dynamic branch manipulation, ie. adding/removing a
+branch and changing its permission/attribute, there is a lot of work to
+do.
+
+
+Add a Branch
+----------------------------------------------------------------------
+o Confirm that the dir being added exists outside of aufs, including
+  any loopback mount, and check its various attributes.
+o Initialize the xino file and whiteout bases if necessary.
+  See struct.txt.
+
+o Check the owner/group/mode of the directory
+  When the owner/group/mode of the added directory differs from those
+  of the existing branches, aufs issues a warning because it may impose
+  a security risk.
+  For example, when an upper writable branch has a world-writable empty
+  top directory, a malicious user can create any file on the writable
+  branch directly, as if performing a copy-up and modifying it
+  manually. If something like /etc/{passwd,shadow} exists on the lower
+  readonly branch but not on the upper writable branch, and the
+  writable branch is world-writable, then a malicious guy may create
+  /etc/passwd on the writable branch directly and the infected file
+  will be valid in aufs.
+  I am afraid it can be a security issue, but aufs can do nothing
+  except produce a warning.
+
+
+Delete a Branch
+----------------------------------------------------------------------
+o Confirm that the branch being deleted is not busy
+  Generally, one merit of adopting the "remount" interface for
+  manipulating branches is that it discards caches. When deleting a
+  branch, aufs checks the still cached (and connected) dentries and
+  inodes. If there are any, then they are all in use. An inode without
+  its corresponding dentry can be alive on its own (for example, in the
+  inotify/fsnotify case).
+
+  For each cached entry, aufs checks whether an entry with the same
+  name exists on the other branches.
+  If the cached entry is a directory, then because aufs provides a
+  merged view to users, as long as one dir is left on any branch aufs
+  can show the dir to users. In this case, the branch can be removed
+  from aufs. Otherwise aufs rejects deleting the branch.
+
+  If any file on the branch being deleted is opened by aufs, then aufs
+  rejects the deletion.
+
+
+Modify the Permission of a Branch
+----------------------------------------------------------------------
+o Re-initialize or remove the xino file and whiteout bases if necessary.
+  See struct.txt.
+
+o rw --> ro: Confirm the branch being modified is not busy
+  Aufs rejects the request if any of these conditions is true.
+  - a file on the branch is mmap-ed.
+  - a regular file on the branch is opened for write and there is no
+    entry with the same name on the upper branch.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/05wbr_policy.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/05wbr_policy.txt
@@ -0,0 +1,64 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Policies to Select One among Multiple Writable Branches
+----------------------------------------------------------------------
+When there is more than one writable branch, aufs has to decide on the
+target branch for file creation or copy-up. By default, the highest
+writable branch which has the parent (or an ancestor) dir of the target
+file is chosen (the top-down-parent policy).
+By user's request, aufs implements some other policies to select the
+writable branch: for file creation, round-robin, most-free-space and
+other policies; for copy-up, top-down-parent, bottom-up-parent,
+bottom-up and others.
+
+As expected, the round-robin policy selects the branches in a circular
+manner (see the sketch below). When you have two writable branches and
+create 10 new files, 5 files will be created on each branch. The
+mkdir(2) systemcall is an exception: when you create 10 new
+directories, all will be created on the same branch.
+The most-free-space policy selects the branch which has the most free
+space among the writable branches. The amount of free space is checked
+by aufs internally, and users can specify the refresh interval.
+
+The policies for copy-up are simpler:
+top-down-parent is equivalent to the policy of the same name for
+creation,
+bottom-up-parent selects the nearest upper writable branch from the
+copy-up source where the parent dir exists,
+bottom-up selects the nearest upper writable branch from the copy-up
+source, regardless of the existence of the parent dir.
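+
+A sketch of round-robin branch selection, with hypothetical names
+(au_sbinfo, si_wbr_next, au_br_writable); this is not aufs's actual
+code. An atomic counter is advanced on every creation, and directories
+are exempt so that a dir and its children tend to share a branch.
+
+#include <linux/atomic.h>
+#include <linux/errno.h>
+
+struct au_sbinfo {
+	atomic_t	si_wbr_next;	/* round-robin cursor */
+	int		si_nbranch;
+};
+
+static int au_wbr_rr(struct au_sbinfo *sbinfo, bool isdir)
+{
+	int i, idx;
+	unsigned int next;
+
+	/* mkdir(2) keeps the current cursor; see the text above */
+	next = isdir ? atomic_read(&sbinfo->si_wbr_next)
+		     : atomic_inc_return(&sbinfo->si_wbr_next);
+	for (i = 0; i < sbinfo->si_nbranch; i++) {
+		idx = (next + i) % sbinfo->si_nbranch;
+		if (au_br_writable(idx))	/* hypothetical test */
+			return idx;
+	}
+	return -EROFS;	/* no writable branch */
+}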
+
+There are some rules or exceptions in applying these policies.
+- If there is a readonly branch above the policy-selected branch and
+  the parent dir is marked as opaque (a variation of whiteout), or the
+  target (to-be-created) file is whiteouted on the upper readonly
+  branch, then the result of the policy is ignored and the target file
+  will be created on the nearest upper writable branch above the
+  readonly branch.
+- If there is a writable branch above the policy-selected branch and
+  the parent dir is marked as opaque or the target file is whiteouted
+  on that branch, then the result of the policy is ignored and the
+  target file will be created on the highest one among the upper
+  writable branches which have a diropq or whiteout. In the whiteout
+  case, aufs removes it as usual.
+- The link(2) and rename(2) systemcalls are exceptions in every policy.
+  They try to select the branch where the source exists whenever
+  possible, since copying up a large file takes a long time. If that is
+  impossible, ie. the branch where the source exists is readonly, then
+  they will follow the copy-up policy.
+- There is an exception for rename(2) when the target exists.
+  If the rename target exists, aufs compares the indices of the
+  branches where the source and the target exist and selects the higher
+  one. If the selected branch is readonly, then aufs follows the
+  copy-up policy.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/06fhsm.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/06fhsm.txt
@@ -0,0 +1,120 @@
+
+# Copyright (C) 2011-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+
+File-based Hierarchical Storage Management (FHSM)
+----------------------------------------------------------------------
+Hierarchical Storage Management (or HSM) is a well-known feature in the
+storage world. Aufs provides this feature file-based, across multiple
+writable branches, based upon the principle of "the Colder, the Lower".
+Here the word "colder" refers to the less used files, and "lower" to
+the position in the vertical order of the stacked branches.
+These multiple writable branches are prioritized, ie. the topmost one
+should be the fastest drive and be used heavily.
+
+o Characters in the aufs FHSM story
+- aufs itself and a new branch attribute.
+- a new ioctl interface to move-down and to establish a connection with
+  the daemon ("move-down" is the converse of "copy-up").
+- a userspace tool and daemon.
+
+The userspace daemon establishes a connection with aufs and waits for
+notifications. The notified information is very similar to struct
+statfs, containing the numbers of consumed blocks and inodes.
+When the consumed blocks/inodes of a branch exceed the user-specified
+upper watermark, the daemon activates its move-down process until the
+consumed blocks/inodes reach the user-specified lower watermark, as the
+sketch below illustrates.
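+
+A daemon-side sketch of this watermark logic, using plain statfs(2).
+The watermark values and move_down_one() are hypothetical, and the real
+daemon reads the notifications from aufs instead of polling.
+
+#include <sys/vfs.h>
+
+extern int move_down_one(const char *br_path);	/* hypothetical */
+
+/* returns 1 if the consumed-block ratio of @br_path exceeds @mark */
+static int over_watermark(const char *br_path, double mark)
+{
+	struct statfs st;
+
+	if (statfs(br_path, &st))
+		return -1;
+	return (double)(st.f_blocks - st.f_bfree) / st.f_blocks > mark;
+}
+
+static void fhsm_check(const char *br_path, double upper, double lower)
+{
+	if (over_watermark(br_path, upper) <= 0)
+		return;
+	/* move files down until consumption drops below @lower */
+	while (over_watermark(br_path, lower) > 0)
+		if (move_down_one(br_path))
+			break;
+}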
+
+The actual move-down is done by aufs upon a request from user-space,
+since we need to maintain the inode numbers and the internal pointer
+arrays in aufs.
+
+Currently aufs FHSM handles regular files only. Additionally they must
+be neither hard-linked nor pseudo-linked.
+
+
+o Cowork of aufs and the user-space daemon
+  While the userspace daemon keeps the connection established, aufs
+  sends it a small notification whenever aufs writes something into a
+  writable branch. But this may cost a lot, since aufs issues statfs(2)
+  internally. So the user can specify a new option to cache the
+  info. Actually the notification is controlled by these factors.
+  + the specified cache time.
+  + classified as "force" by aufs internally.
+  Until the specified time expires, aufs doesn't send the info except
+  in the forced cases. When aufs decides to force, the info is always
+  notified to userspace.
+  For example, the number of free inodes is generally large enough and
+  a shortage of them happens rarely. So aufs doesn't force the
+  notification when creating a new file, directory or the like. This is
+  the typical case in which aufs doesn't force.
+  When aufs writes the actual file data and the file consumes any new
+  blocks, aufs forces the notification.
+
+
+o Interfaces in aufs
+- New branch attribute.
+  + fhsm
+    Specifies that the branch is managed by the FHSM feature, in other
+    words, a participant in FHSM.
+    When nofhsm is set on the branch, it will not be the source/target
+    branch of a move-down operation. This attribute is set
+    independently of the coo and moo attributes, and if you want full
+    FHSM, you should specify them as well.
+- New mount option.
+  + fhsm_sec
+    Specifies the number of seconds for which many less important
+    notifications are suppressed.
+- New ioctl.
+  + AUFS_CTL_FHSM_FD
+    creates a new file descriptor from which userspace can read the
+    notifications (a subset of struct statfs) from aufs.
+- Module parameter 'brs'
+  It has to be set to 1. Otherwise the new mount option 'fhsm' will not
+  be settable.
+- mount helpers /sbin/mount.aufs and /sbin/umount.aufs
+  When there are two or more branches with the fhsm attribute,
+  /sbin/mount.aufs invokes the user-space daemon and /sbin/umount.aufs
+  terminates it. As a result of remounting and branch manipulation, the
+  number of branches with the fhsm attribute can drop to one. In this
+  case, /sbin/mount.aufs will terminate the user-space daemon.
+
+
+Finally the operation is done in kernel-space in these steps.
+- make sure that,
+  + no one else is using the file.
+  + the file is not hard-linked.
+  + the file is not pseudo-linked.
+  + the file is a regular file.
+  + the parent dir is not opaqued.
+- find the target writable branch.
+- make sure the file is not whiteouted by an upper (relative to the
+  target) branch.
+- make the parent dir on the target branch.
+- mutex lock the inode on the branch.
+- unlink the whiteout on the target branch (if it exists).
+- lookup and create the whiteouted temporary name on the target branch.
+- copy the file under the whiteouted temporary name on the target
+  branch.
+- rename the whiteouted temporary name to the original name.
+- unlink the file on the source branch.
+- maintain the internal pointer array and the external inode number
+  table (XINO).
+- maintain the timestamps and other attributes of the parent dir and
+  the file.
+
+And of course, at every step an error may happen. So the operation
+should restore the original file state after an error happens.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/06mmap.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/06mmap.txt
@@ -0,0 +1,72 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+mmap(2) -- File Memory Mapping
+----------------------------------------------------------------------
+In aufs, file-mapped pages are handled by the branch fs directly, with
+no interaction with aufs. That means aufs_mmap() calls the branch fs's
+->mmap().
+This approach is simple and good, but there is one problem.
+Under /proc, several entries show the mmapped files by path (with
+device and inode number), and the printed path will be the path on the
+branch fs instead of on the virtual aufs.
+This is not a problem in most cases, but some utilities such as lsof(1)
+(and their users) may expect the path on aufs.
+
+To address this issue, aufs adds a new member called vm_prfile to
+struct vm_area_struct (and struct vm_region). The original vm_file
+points to the file on the branch fs in order to handle everything
+correctly as usual. The new vm_prfile points to the virtual file in
+aufs, and the show-functions in procfs refer to vm_prfile if it is set.
+We also need to maintain several other places which touch vm_file,
+for example (see the sketch below)
+- fork()/clone() copies the vma, and the reference count of vm_file is
+  incremented.
+- merging vmas maintains the ref count too.
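+
+As a sketch, the extra bookkeeping in the vma-copy path of fork() could
+look like the fragment below. vm_prfile is the member added by the aufs
+patch (it does not exist in a stock kernel), and this is illustrative
+rather than the actual patch hunk.
+
+#include <linux/fs.h>
+#include <linux/mm_types.h>
+
+static void au_dup_vma_prfile(struct vm_area_struct *vma)
+{
+	/* the stock kernel already does get_file(vma->vm_file) here */
+	if (vma->vm_prfile)
+		get_file(vma->vm_prfile);	/* the aufs addition */
+}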
+
+This is not a good approach. It just fakes the printed path. But it
+leaves all behaviour around f_mapping unchanged. This is surely an
+advantage.
+Actually aufs had previously adopted another, complicated approach
+which calls generic_file_mmap() and handles struct
+vm_operations_struct. In that approach, aufs met a hard problem and I
+could not solve it without switching approaches.
+
+There may be yet another approach, which is
+- bind-mount the branch-root onto the aufs-root internally
+- grab the new vfsmount (ie. struct mount)
+- lazy-umount the branch-root internally
+- in open(2) of an aufs file, open the branch file with the hidden
+  vfsmount (instead of the original branch's vfsmount)
+- ideally this "bind-mount and lazy-umount" should be done atomically,
+  but it may be possible from userspace by the mount helper.
+
+By adding the internal hidden vfsmount and using it when opening a
+file, the file path under /proc would be printed correctly. This
+approach looks smarter, but is not possible I am afraid.
+- the aufs-root may be bind-mounted later. when that happens, another
+  hidden vfsmount will be required.
+- it is hard to get the chance to bind-mount and lazy-umount
+  + in kernel-space, an FS can get the vfsmount in open(2) via
+    file->f_path, so aufs can know its vfsmount. But several locks are
+    already acquired, and if aufs tries to bind-mount and lazy-umount
+    here, then it may cause a deadlock.
+  + in user-space, bind-mount doesn't invoke the mount helper.
+- since /proc shows dev and ino, aufs would have to give the vma this
+  info. it means a new member vm_prinode would be necessary. this is
+  essentially equivalent to vm_prfile described above.
+
+I had to give up this "looks-smarter" approach.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/06xattr.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/06xattr.txt
@@ -0,0 +1,96 @@
+
+# Copyright (C) 2014-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program; if not, write to the Free Software
+# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
+
+
+Listing XATTR/EA and getting the value
+----------------------------------------------------------------------
+For the standard inode attributes (owner, group, timestamps, etc.),
+aufs shows the values from the topmost existing file. This behaviour is
+good for the non-dir entries, since the behaviour exactly matches the
+shown information. But for directories, aufs considers all the
+same-named entries on the lower branches as well. This means that if
+one of the lower entries rejects a readdir call, then aufs returns an
+error even if the topmost entry allows it. This behaviour is necessary
+to respect the branch fs's security, but can confuse users since the
+user-visible standard attributes don't match the behaviour.
+To address this issue, aufs has a mount option called dirperm1 which
+checks the permission of the topmost entry only, and ignores the lower
+entries' permissions.
+
+A similar issue can happen around XATTR.
+The getxattr(2) and listxattr(2) families behave as if the dirperm1
+option were always set. Otherwise these very unpleasant situations
+would happen.
+- listxattr(2) might return duplicated entries.
+- users might never be able to remove or reset an XATTR.
+
+
+XATTR/EA support in the internal (copy,move)-(up,down)
+----------------------------------------------------------------------
+Generally the extended attributes of an inode are categorized as
+follows.
+- "security" for LSM and capabilities.
+- "system" for POSIX ACL; the 'acl' mount option is generally required
+  for the branch fs.
+- "trusted" for userspace; CAP_SYS_ADMIN is required.
+- "user" for userspace; the 'user_xattr' mount option is generally
+  required for the branch fs.
+
+Moreover there are some other categories. Aufs handles these rather
+unpopular categories like the ordinary ones, ie. there is no special
+condition nor exception.
+
+In a copy-up, the support for XATTR on the dst branch may differ from
+the src branch. In this case, the copy-up operation will get an error
+and the original user operation which triggered the copy-up will
+fail. It can even happen that every copy-up will fail.
+When both the src and dst branches support XATTR and an error occurs
+while copying the XATTR, then the copy-up should obviously fail. That
+is a good reason for aufs to return an error to userspace. But when
+only the src branch supports that XATTR, aufs should not return an
+error.
+For example, the src branch supports ACL but the dst branch doesn't,
+because the dst branch may natively not support it, or may temporarily
+not support it due to the "noacl" mount option. Of course, the dst
+branch fs may NOT return an error even if the XATTR is not
+supported. That is totally up to the branch fs.
+
+Anyway, when the aufs internal copy-up gets an error from the dst
+branch fs, aufs tries removing the just-copied entry and returns the
+error to userspace. The worst case of this situation is that every
+copy-up will fail.
+
+For the copy-up operation, there are two basic approaches.
+- copy the specified XATTR only (by the categories above), and return
+  the error unconditionally if it happens.
+- copy all XATTR, and ignore the error on the specified categories
+  only.
+
+In order to support XATTR and to implement the correct behaviour, aufs
+chooses the latter approach and introduces some new branch attributes:
+"icexsec", "icexsys", "icextr", "icexusr", and "icexoth".
+They correspond to the XATTR namespaces (see above). Additionally, for
+convenience, "icex" is also provided, which means that all "icex*"
+attributes are set (here the word "icex" stands for "ignore copy-error
+on XATTR").
+
+The meaning of these attributes is to ignore the error from setting
+XATTR on that branch (see the sketch below).
+Note that aufs tries copying all XATTR unconditionally, and ignores
+errors from the dst branch according to the specified attributes.
+
+Some XATTR may have a default value. The default value may come from
+the parent dir or the environment. If the default value is set at file
+creation time, it will be overwritten by the copy-up.
+Some contradiction may happen, I am afraid.
+Do we need another attribute to stop copying XATTR? I am unsure. For
+now, aufs implements the branch attributes to ignore the error.
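+
+A sketch of the "copy all, ignore errors per category" approach
+described above; au_ignore_xerr() and the buffer handling are
+simplified, hypothetical stand-ins for the aufs internals, and
+allocation-failure handling is omitted.
+
+#include <linux/xattr.h>
+#include <linux/gfp.h>
+#include <linux/string.h>
+
+extern bool au_ignore_xerr(const char *name, unsigned int icex_flags);
+
+static int au_copy_xattr(struct dentry *h_src, struct dentry *h_dst,
+			 unsigned int icex_flags)
+{
+	ssize_t listsz, valsz;
+	char *list, *name, *value;
+	int err = 0;
+
+	list = (char *)__get_free_page(GFP_NOFS);
+	value = (char *)__get_free_page(GFP_NOFS);
+
+	listsz = vfs_listxattr(h_src, list, PAGE_SIZE);
+	for (name = list; listsz > 0 && !err;
+	     listsz -= strlen(name) + 1, name += strlen(name) + 1) {
+		valsz = vfs_getxattr(h_src, name, value, PAGE_SIZE);
+		if (valsz < 0)
+			continue;
+		err = vfs_setxattr(h_dst, name, value, valsz, /*flags*/0);
+		if (err && au_ignore_xerr(name, icex_flags))
+			err = 0;	/* icex*: swallow the error */
+	}
+
+	free_page((unsigned long)value);
+	free_page((unsigned long)list);
+	return err;
+}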
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/07export.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/07export.txt
@@ -0,0 +1,58 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Export Aufs via NFS
+----------------------------------------------------------------------
+Here is an approach.
+- like xino/xib, add a new file 'xigen' which stores the aufs inode
+  generations.
+- iget_locked(): initialize the aufs inode generation for a new inode,
+  and store it in the xigen file.
+- destroy_inode(): increment the aufs inode generation and store it in
+  the xigen file. this is necessary even if the inode is not unlinked,
+  because any data of the inode may be changed by UDBA.
+- encode_fh(): for the root dir, simply return FILEID_ROOT. otherwise
+  build the file handle from (see the sketch below)
+  + branch id (4 bytes)
+  + superblock generation (4 bytes)
+  + inode number (4 or 8 bytes)
+  + parent dir inode number (4 or 8 bytes)
+  + inode generation (4 bytes)
+  + return value of exportfs_encode_fh() for the parent on a branch (4
+    bytes)
+  + file handle for a branch (by exportfs_encode_fh())
+- fh_to_dentry():
+  + find the index of a branch from its id in the handle, and check
+    that it still exists in aufs.
+  + 1st level: get the inode number from the handle and search it in
+    the cache.
+  + 2nd level: if not found in the cache, get the parent inode number
+    from the handle and search it in the cache. then open the found
+    parent dir, find the matching inode number by vfs_readdir() to get
+    its name, and call lookup_one_len() for the target dentry.
+  + 3rd level: if the parent dir is not cached, call
+    exportfs_decode_fh() for a branch and get the parent on the branch,
+    build its pathname, convert it to a pathname in aufs, and call
+    path_lookup(). now aufs has a parent dir dentry, so handle it as in
+    the 2nd level.
+  + to open the dir, aufs needs a struct vfsmount. aufs keeps a
+    vfsmount for every branch, but not for itself. to get it,
+    (currently) aufs searches the current->nsproxy->mnt_ns list. it may
+    not be a good idea, but I haven't got another approach.
+  + test the generation of the obtained inode.
+- every inode operation: they may get EBUSY due to UDBA. in this case,
+  convert it into ESTALE for NFSD.
+- readdir(): call lockdep_on/off() because filldir in NFSD calls
+  lookup_one_len(), vfs_getattr(), encode_fh() and others.
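+
+As a layout sketch of the file handle described above: the struct and
+field names are hypothetical (the real aufs builds the handle as a raw
+__u32 array), and the two trailing variable-size parts are appended
+after the fixed fields.
+
+#include <linux/types.h>
+
+struct au_fh_sketch {
+	__u32	brid;		/* branch id */
+	__u32	sigen;		/* superblock generation */
+	__u64	ino;		/* inode number (may be 4 bytes) */
+	__u64	dir_ino;	/* parent dir inode number */
+	__u32	igen;		/* inode generation */
+	__u32	h_fh_type;	/* exportfs_encode_fh() return value
+				 * for the parent on the branch */
+	__u32	h_fh[0];	/* branch file handle, variable size */
+};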
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/08shwh.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/08shwh.txt
@@ -0,0 +1,52 @@
+
+# Copyright (C) 2005-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Show Whiteout Mode (shwh)
+----------------------------------------------------------------------
+Generally aufs hides the names of whiteouts. But in some cases, showing
+them is very useful for users. For instance, creating a new middle
+layer (branch) by merging existing layers.
+
+(borrowing the aufs1 HOW-TO from a user, Michael Towers)
+When you have three branches,
+- Bottom: 'system', squashfs (underlying base system), read-only
+- Middle: 'mods', squashfs, read-only
+- Top: 'overlay', ram (tmpfs), read-write
+
+The top layer is loaded at boot time and saved at shutdown, to preserve
+the changes made to the system during the session.
+When larger changes have been made, or smaller changes have
+accumulated, the size of the saved top layer data grows. At this point,
+it would be nice to be able to merge the two overlay branches ('mods'
+and 'overlay') and rewrite the 'mods' squashfs, clearing the top layer
+and thus restoring save and load speed.
+
+This merging is simplified by the use of another aufs mount, of just
+the two overlay branches, using the 'shwh' option.
+# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \
+	aufs /livesys/merge_union
+
+A merged view of these two branches is then available at
+/livesys/merge_union, and the new feature is that the whiteouts are
+visible!
+Note that in 'shwh' mode the aufs mount must be 'ro', which will
+disable writing to all branches. Also the default mode for all branches
+is 'ro'.
+It is now possible to save the combined contents of the two overlay
+branches to a new squashfs, e.g.:
+# mksquashfs /livesys/merge_union /path/to/newmods.squash
+
+This new squashfs archive can be stored on the boot device and the
+initramfs will use it to replace the old one at the next boot.
--- linux-azure-4.13.0.orig/Documentation/filesystems/aufs/design/10dynop.txt
+++ linux-azure-4.13.0/Documentation/filesystems/aufs/design/10dynop.txt
@@ -0,0 +1,47 @@
+
+# Copyright (C) 2010-2017 Junjiro R. Okajima
+#
+# This program is free software; you can redistribute it and/or modify
+# it under the terms of the GNU General Public License as published by
+# the Free Software Foundation; either version 2 of the License, or
+# (at your option) any later version.
+#
+# This program is distributed in the hope that it will be useful,
+# but WITHOUT ANY WARRANTY; without even the implied warranty of
+# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
+# GNU General Public License for more details.
+#
+# You should have received a copy of the GNU General Public License
+# along with this program. If not, see <http://www.gnu.org/licenses/>.
+
+Dynamically customizable FS operations
+----------------------------------------------------------------------
+Generally FS operations (struct inode_operations, struct
+address_space_operations, struct file_operations, etc.) are defined as
+"static const", but that never means an FS has only one set of
+operations. Some FSs have multiple sets of them. For instance, ext2 has
+three sets: one for XIP, one for NOBH, and one for normal.
+Since aufs overrides and redirects these operations, sometimes aufs has
+to change its behaviour according to the branch FS type. More
+importantly, VFS acts differently depending on whether a function
+(member of the struct) is set or not. That means aufs should have
+several sets of operations and select one among them according to the
+branch FS definition.
+
+In order to solve this problem without affecting the behaviour of VFS,
+aufs defines these operations dynamically (see the sketch below). For
+instance, aufs defines a dummy direct_IO function for struct
+address_space_operations, but it may not actually be set in the
+address_space_operations. When the branch FS doesn't have it, aufs
+doesn't set it in its address_space_operations, while the function
+definition itself is still alive. So the behaviour itself will not
+change, and direct_IO will return an error when it is not set.
+
+The lifetime of these dynamically generated operation objects is
+maintained by the aufs branch object. When the branch is removed from
+aufs, the reference counter of the object is decremented. When it
+reaches zero, the dynamically generated operation object is freed.
+
+This approach is designed mainly to support AIO (io_submit), Direct I/O
+and XIP (DAX).
+Currently this approach is applied to the address_space_operations of
+regular files only.
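+
+A sketch of this dynamic operation-set idea, with hypothetical names
+(au_dyaop, aufs_aop_template); this is not the actual aufs
+implementation.
+
+#include <linux/fs.h>
+#include <linux/kref.h>
+#include <linux/slab.h>
+
+/* aufs's full operation set; hypothetical */
+extern const struct address_space_operations aufs_aop_template;
+
+struct au_dyaop {
+	struct kref			dy_kref; /* tied to the branch */
+	struct address_space_operations	dy_aop;
+};
+
+static struct au_dyaop *
+au_dyaop_create(const struct address_space_operations *h_aop)
+{
+	struct au_dyaop *dyaop;
+
+	dyaop = kzalloc(sizeof(*dyaop), GFP_NOFS);
+	if (!dyaop)
+		return NULL;
+	kref_init(&dyaop->dy_kref);
+	dyaop->dy_aop = aufs_aop_template;
+	/*
+	 * mirror the branch: keep a member set only when the branch fs
+	 * has it, so that VFS takes the same "is it set?" decisions.
+	 */
+	if (!h_aop->direct_IO)
+		dyaop->dy_aop.direct_IO = NULL;
+	return dyaop;
+}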
--- linux-azure-4.13.0.orig/Documentation/filesystems/dax.txt
+++ linux-azure-4.13.0/Documentation/filesystems/dax.txt
@@ -63,9 +63,8 @@
 - implementing an mmap file operation for DAX files which sets the
   VM_MIXEDMAP and VM_HUGEPAGE flags on the VMA, and setting the vm_ops to
   include handlers for fault, pmd_fault, page_mkwrite, pfn_mkwrite. These
-  handlers should probably call dax_iomap_fault() (for fault and page_mkwrite
-  handlers), dax_iomap_pmd_fault(), dax_pfn_mkwrite() passing the appropriate
-  iomap operations.
+  handlers should probably call dax_iomap_fault() passing the appropriate
+  fault size and iomap operations.
 - calling iomap_zero_range() passing appropriate iomap operations instead of
   block_truncate_page() for DAX files
 - ensuring that there is sufficient locking between reads, writes,
--- linux-azure-4.13.0.orig/Documentation/filesystems/overlayfs.txt
+++ linux-azure-4.13.0/Documentation/filesystems/overlayfs.txt
@@ -210,8 +210,11 @@ beneath or above the path of another overlay lower layer path.
 
 Using an upper layer path and/or a workdir path that are already used by
-another overlay mount is not allowed and will fail with EBUSY.  Using
+another overlay mount is not allowed and may fail with EBUSY.  Using
 partially overlapping paths is not allowed but will not fail with EBUSY.
+If files are accessed from two overlayfs mounts which share or overlap the
+upper layer and/or workdir path the behavior of the overlay is undefined,
+though it will not result in a crash or deadlock.
 
 Mounting an overlay using an upper layer path, where the upper layer path
 was previously used by another mounted overlay in combination with a
--- linux-azure-4.13.0.orig/Documentation/kmsg/IPVS
+++ linux-azure-4.13.0/Documentation/kmsg/IPVS
@@ -0,0 +1,81 @@
+/*? Text: "%s(): NULL arg\n" */
+/*? Text: "%s(): NULL scheduler_name\n" */
+/*? Text: "%s(): [%s] pe already existed in the system\n" */
+/*? Text: "%s(): [%s] pe already linked\n" */
+/*? Text: "%s(): [%s] pe is not in the list. failed\n" */
+/*? Text: "%s(): [%s] scheduler already existed in the system\n" */
+/*? Text: "%s(): [%s] scheduler already linked\n" */
+/*? Text: "%s(): [%s] scheduler is not in the list. failed\n" */
+/*? Text: "%s(): done error\n" */
+/*? Text: "%s(): init error\n" */
+/*? Text: "%s(): lower threshold is higher than upper threshold\n" */
+/*? Text: "%s(): no memory\n" */
+/*? Text: "%s(): request for already hashed, called from %pF\n" */
+/*? Text: "%s(): request for unhash flagged, called from %pF\n" */
+/*? Text: "%s(): server weight less than zero\n" */
+/*? Text: "%s: %s %pI4:%d - %s\n" */
+/*? Text: "%s: %s [%pI6]:%d - %s\n" */
+/*? Text: "%s: %s [%pI6c]:%d - %s\n" */
+/*? Text: "%s: FWM %u 0x%08X - %s\n" */
+/*? Text: "%s: enter\n" */
+/*? Text: "%s: loaded support on port[%d] = %d\n" */
+/*? Text: "BACKUP v0, Dropping buffer bogus conn options\n" */
+/*? Text: "BACKUP v0, bogus conn\n" */
+/*? Text: "BACKUP, Dropping buffer, Err: %d in decoding\n" */
+/*? Text: "BACKUP, Dropping buffer, Unknown version %d\n" */
+/*? Text: "BACKUP, Dropping buffer, msg > buffer\n" */
+/*? Text: "BACKUP, Dropping buffer, to small\n" */
+/*? Text: "BACKUP, Invalid PE parameters\n" */
+/*? Text: "BUG control DEL with n=0 : %s:%d to %s:%d\n" */
+/*? Text: "Connection hash table configured (size=%d, memory=%ldKbytes)\n" */
+/*? Text: "Error binding address of the mcast interface\n" */
+/*? Text: "Error binding to the multicast addr\n" */
+/*? Text: "Error connecting to the multicast addr\n" */
+/*? Text: "Error during creation of socket; terminating\n" */
+/*? Text: "Error joining to the multicast group\n" */
+/*? Text: "Error setting outbound mcast interface\n" */
+/*? Text: "Failed to stop Backup Daemon\n" */
+/*? Text: "Failed to stop Master Daemon\n" */
+/*? Text: "Registered protocols (%s)\n" */
+/*? Text: "SYNC, connection pe_data invalid\n" */
+/*? Text: "Schedule: port zero only supported in persistent services, check your ipvs configuration\n" */
+/*? Text: "Scheduler module ip_vs_%s not found\n" */
+/*? Text: "There is no net ptr to find in the skb in %s() line:%d\n" */
+/*? Text: "UDP no ns data\n" */
+/*? Text: "You probably need to specify IP address on multicast interface.\n" */
+/*? Text: "[%s] pe registered.\n" */
+/*? Text: "[%s] pe unregistered.\n" */
+/*? Text: "[%s] scheduler registered.\n" */
+/*? Text: "[%s] scheduler unregistered.\n" */
+/*? Text: "can't register hooks.\n" */
+/*? Text: "can't register netlink/ioctl.\n" */
+/*? Text: "can't setup connection table.\n" */
+/*? Text: "can't setup control.\n" */
+/*? Text: "cannot register Generic Netlink interface.\n" */
+/*? Text: "cannot register sockopt.\n" */
+/*? Text: "get_ctl: len %u < %u\n" */
+/*? Text: "ip_vs_send_async error %d\n" */
+/*? Text: "ip_vs_sync_buff_create failed.\n" */
+/*? Text: "ipvs loaded.\n" */
+/*?
Text: "ipvs unloaded.\n" */ +/*? Text: "length: %u != %u\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "not enough space in Netlink message\n" */ +/*? Text: "persistence engine module ip_vs_pe_%s not found\n" */ +/*? Text: "receiving message error\n" */ +/*? Text: "request control ADD for already controlled: %s:%d to %s:%d\n" */ +/*? Text: "request control DEL for uncontrolled: %s:%d to %s:%d\n" */ +/*? Text: "set_ctl: invalid protocol: %d %pI4:%d %s\n" */ +/*? Text: "set_ctl: len %u != %u\n" */ +/*? Text: "shouldn't reach here, because the box is on the half connection in the tun/dr module.\n" */ +/*? Text: "stopping backup sync thread %d ...\n" */ +/*? Text: "stopping master sync thread %d ...\n" */ +/*? Text: "sync thread started: state = BACKUP, mcast_ifn = %s, syncid = %d\n" */ +/*? Text: "sync thread started: state = MASTER, mcast_ifn = %s, syncid = %d\n" */ +/*? Text: "unknown Generic Netlink command\n" */ +/*? Text: "sync thread started: state = MASTER, mcast_ifn = %s, syncid = %d, id = %d\n" */ +/*? Text: "sync thread started: state = BACKUP, mcast_ifn = %s, syncid = %d, id = %d\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "Unknown mcast interface: %s\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/aes_s390 +++ linux-azure-4.13.0/Documentation/kmsg/s390/aes_s390 @@ -0,0 +1,45 @@ +/*? + * Text: "Allocating XTS fallback algorithm %s failed\n" + * Severity: Error + * Parameter: + * @1: algorithm name + * Description: + * The aes_s390 module failed to allocate a software fallback for the AES + * modes that are not supported by the hardware. A possible reason for this + * problem is that the aes_generic module that provides the fallback + * algorithms is not available. + * User action: + * Ensure that the aes_generic module is available and loaded and reload + * the aes_s390 module. + */ + +/*? + * Text: "Allocating AES fallback algorithm %s failed\n" + * Severity: Error + * Parameter: + * @1: algorithm name + * Description: + * The advanced encryption standard (AES) algorithm includes three modes with + * 128-bit, 192-bit, and 256-bit keys. Your hardware system only provides + * hardware acceleration for the 128-bit mode. The aes_s390 module failed to + * allocate a software fallback for the AES modes that are not supported by the + * hardware. A possible reason for this problem is that the aes_generic module + * that provides the fallback algorithms is not available. + * User action: + * Use the 128-bit mode only or ensure that the aes_generic module is available + * and loaded and reload the aes_s390 module. + */ + +/*? + * Text: "AES hardware acceleration is only available for 128-bit keys\n" + * Severity: Informational + * Description: + * The advanced encryption standard (AES) algorithm includes three modes with + * 128-bit, 192-bit, and 256-bit keys. Your hardware system only provides + * hardware acceleration for the 128-bit key mode. The aes_s390 module + * will use the less performant software fallback algorithm for the 192-bit + * and 256-bit key modes. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/af_iucv +++ linux-azure-4.13.0/Documentation/kmsg/s390/af_iucv @@ -0,0 +1,23 @@ +/*? 
+ * Text: "Application %s on z/VM guest %s exceeds message limit\n" + * Severity: Error + * Parameter: + * @1: application name + * @2: z/VM user ID + * Description: + * Messages or packets destined for the application have accumulated and + * reached the maximum value. The default for the message limit is 65535. + * You can specify a different limit as the value for MSGLIMIT within + * the IUCV statement of the z/VM virtual machine on which the application + * runs. + * User action: + * Ensure that you do not send data faster than the application retrieves + * them. Ensure that the message limit on the z/VM guest virtual machine + * on which the application runs is high enough. + */ + +/*? Text: "Attempt to release alive iucv socket %p\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/ap +++ linux-azure-4.13.0/Documentation/kmsg/s390/ap @@ -0,0 +1,49 @@ +/*? + * Text: "%d is not a valid cryptographic domain\n" + * Severity: Warning + * Parameter: + * @1: AP domain index + * Description: + * The cryptographic domain specified for the 'domain=' module or kernel + * parameter must be an integer in the range 0 to 15. + * User action: + * Reload the cryptographic device driver with a correct module parameter. + * If the device driver has been compiled into the kernel, correct the value + * in the kernel parameter line and reboot Linux. + */ + +/*? + * Text: "The hardware system does not support AP instructions\n" + * Severity: Warning + * Description: + * The ap module addresses AP adapters through AP instructions. The hardware + * system on which the Linux instance runs does not support AP instructions. + * The ap module cannot detect any AP adapters. + * User action: + * Load the ap module only if your Linux instance runs on hardware that + * supports AP instructions. If the ap module has been compiled into the kernel, + * ignore this message. + */ + +/*? + * Text: "Registering adapter interrupts for AP device %02x.%04x failed\n" + * Severity: Error + * Parameter: + * @1: AP device ID + * @2: AP queue + * Description: + * The hardware system supports AP adapter interrupts but failed to enable + * an adapter for interrupts. Possible causes for this error are: + * i) The AP adapter firmware does not support AP interrupts. + * ii) An AP adapter firmware update to a firmware level that supports AP + * adapter interrupts failed. + * iii) The AP adapter firmware has been successfully updated to a level that + * supports AP interrupts but the new firmware has not been activated. + * User action: + * Ensure that the firmware on your AP adapters support AP interrupts and that + * any firmware updates have completed successfully. If necessary, deconfigure + * your cryptographic adapters and reconfigure them to ensure that any firmware + * updates become active, then reload the ap module. If the ap module has been + * compiled into the kernel, reboot Linux. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/appldata +++ linux-azure-4.13.0/Documentation/kmsg/s390/appldata @@ -0,0 +1,91 @@ +/*? 
+ * Text: "Starting the data collection for %s failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: appldata module + * @2: return code + * Description: + * The specified data collection module used the z/VM diagnose call + * DIAG 0xDC to start writing data. z/VM returned an error and the data + * collection could not start. If the return code is 5, your z/VM guest + * virtual machine is not authorized to write data records. + * User action: + * If the return code is 5, ensure that your z/VM guest virtual machine's + * entry in the z/VM directory includes the OPTION APPLMON statement. + * For other return codes see the section about DIAGNOSE Code X'DC' + * in "z/VM CP Programming Services". + */ + +/*? + * Text: "Stopping the data collection for %s failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: appldata module + * @2: return code + * Description: + * The specified data collection module used the z/VM diagnose call DIAG 0xDC + * to stop writing data. z/VM returned an error and the data collection + * continues. + * User action: + * See the section about DIAGNOSE Code X'DC' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Starting a new OS data collection failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * After a CPU hotplug event, the record size for the running operating + * system data collection is no longer correct. The appldata_os module tried + * to start a new data collection with the correct record size but received + * an error from the z/VM diagnose call DIAG 0xDC. Any data collected with + * the current record size might be faulty. + * User action: + * Start a new data collection with the cappldata_os module. For information + * about starting data collections see "Device Drivers, Features, and + * Commands". For information about the return codes see the section about + * DIAGNOSE Code X'DC' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Stopping a faulty OS data collection failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * After a CPU hotplug event, the record size for the running operating + * system data collection is no longer correct. The appldata_os module tried + * to stop the faulty data collection but received an error from the z/VM + * diagnose call DIAG 0xDC. Any data collected with the current record size + * might be faulty. + * User action: + * Try to restart appldata_os monitoring. For information about stopping + * and starting data collections see "Device Drivers, Features, and + * Commands". For information about the return codes see the section about + * DIAGNOSE Code X'DC' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Maximum OS record size %i exceeds the maximum record size %i\n" + * Severity: Error + * Parameter: + * @1: no of bytes + * @2: no of bytes + * Description: + * The OS record size grows with the number of CPUs and is adjusted by the + * appldata_os module in response to CPU hotplug events. For more than 110 + * CPUs the record size would exceed the maximum record size of 4024 bytes + * that is supported by the z/VM hypervisor. To prevent the maximum supported + * record size from being exceeded while data collection is in progress, + * you cannot load the appldata_os module on Linux instances that are + * configured for a maximum of more than 110 CPUs. + * User action: + * If you do not want to collect operating system data, you can ignore this + * message. 
+ * Linux instance to support fewer than 110 CPUs.
+ */
+
+/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */
+/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/bpf_jit
+++ linux-azure-4.13.0/Documentation/kmsg/s390/bpf_jit
@@ -0,0 +1,16 @@
+/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */
+/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */
+/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */
+
+/*?
+ * Text: "Unknown opcode %02x\n"
+ * Severity: Error
+ * Parameter:
+ *   @1: Instruction opcode
+ * Description:
+ * The BPF JIT compiler has found an unknown instruction in the BPF program
+ * and therefore stops the compilation. As a fallback, the interpreter is used.
+ * User action:
+ * Report this problem and the error message to your support organization.
+ */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/cio
+++ linux-azure-4.13.0/Documentation/kmsg/s390/cio
@@ -0,0 +1,247 @@
+/*?
+ * Text: "%s is not a valid device for the cio_ignore kernel parameter\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: device bus-ID
+ * Description:
+ * The device specification for the cio_ignore kernel parameter is
+ * syntactically incorrect or specifies an unknown device. This device is not
+ * excluded from being sensed and analyzed.
+ * User action:
+ * Correct your device specification in the kernel parameter line to have the
+ * device excluded when you next reboot Linux. You can write the correct
+ * device specification to /proc/cio_ignore to add the device to the list of
+ * devices to be excluded. This does not immediately make the device
+ * inaccessible but the device is ignored if it disappears and later reappears.
+ */
+
+/*?
+ * Text: "0.%x.%04x to 0.%x.%04x is not a valid range for cio_ignore\n"
+ * Severity: Warning
+ * Parameter:
+ *   @1: from subchannel set ID
+ *   @2: from device number
+ *   @3: to subchannel set ID
+ *   @4: to device number
+ * Description:
+ * The device range specified for the cio_ignore kernel parameter is
+ * syntactically incorrect. No devices specified with this range are
+ * excluded from being sensed and analyzed.
+ * User action:
+ * Correct your range specification in the kernel parameter line to have the
+ * range of devices excluded when you next reboot Linux. You can write the
+ * correct range specification to /proc/cio_ignore to add the range of devices
+ * to the list of devices to be excluded. This does not immediately make the
+ * devices in the range inaccessible but any of these devices are ignored if
+ * they disappear and later reappear.
+ */
+
+/*?
+ * Text: "Processing %s for channel path %x.%02x\n"
+ * Severity: Notice
+ * Parameter:
+ *   @1: configuration change
+ *   @2: channel subsystem ID
+ *   @3: CHPID
+ * Description:
+ * A configuration change is in progress for the given channel path.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "No CCW console was found\n"
+ * Severity: Warning
+ * Description:
+ * Linux did not find the expected CCW console and tries to use an alternative
+ * console. A possible reason why the console was not found is that the console
+ * has been specified in the cio_ignore list.
+ * User action:
+ * None, if an appropriate alternative console has been found, and you want
+ * to use this alternative console. If you want to use the CCW console, ensure
+ * that it is not specified in the cio_ignore list, explicitly specify the
+ * console with the 'condev=' kernel parameter, and reboot Linux.
+ */
+
+/*?
+ * Text: "Channel measurement facility initialized using format %s (mode %s)\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: format
+ * @2: mode
+ * Description:
+ * The channel measurement facility has been initialized successfully.
+ * Format 'extended' should be used for z990 and later mainframe systems.
+ * Format 'basic' is intended for earlier mainframes. Mode 'autodetected' means
+ * that the format has been set automatically. Mode 'parameter' means that the
+ * format has been set according to the 'format=' kernel parameter.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The CSS device driver initialization failed with errno=%d\n"
+ * Severity: Alert
+ * Parameter:
+ * @1: Return code
+ * Description:
+ * The channel subsystem bus could not be established.
+ * User action:
+ * See the errno man page to find out what caused the problem.
+ */
+/*? Text: "%s: Got subchannel machine check but no sch_event handler provided.\n" */
+
+/*?
+ * Text: "%s: Setting the device online failed because it is boxed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: Device bus-ID
+ * Description:
+ * Initialization of a device did not complete because it did not respond in
+ * time or it was reserved by another operating system.
+ * User action:
+ * Make sure that the device is working correctly, then try again to set it
+ * online. For devices that support the reserve/release mechanism (for example
+ * DASDs), you can try to override the reservation of the other system by
+ * writing 'force' to the 'online' sysfs attribute of the affected device.
+ */
+
+/*?
+ * Text: "%s: Setting the device online failed because it is not operational\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: Device bus-ID
+ * Description:
+ * Initialization of a device did not complete because it is not present or
+ * not operational.
+ * User action:
+ * Make sure that the device is present and working correctly, then try again
+ * to set it online.
+ */
+
+/*?
+ * Text: "%s: The device stopped operating while being set offline\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: Device bus-ID
+ * Description:
+ * While the device was set offline, it was not present or not operational.
+ * The device is now inactive, but setting it online again might fail.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The device entered boxed state while being set offline\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: Device bus-ID
+ * Description:
+ * While the device was set offline, it did not respond in time or it was
+ * reserved by another operating system. The device is now inactive, but
+ * setting it online again might fail.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Logging for subchannel 0.%x.%04x failed with errno=%d\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: subchannel set ID
+ * @2: subchannel number
+ * @3: errno
+ * Description:
+ * Capturing model-dependent logs and traces could not be triggered for the
+ * specified subchannel.
+ * User action:
+ * See the errno man page to find out what caused the problem.
+ */
+
+/*?
+ * Text: "Logging for subchannel 0.%x.%04x was triggered\n" + * Severity: Notice + * Parameter: + * @1: subchannel set ID + * @2: subchannel number + * Description: + * Model-dependent logs and traces may be captured for the specified + * subchannel. + * User action: + * None. + */ + +/*? + * Text: "%s: No interrupt was received within %lus (CS=%02x, DS=%02x, CHPID=%x.%02x)\n" + * Severity: Warning + * Parameter: + * @1: device number + * @2: timeout value + * @3: channel status + * @4: device status + * @5: channel subsystem ID + * @6: CHPID + * Description: + * Internal I/Os are used by the common I/O layer to ensure that devices are + * operational and accessible. + * The common I/O layer did not receive an interrupt for an internal I/O + * during the specified timeout period. + * As a result, the device might assume a state that makes the device + * unusable to Linux until the problem is resolved. + * User action: + * Make sure that the device is working correctly and try the action again. + */ + +/*? + * Text: "Link stopped: RS=%02x RSID=%04x IC=%02x IUPARAMS=%s IUNODEID=%s AUPARAMS=%s AUNODEID=%s\n" + * Severity: Error + * Parameter: + * @1: reporting source + * @2: reporting source ID + * @3: incident code + * @4: incident unit parameters + * @5: incident unit node ID + * @6: attached unit parameters + * @7: attached unit node ID + * + * Description: + * A hardware error has occurred. A unit at one end of an interface + * link has detected a failure in the link or in one of the units attached to + * the link. As a result, data transfer across the link has stopped. In the + * message text, the node IDs of involved units are represented in the + * following format: TTTTTT/MDL,MMM.PPSSSSSSSSSSSS,XXXX where TTTTTT refers to + * the machine type, MDL the model number, MMM the manufacturer, PP the + * manufacturing plant, SSSSSSSSSSSS the unit sequence number and XXXX the + * machine type-dependent physical interface number. If no data is available + * for the unit parameters or node ID field, "n/a" is used instead. + * + * User action: + * Report the problem to your support organization. + */ + +/*? + * Text: "Link degraded: RS=%02x RSID=%04x IC=%02x IUPARAMS=%s IUNODEID=%s AUPARAMS=%s AUNODEID=%s\n" + * Severity: Warning + * Parameter: + * @1: reporting source + * @2: reporting source ID + * @3: incident code + * @4: incident unit parameters + * @5: incident unit node ID + * @6: attached unit parameters + * @7: attached unit node ID + * Description: + * A hardware error has occurred. A unit at one end of an interface + * link has detected a failure in the link or in one of the units attached to + * the link. As a result, data transfer across the link is degraded. In the + * message text, the node IDs of involved units are represented in the + * following format: TTTTTT/MDL,MMM.PPSSSSSSSSSSSS,XXXX where TTTTTT refers to + * the machine type, MDL the model number, MMM the manufacturer, PP the + * manufacturing plant, SSSSSSSSSSSS the unit sequence number and XXXX the + * machine type-dependent physical interface number. If no data is available + * for the unit parameters or node ID field, "n/a" is used instead. + * + * User action: + * Report the problem to your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/cpcmd +++ linux-azure-4.13.0/Documentation/kmsg/s390/cpcmd @@ -0,0 +1,16 @@ +/*? 
+ * Text: "The cpcmd kernel function failed to allocate a response buffer\n" + * Severity: Warning + * Description: + * IPL code, console detection, and device drivers like vmcp or vmlogrdr use + * the cpcmd kernel function to send commands to the z/VM control program (CP). + * If a program that uses the cpcmd function does not allocate a contiguous + * response buffer below 2 GB guest real storage, cpcmd creates a bounce buffer + * to be used as the response buffer. Because of low memory or memory + * fragmentation, cpcmd could not create the bounce buffer. + * User action: + * Look for related page allocation failure messages and at the stack trace to + * find out which program or operation failed. Free some memory and retry the + * failed operation. Consider allocating more memory to your z/VM guest virtual + * machine. + */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/cpu +++ linux-azure-4.13.0/Documentation/kmsg/s390/cpu @@ -0,0 +1,46 @@ +/*? + * Text: "%d configured CPUs, %d standby CPUs\n" + * Severity: Informational + * Parameter: + * @1: number of configured CPUs + * @2: number of standby CPUs + * Description: + * The kernel detected the given number of configured and standby CPUs. + * User action: + * None. + */ + +/*? + * Text: "The CPU configuration topology of the machine is:" + * Severity: Informational + * Description: + * The first six values of the topology information represent fields Mag6 to + * Mag1 of system-information block (SYSIB) 15.1.2. These fields specify the + * maximum numbers of topology-list entries (TLE) at successive topology nesting + * levels. The last value represents the MNest value of SYSIB 15.1.2 which + * specifies the maximum possible nesting that can be configured through + * dynamic changes. For details see the SYSIB 15.1.2 information in the + * "Principles of Operation." + * User action: + * None. + */ + +/*? + * Text: "CPU %i exceeds the maximum %i and is excluded from the dump\n" + * Severity: Warning + * Parameter: + * @1: CPU number + * @2: maximum CPU number + * Description: + * The Linux kernel is used as a system dumper but it runs on more CPUs than + * it has been compiled for with the CONFIG_NR_CPUS kernel configuration + * option. The system dump will be created but information on one or more + * CPUs will be missing. + * User action: + * Update the system dump kernel to a newer version that supports more + * CPUs or reduce the number of installed CPUs and reproduce the problem + * that should be analyzed. If you send the system dump that prompted this + * message to a support organization, be sure to communicate that the dump + * does not include all CPU information. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/cpum_cf +++ linux-azure-4.13.0/Documentation/kmsg/s390/cpum_cf @@ -0,0 +1,68 @@ +/*? + * Text: "Enabling the performance measuring unit failed with rc=%x\n" + * Severity: Error + * Parameter: + * @1: error condition + * Description: + * The device driver failed to enable CPU counter sets with the + * load counter controls (lcctl) instruction. + * See the section about lcctl in "The Load-Program-Parameter and the CPU-Measurement + * Facilities", SA23-2260, for an explanation of the error conditions. + * User action: + * Stop the performance measurement programs and try again. + */ + +/*? 
+ * Text: "Disabling the performance measuring unit failed with rc=%x\n" + * Severity: Error + * Parameter: + * @1: error condition + * Description: + * The device driver failed to disable CPU counter sets with the + * load counter controls (lcctl) instruction. + * See the section about lcctl in "The Load-Program-Parameter and the CPU-Measurement + * Facilities", SA23-2260, for an explanation of the error conditions. + * User action: + * Stop the performance measurement programs and try again. + */ + +/*? + * Text: "Registering the cpum_cf PMU failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error code + * Description: + * The device driver could not register the Performance Measurement Unit (PMU) + * for the CPU-measurement counter facility. + * A possible cause of this problem is memory constraints. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your Linux instance. + */ + +/*? + * Text: "CPU[%i] Counter data was lost\n" + * Severity: Error + * Parameter: + * @1: cpu number + * Description: + * CPU counter data was lost because of machine internal + * high-priority activities. + * User action: + * None. + */ + +/*? + * Text: "Registering for CPU-measurement alerts failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error code + * Description: + * The device driver could not register to receive CPU-measurement alerts. + * Alerts make you aware of measurement errors. + * A possible cause of this problem is memory constraints. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your Linux instance. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/cpum_sf +++ linux-azure-4.13.0/Documentation/kmsg/s390/cpum_sf @@ -0,0 +1,104 @@ +/*? + * Text: "The sampling buffer limits have changed to: min=%lu max=%lu (diag=x%lu)\n" + * Severity: Informational + * Parameter: + * @1: minimum size in sample-data-blocks + * @2: maximum size in sample-data-blocks + * @3: size factor for buffering diagnostic-sampling data entries + * Description: + * The minimum or maximum size limit for the sampling facility buffer was + * changed. The change is effective immediately. + * User action: + * None. + */ + +/*? + * Text: "Switching off the sampling facility failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error condition + * Description: + * The CPU-measurement sampling facility could not be switched off and continues + * to run. For details, see LOAD SAMPLING CONTROLS in + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * If this problem persists, reboot your Linux instance. + */ + +/*? + * Text: "Sample data was lost\n" + * Severity: Error + * Description: + * Sample data was lost because of machine-internal high-priority activities. + * The sampling facility is stopped. + * User action: + * End all performance measurement sessions. Discard the measurement data, + * which are likely to be flawed. Repeat your measurements. + * If the problem persists, contact your hardware administrator. + */ + +/*? + * Text: "Sampling facility support for perf is not available: reason=%04x\n" + * Severity: Error + * Parameter: + * @1: reason code + * Description: + * The device driver could not initialize the sampling facility support. + * Possible reason codes are: + * 0001: The device driver failed to query CPU-measurement sampling facility + * information. 
+ *
+ * 0002: The device driver does not support the basic-sampling function that
+ * is available on the LPAR within which the Linux instance runs.
+ *
+ * 0003: The device driver could not register to receive CPU-measurement alerts.
+ * A possible cause of this problem is memory constraints.
+ *
+ * 0004: The device driver could not register the Performance Measurement Unit
+ * (PMU) for the CPU-measurement sampling facility.
+ * A possible cause of this problem is memory constraints.
+ * User action:
+ * Consider assigning more memory to your Linux instance.
+ */
+
+/*?
+ * Text: "Loading sampling controls failed: op=%i err=%i\n"
+ * Severity: Error
+ * Parameter:
+ * @1: Type of operation
+ * @2: Error condition
+ * Description:
+ * The sampling facility support could not load sampling controls to enable
+ * (operation type 1) or disable (operation type 2) the CPU-measurement sampling
+ * facility. For details of the error condition, see LOAD SAMPLING CONTROLS in
+ * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260.
+ * User action:
+ * If the problem persists, reboot your Linux instance.
+ */
+
+/*?
+ * Text: "A sampling buffer entry is incorrect (alert=0x%x)\n"
+ * Severity: Error
+ * Parameter:
+ * @1: Alert code
+ * Description:
+ * An incorrect sampling facility buffer entry was detected. The alert code
+ * indicates the root cause, for example, an incorrect entry address or an
+ * incorrect sample-data-block-table entry.
+ * User action:
+ * End active performance measurement sessions, for example, perf processes. If
+ * the problem persists, reboot your Linux instance.
+ */
+
+/*?
+ * Text: "Registering for s390dbf failed\n"
+ * Severity: Error
+ * Description:
+ * The device driver failed to register for the s390 debug feature. You will
+ * not receive any debug information. A possible cause of this problem is
+ * memory constraints.
+ * User action:
+ * Consider assigning more memory to your Linux instance.
+ */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/crc32-vx
+++ linux-azure-4.13.0/Documentation/kmsg/s390/crc32-vx
@@ -0,0 +1 @@
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/ctcm
+++ linux-azure-4.13.0/Documentation/kmsg/s390/ctcm
@@ -0,0 +1,202 @@
+/*?
+ * Text: "%s: An I/O-error occurred on the CTCM device\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the CTCM device
+ * Description:
+ * An I/O error was detected on one of the subchannels of the CTCM device.
+ * Depending on the error, the CTCM device driver might attempt an automatic
+ * recovery.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. If the
+ * device is not operational, perform a manual recovery. See "Device Drivers,
+ * Features, and Commands" for details about how to recover a CTCM device.
+ */
+
+/*?
+ * Text: "%s: An adapter hardware operation timed out\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the CTCM device
+ * Description:
+ * The CTCM device uses an adapter to physically connect to its communication
+ * peer. An operation on this adapter timed out.
+ * User action:
+ * Check the status of the CTCM device, for example, with ifconfig. If the
+ * device is not operational, perform a manual recovery. See "Device Drivers,
+ * Features, and Commands" for details about how to recover a CTCM device.
+ */
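+
+/*
+ * Editorial note: several messages in this file ask for a manual recovery
+ * of the CTCM device. A minimal sketch of such a recovery, assuming a CTCM
+ * group device with bus ID 0.0.f000 (a placeholder), is to set the device
+ * offline and back online through sysfs:
+ *
+ *   # echo 0 > /sys/bus/ccwgroup/drivers/ctcm/0.0.f000/online
+ *   # echo 1 > /sys/bus/ccwgroup/drivers/ctcm/0.0.f000/online
+ *
+ * See "Device Drivers, Features, and Commands" for the complete procedure.
+ */
+
+/*?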
+ * Text: "%s: An error occurred on the adapter hardware\n" + * Severity: Error + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * The CTCM device uses an adapter to physically connect to its communication + * peer. An operation on this adapter returned an error. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. If the + * device is not operational, perform a manual recovery. See "Device Drivers, + * Features, and Commands" for details about how to recover a CTCM device. + */ + +/*? + * Text: "%s: The communication peer has disconnected\n" + * Severity: Notice + * Parameter: + * @1: channel ID + * Description: + * The remote device has disconnected. Possible reasons are that the remote + * interface has been closed or that the operating system instance with the + * communication peer has been rebooted or shut down. + * User action: + * Check the status of the peer device. Ensure that the peer operating system + * instance is running and that the peer interface is operational. + */ + +/*? + * Text: "%s: The remote operating system is not available\n" + * Severity: Notice + * Parameter: + * @1: channel ID + * Description: + * The operating system instance with the communication peer has disconnected. + * Possible reasons are that the operating system instance has been rebooted + * or shut down. + * User action: + * Ensure that the peer operating system instance is running and that the peer + * interface is operational. + */ + +/*? + * Text: "%s: The adapter received a non-specific IRQ\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * The adapter hardware used by the CTCM device received an IRQ that cannot + * be mapped to a particular device. This is a hardware problem. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. Check if + * the connection to the remote device still works. If the CTCM device is not + * operational, set it offline and back online. If this does not resolve the + * problem, perform a manual recovery. See "Device Drivers, Features, and + * Commands" for details about how to recover a CTCM device. If this problem + * persists, gather Linux debug data, collect the hardware logs, and report the + * problem to your support organization. + */ + +/*? + * Text: "%s: A check occurred on the subchannel\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * A check condition has been detected on the subchannel. + * User action: + * Check if the connection to the remote device still works. If the CTCM device + * is not operational, set it offline and back online. If this does not resolve + * the problem, perform a manual recovery. See "Device Drivers, Features, and + * Commands" for details about how to recover a CTCM device. If this problem + * persists, gather Linux debug data and report the problem to your support + * organization. + */ + +/*? + * Text: "%s: The communication peer is busy\n" + * Severity: Informational + * Parameter: + * @1: channel ID + * Description: + * A busy target device was reported. This might be a temporary problem. + * User action: + * If this problem persists or is reported frequently ensure that the target + * device is working properly. + */ + +/*? + * Text: "%s: The specified target device is not valid\n" + * Severity: Error + * Parameter: + * @1: channel ID + * Description: + * A target device was called with a faulty device specification. 
This is an + * adapter hardware problem. + * User action: + * Gather Linux debug data, collect the hardware logs, and contact IBM support. + */ + +/*? + * Text: "An I/O operation resulted in error %04x\n" + * Severity: Error + * Parameter: + * @1: channel ID + * @2: error information + * Description: + * A hardware operation ended with an error. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. If the + * device is not operational, perform a manual recovery. See "Device Drivers, + * Features, and Commands" for details about how to recover a CTCM device. + * If this problem persists, gather Linux debug data, collect the hardware logs, + * and report the problem to your support organization. + */ + +/*? + * Text: "%s: Initialization failed with RX/TX init handshake error %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * @2: error information + * Description: + * A problem occurred during the initialization of the connection. If the + * connection can be established after an automatic recovery, a success message + * is issued. + * User action: + * If the problem is not resolved by the automatic recovery process, check the + * local and remote device. If this problem persists, gather Linux debug data + * and report the problem to your support organization. + */ + +/*? + * Text: "%s: The network backlog for %s is exceeded, package dropped\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * @2: calling function + * Description: + * There is more network traffic than can be handled by the device. The device + * is closed and some data has not been transmitted. The device might be + * recovered automatically. + * User action: + * Investigate and resolve the congestion. If necessary, set the device + * online to make it operational. + */ + +/*? + * Text: "%s: The XID used in the MPC protocol is not valid, rc = %d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * @2: return code + * Description: + * The exchange identification (XID) used by the CTCM device driver when + * in MPC mode is not valid. + * User action: + * Note the error information provided with this message and contact your + * support organization. + */ + +/*? Text: "CTCM driver unloaded\n" */ +/*? Text: "%s: %s Internal error: net_device is NULL, ch = 0x%p\n" */ +/*? Text: "%s / Initializing the ctcm device driver failed, ret = %d\n" */ +/*? Text: "%s: %s: Internal error: Can't determine channel for interrupt device %s\n" */ +/*? Text: "CTCM driver initialized\n" */ +/*? Text: "%s: setup OK : r/w = %s/%s, protocol : %d\n" */ +/*? Text: "%s: Connected with remote side\n" */ +/*? Text: "%s: Restarting device\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/dasd +++ linux-azure-4.13.0/Documentation/kmsg/s390/dasd @@ -0,0 +1,704 @@ +/* dasd_ioctl */ + +/*? + * Text: "%s: The DASD has been put in the quiesce state\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * No I/O operation is possible on this device. + * User action: + * Resume the DASD to enable I/O operations. + */ + +/*? 
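+
+/*
+ * Editorial note: the quiesce and resume operations are available
+ * programmatically through ioctls on the DASD block device. A minimal
+ * sketch, assuming an s390 system that provides the uapi header
+ * <asm/dasd.h> and a DASD node /dev/dasda (both placeholders), for
+ * resuming a quiesced DASD:
+ *
+ *   #include <fcntl.h>
+ *   #include <stdio.h>
+ *   #include <sys/ioctl.h>
+ *   #include <asm/dasd.h>
+ *
+ *   int main(void)
+ *   {
+ *           int fd = open("/dev/dasda", O_RDONLY);  // whole-device node
+ *
+ *           // BIODASDRESUME takes no argument and needs CAP_SYS_ADMIN
+ *           if (fd < 0 || ioctl(fd, BIODASDRESUME) < 0) {
+ *                   perror("BIODASDRESUME");
+ *                   return 1;
+ *           }
+ *           return 0;
+ *   }
+ */
+
+/*?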
+ * Text: "%s: I/O operations have been resumed on the DASD\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD is no longer in state quiesce and I/O operations can be performed + * on the device. + * User action: + * None. + */ + +/*? + * Text: "%s: The DASD cannot be formatted while it is enabled\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD you try to format is enabled. Enabled devices cannot be formatted. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: The specified DASD is a partition and cannot be formatted\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD you try to format is a partition. Partitions cannot be formatted + * separately. You can only format a complete DASD including all its partitions. + * User action: + * Format the complete DASD. + * ATTENTION: Formatting irreversibly destroys all data on all partitions + * of the DASD. + */ + +/*? + * Text: "%s: The specified DASD is a partition and cannot be checked\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD you try to check is a partition. Partitions cannot be checked + * separately. You can only check a complete DASD including all its partitions. + * User action: + * Check the complete DASD. + */ + +/*? + * Text: "%s: Formatting unit %d failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: start track + * @3: return code + * Description: + * The formatting process might have been interrupted by a signal, for example, + * CTRL+C. If the process was not interrupted intentionally, an I/O error + * might have occurred. + * User action: + * Retry to format the device. If the error persists, check the log file for + * related error messages. If you cannot resolve the error, note the return + * code and contact your support organization. + */ + + +/* dasd */ + +/*? + * Text: "%s: Cancelling request %p failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * @3: return code of previous function + * Description: + * In response to a user action, the DASD device driver tried but failed to + * cancel a previously started I/O operation. + * User action: + * Try the action again. + */ + +/*? + * Text: "%s: Flushing the DASD request queue failed for request %p\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * Description: + * As part of the unloading process, the DASD device driver flushes the + * request queue. This failed because a previously started I/O operation + * could not be canceled. + * User action: + * Try again to unload the DASD device driver or to shut down Linux. + */ + +/*? + * Text: "The DASD device driver could not be initialized\n" + * Severity: Informational + * Description: + * The initialization of the DASD device driver failed because of previous + * errors. + * User action: + * Check for related previous error messages. + */ + +/*? + * Text: "%s: Accessing the DASD failed because it is in probeonly mode\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * The dasd= module or kernel parameter specified the probeonly attribute for + * the DASD you are trying to access. The DASD device driver cannot access + * DASDs that are in probeonly mode. 
+ * User action:
+ * Change the dasd= parameter to omit probeonly for the DASD and reload
+ * the DASD device driver. If the DASD device driver has been compiled into
+ * the kernel, reboot Linux.
+ */
+
+/*?
+ * Text: "%s: cqr %p timed out (%lus), %i retries remaining\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: request
+ * @3: timeout value
+ * @4: number of retries left
+ * Description:
+ * A try of the error recovery procedure (ERP) for the channel queued request
+ * (cqr) timed out and failed to recover the error. ERP continues for the DASD.
+ * User action:
+ * Ignore this message if it occurs infrequently and if the recovery succeeds
+ * during one of the retries. If this error persists, check for related
+ * previous error messages and report the problem to your support organization.
+ *
+ * The timeout can be changed by writing a new value to the sysfs 'expires'
+ * attribute of the DASD. The value specifies the timeout in seconds.
+ */
+
+/*?
+ * Text: "%s: cqr %p timed out (%lus) but cannot be ended, retrying in 5 s\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: request
+ * @3: timeout value
+ * Description:
+ * A try of the error recovery procedure (ERP) for the channel queued request
+ * (cqr) timed out and failed to recover the error. The I/O request submitted
+ * during the try could not be canceled. The ERP waits for 5 seconds before
+ * trying again.
+ * User action:
+ * Ignore this message if it occurs infrequently and if the recovery succeeds
+ * during one of the retries. If this error persists, check for related
+ * previous error messages and report the problem to your support organization.
+ *
+ * The timeout can be changed by writing a new value to the sysfs 'expires'
+ * attribute of the DASD. The value specifies the timeout in seconds.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be set offline while it is in use\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * Description:
+ * The DASD cannot be set offline because it is in use by an internal process.
+ * An action to free the DASD might not have completed yet.
+ * User action:
+ * Wait some time and set the DASD offline later.
+ */
+
+/*?
+ * Text: "%s: The DASD cannot be set offline with open count %i\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: count
+ * Description:
+ * The DASD is being used by one or more processes and cannot be set offline.
+ * User action:
+ * Ensure that the DASD is not in use anymore, for example, unmount all
+ * partitions. Then try again to set the DASD offline.
+ */
+
+/*?
+ * Text: "%s: Setting the DASD online failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: return code
+ * Description:
+ * The DASD could not be set online because of previous errors.
+ * User action:
+ * Look for previous error messages. If you cannot resolve the error, note
+ * the return code and contact your support organization.
+ */
+
+/*?
+ * Text: "%s Setting the DASD online with discipline %s failed with rc=%i\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: discipline
+ * @3: return code
+ * Description:
+ * The DASD could not be set online because of previous errors.
+ * User action:
+ * Look for previous error messages. If you cannot resolve the error, note the
+ * return code and contact your support organization.
+ */
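+
+/*
+ * Editorial note: a typical sequence for setting a DASD online with the
+ * DIAG discipline, assuming device 0.0.0201 (a placeholder) and the
+ * chccwdev tool from the s390-tools package:
+ *
+ *   # modprobe dasd_diag_mod
+ *   # echo 1 > /sys/bus/ccw/devices/0.0.0201/use_diag
+ *   # chccwdev -e 0.0.0201
+ *
+ * The 'use_diag' attribute can only be changed while the device is offline.
+ */
+
+/*?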
+ * Text: "%s Setting the DASD online failed because of missing DIAG discipline\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD was to be set online with discipline DIAG but this discipline of + * the DASD device driver is not available. + * User action: + * Ensure that the dasd_diag_mod module is loaded. If your Linux system does + * not include this module, you cannot set DASDs online with the DIAG + * discipline. + */ + +/*? + * Text: "%s Setting the DASD online failed because of a missing discipline\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD was to be set online with a DASD device driver discipline that + * is not available. + * User action: + * Ensure that all DASD modules are loaded correctly. + */ + +--------------------------- + +/*? + * Text: "The statistics feature has been switched off\n" + * Severity: Informational + * Description: + * The statistics feature of the DASD device driver has been switched off. + * User action: + * None. + */ + +/*? + * Text: "The statistics feature has been switched on\n" + * Severity: Informational + * Description: + * The statistics feature of the DASD device driver has been switched on. + * User action: + * None. + */ + +/*? + * Text: "The statistics have been reset\n" + * Severity: Informational + * Description: + * The DASD statistics data have been reset. + * User action: + * None. + */ + +/*? + * Text: "%s is not a supported value for /proc/dasd/statistics\n" + * Severity: Warning + * Parameter: + * @1: value + * Description: + * An incorrect value has been written to /proc/dasd/statistics. + * The supported values are: 'set on', 'set off', and 'reset'. + * User action: + * Write a supported value to /proc/dasd/statistics. + */ + +/*? + * Text: "%s is not a valid device range\n" + * Severity: Error + * Parameter: + * @1: range + * Description: + * A device range specified with the dasd= parameter is not valid. + * User action: + * Examine the dasd= parameter and correct the device range. + */ + +/*? + * Text: "The probeonly mode has been activated\n" + * Severity: Informational + * Description: + * The probeonly mode of the DASD device driver has been activated. In this + * mode the device driver rejects any 'open' syscalls with EPERM. + * User action: + * None. + */ + +/*? + * Text: "The IPL device is not a CCW device\n" + * Severity: Error + * Description: + * The value for the dasd= parameter contains the 'ipldev' keyword. During + * the boot process this keyword is replaced with the device from which the + * IPL was performed. The 'ipldev' keyword is not valid if the IPL device is + * not a CCW device. + * User action: + * Do not specify the 'ipldev' keyword when performing an IPL from a device + * other than a CCW device. + */ + +/*? + * Text: "A closing parenthesis ')' is missing in the dasd= parameter\n" + * Severity: Warning + * Description: + * The specification for the dasd= kernel or module parameter has an opening + * parenthesis '(' * without a matching closing parenthesis ')'. + * User action: + * Correct the parameter value. + */ + +/*? + * Text: "The autodetection mode has been activated\n" + * Severity: Informational + * Description: + * The autodetection mode of the DASD device driver has been activated. In + * this mode the DASD device driver sets all detected DASDs online. + * User action: + * None. + */ + +/*? 
+ * Text: "%*s is not a supported device option\n" + * Severity: Warning + * Parameter: + * @1: length of option code + * @2: option code + * Description: + * The dasd= parameter includes an unknown option for a DASD or a device range. + * Options are specified in parenthesis and immediately follow a device or + * device range. + * User action: + * Check the dasd= syntax and remove any unsupported options from the dasd= + * parameter specification. + */ + +/*? + * Text: "PAV support has be deactivated\n" + * Severity: Informational + * Description: + * The 'nopav' keyword has been specified with the dasd= kernel or module + * parameter. The Parallel Access Volume (PAV) support of the DASD device + * driver has been deactivated. + * User action: + * None. + */ + +/*? + * Text: "'nopav' is not supported on z/VM\n" + * Severity: Informational + * Description: + * For Linux instances that run as guest operating systems of the z/VM + * hypervisor Parallel Access Volume (PAV) support is controlled by z/VM not + * by Linux. + * User action: + * Remove 'nopav' from the dasd= module or kernel parameter specification. + */ + +/*? + * Text: "High Performance FICON support has been deactivated\n" + * Severity: Informational + * Description: + * The 'nofcx' keyword has been specified with the dasd= kernel or module + * parameter. The High Performance FICON (transport mode) support of the DASD + * device driver has been deactivated. + * User action: + * None. + */ + +/*? + * Text: "The dasd= parameter value %s has an invalid ending\n" + * Severity: Warning + * Parameter: + * @1: parameter value + * Description: + * The specified value for the dasd= kernel or module parameter is not correct. + * User action: + * Check the module or the kernel parameter. + */ + +/*? + * Text: "Registering the device driver with major number %d failed\n" + * Severity: Warning + * Parameter: + * @1: DASD major + * Description: + * Major number 94 is reserved for the DASD device driver. The DASD device + * driver failed to register with this major number. Another device driver + * might have used major number 94. + * User action: + * Determine which device driver uses major number 94 instead of the DASD + * device driver and unload this device driver. Then try again to load the + * DASD device driver. + */ + +/*? + * Text: "%s: default ERP has run out of retries and failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The error recovery procedure (ERP) tried to recover an error but the number + * of retries for the I/O was exceeded before the error could be resolved. + * User action: + * Check for related previous error messages. + */ + +/*? + * Text: "%s: Unable to terminate request %p on suspend\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * Description: + * As part of the suspend process, the DASD device driver terminates requests + * on the request queue. This failed because a previously started I/O operation + * could not be canceled. The suspend process will be stopped. + * User action: + * Try again to suspend the system. + */ + +/*? + * Text: "%s: ERP failed for the DASD\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error recovery procedure (ERP) was performed for the DASD but failed. + * User action: + * Check the message log for previous related error messages. + */ + +/*? 
+ * Text: "%s: An error occurred in the DASD device driver, reason=%s\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This problem indicates a program error in the DASD device driver. + * User action: + * Note the reason code and contact your support organization. +*/ + +/*? + * Text: "%s: No operational channel path is left for the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * All channel paths to the device have become non-operational. The DASD + * device driver suspends I/O operations and queues I/O requests for this + * device until at least one channel path becomes operational again. + * User action: + * Ensure that each channel path to the device has been set up correctly + * and that the related physical cable connections are in place. + */ + +/*? + * Text: "%s: No verified channel paths remain for the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * All verified channel paths to the device have become non-operational. + * Any other paths to the device have previously been identified as not usable. + * The DASD device driver suspends I/O operations and queues I/O requests + * for this device until at least one channel path becomes operational + * again. + * User action: + * Ensure that each channel path to the device has been set up correctly + * and that the related physical cable connections are in place. + * Set all paths to the device offline and online again to repeat the path + * verification. Alternatively, set the device offline and online again to + * verify all available paths for this device. + * If this problem persists, gather Linux debug data and report the problem + * to your support organization. + */ + +/*? + * Text: "%s: A channel path to the device has become operational\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * At least one channel path of this device has become operational again. + * The DASD device driver resumes I/O operations to the device and processes + * the I/O requests that were queued while there was no operational channel path. + * User action: + * None. + */ + +------------------------------------------------------------------------------------ +/* dasd_diag */ + +/*? + * Text: "%s: A 64-bit DIAG call failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * 64-bit DIAG calls require a 64-bit z/VM version. + * User action: + * Use z/VM 5.2 or later or set the sysfs 'use_diag' attribute of the DASD to 0 + * to switch off DIAG. + */ + +/*? + * Text: "%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * The format of the DASD is not correct. + * User action: + * Check the device format. For details about the return code see the + * section about the INITIALIZE function for DIAGNOSE Code X'250' + * in "z/VM CP Programming Services". If you cannot resolve the error, note + * the return code and contact your support organization. + */ + +/*? + * Text: "%s: New DASD with %ld byte/block, total size %ld KB%s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: bytes per block + * @3: size + * @4: access mode + * Description: + * A DASD with the indicated block size and total size has been set online. 
+ * If the DASD is configured as read-only to the real or virtual hardware,
+ * the message includes an indication of this hardware access mode. The
+ * hardware access mode is independent from the 'readonly' attribute of
+ * the device in sysfs.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: DIAG ERP failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: return code
+ * Description:
+ * An error in the DIAG processing could not be recovered by the error
+ * recovery procedure (ERP) of the DIAG discipline.
+ * User action:
+ * Note the return code, check for related I/O errors, and report this problem
+ * to your support organization.
+ */
+
+/*?
+ * Text: "%s: DIAG initialization failed with rc=%d\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: return code
+ * Description:
+ * Initializing the DASD with the DIAG discipline failed. Possible reasons for
+ * this problem are that the device has a device type other than FBA or ECKD,
+ * or has a block size other than one of the supported sizes:
+ * 512 byte, 1024 byte, 2048 byte, or 4096 byte.
+ * User action:
+ * Ensure that the device can be written to and has a supported device type
+ * and block size. For details about the return code see the section about
+ * the INITIALIZE function for DIAGNOSE Code X'250' in "z/VM CP Programming
+ * Services". If you cannot resolve the error, note the error code and contact
+ * your support organization.
+ */
+
+/*?
+ * Text: "%s: Device type %d is not supported in DIAG mode\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: device type
+ * Description:
+ * Only DASD of type FBA and ECKD are supported in DIAG mode.
+ * User action:
+ * Set the sysfs 'use_diag' attribute of the DASD to 0 and try again to access
+ * the DASD.
+ */
+
+/*?
+ * Text: "Discipline %s cannot be used without z/VM\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: discipline name
+ * Description:
+ * The discipline that is specified with the dasd= kernel or module parameter
+ * is only available for Linux instances that run as guest operating
+ * systems of the z/VM hypervisor.
+ * User action:
+ * Remove the unsupported discipline from the parameter string.
+ */
+
+/*?
+ * Text: "%s: The access mode of a DIAG device changed to read-only\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * Description:
+ * A device changed its access mode from writeable to
+ * read-only while in use.
+ * User action:
+ * Set the device offline, ensure that the device is configured correctly in
+ * z/VM, then set the device online again.
+ */
+
+------------------------------------------------------------------------------------
+/* dasd_erp */
+
+/*?
+ * Text: "%s: A timeout error occurred for cqr %p\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: pointer to request
+ * Description:
+ * A channel queued request (cqr) failed because it timed out.
+ * One possible reason for this error is that a request did not
+ * complete within the timeout interval specified for the DASD.
+ * The timeout interval is set as the value of the 'timeout' sysfs
+ * attribute of a DASD. A value of 0 disables the timeout function.
+ * The timeout function can be used, for example, by mirroring setups
+ * to quickly process a request queue for a DASD that has become unavailable.
+ * User action:
+ * Check the message log for previous related error messages. Verify
+ * that the storage server and the connection from host to storage
+ * server are operational. If the 'timeout' sysfs attribute of the
+ * DASD has been set to a value other than 0, verify that this
+ * setting is intentional and change it if required.
+ */
+
+/*?
+ * Text: "%s: A transport error occurred for cqr %p\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: pointer to request
+ * Description:
+ * A channel queued request (cqr) failed because the connection to the
+ * device was lost and the 'failfast' flag is set for the request.
+ * This flag can result from, for example:
+ *
+ * - A software layer above the DASD device driver,
+ * for example, in a host-based mirroring setup.
+ *
+ * - Value 1 for the 'failfast' sysfs attribute of the DASD.
+ * This setting applies to all requests on the DASD.
+ *
+ * User action:
+ * Ensure that each channel path to the device has been set up
+ * correctly and that the related physical cable connections are in
+ * place. If the 'failfast' attribute of the DASD is set to 1,
+ * verify that this setting is intentional and change it to 0 if required.
+ */
+
+/*?
+ * Text: "%s Setting the DASD online failed because the required module %s could not be loaded (rc=%d)\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: kernel module name
+ * @3: return code
+ * Description:
+ * The DASD was to be set online with discipline DIAG but this discipline of
+ * the DASD device driver is not available and an attempt to load the
+ * corresponding kernel module failed with the specified return code.
+ *
+ * User action:
+ * Ensure that the kernel module with the specified name is correctly installed
+ * or set the sysfs 'use_diag' attribute of the DASD to 0 to switch off DIAG.
+ */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/dasd-eckd
+++ linux-azure-4.13.0/Documentation/kmsg/s390/dasd-eckd
@@ -0,0 +1,2154 @@
+/* dasd_eckd */
+
+/*?
+ * Text: "%s: ERP failed for the DASD\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * Description:
+ * An error recovery procedure (ERP) was performed for the DASD but failed.
+ * User action:
+ * Check the message log for previous related error messages.
+ */
+
+/*?
+ * Text: "%s: An error occurred in the DASD device driver, reason=%s\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: reason code
+ * Description:
+ * This problem indicates a program error in the DASD device driver.
+ * User action:
+ * Note the reason code and contact your support organization.
+ */
+
+/*?
+ * Text: "%s: Allocating memory for private DASD data failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the DASD
+ * Description:
+ * The DASD device driver maintains data structures for each DASD it manages.
+ * There is not enough memory to allocate these data structures for one or
+ * more DASD.
+ * User action:
+ * Free some memory and try the operation again.
+ */
+
+/*?
+ * Text: "%s: DASD with %d KB/block, %d KB total size, %d KB/track, %s\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: block size
+ * @3: DASD size
+ * @4: track size
+ * @5: disc layout
+ * Description:
+ * A DASD with the shown characteristics has been set online.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: Start track number %u used in formatting is too big\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: track number + * Description: + * The DASD format I/O control was used incorrectly by a formatting tool. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: Stop track number %u used in formatting is too big\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: track number + * Description: + * The DASD format I/O control was used incorrectly by a formatting tool. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: The DASD is not formatted\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A DASD has been set online but it has not been formatted yet. You must + * format the DASD before you can use it. + * User action: + * Format the DASD, for example, with dasdfmt. + */ + +/*? + * Text: "%s: 0x%x is not a known command\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: command + * Description: + * This problem is likely to be caused by a programming error. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "%s: Track 0 has no records following the VTOC\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Linux has identified a volume table of contents (VTOC) on the DASD but + * cannot read any data records following the VTOC. A possible cause of this + * problem is that the DASD has been used with another System z operating + * system. + * User action: + * Format the DASD for usage with Linux, for example, with dasdfmt. + * ATTENTION: Formatting irreversibly destroys all data on the DASD. + */ + +/*? + * Text: "%s: An I/O control call used incorrect flags 0x%x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: flags + * Description: + * The DASD format I/O control was used incorrectly. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: New DASD %04X/%02X (CU %04X/%02X) with %d cylinders, %d heads, %d sectors%s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: device type + * @3: device model + * @4: control unit type + * @5: control unit model + * @6: number of cylinders + * @7: tracks per cylinder + * @8: sectors per track + * @9: access mode + * Description: + * A DASD with the shown characteristics has been set online. + * If the DASD is configured as read-only to the real or virtual hardware, + * the message includes an indication of this hardware access mode. The + * hardware access mode is independent from the 'readonly' attribute of + * the device in sysfs. + * User action: + * None. + */ + +/*? + * Text: "%s: The disk layout of the DASD is not supported\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD device driver only supports the following disk layouts: CDL, LDL, + * FBA, CMS, and CMS RESERVED. + * User action: + * None. + */ + +/*? + * Text: "%s: Start track %u used in formatting exceeds end track\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: track number + * Description: + * The DASD format I/O control was used incorrectly by a formatting tool. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? 
+ * Text: "%s: The DASD cache mode was set to %x (%i cylinder prestage)\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: operation mode + * @3: number of cylinders + * Description: + * The DASD cache mode has been changed. See the storage system documentation + * for information about the different cache operation modes. + * User action: + * None. + */ + +/*? + * Text: "%s: The DASD cannot be formatted with block size %u\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: block size + * Description: + * The block size specified for a format instruction is not valid. The block + * size must be between 512 and 4096 byte and must be a power of 2. + * User action: + * Call the format command with a supported block size. + */ + +/*? + * Text: "%s: The UID of the DASD has changed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The Unique Identifier (UID) of a DASD that is currently in use has changed. + * This indicates that the physical disk has been replaced. + * User action: + * None if the replacement was intentional. + * If the disk change is not expected, stop using the disk to prevent possible + * data loss. +*/ + + +/* dasd_3990_erp */ + +/*? + * Text: "%s: is offline or not installed - INTERVENTION REQUIRED!!\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD to be accessed is not in an accessible state. The I/O operation + * will wait until the device is operational again. This is an operating system + * independent message that is issued by the storage system. + * User action: + * Make the DASD accessible again. For details see the storage system + * documentation. + */ + +/*? + * Text: "%s: The DASD cannot be reached on any path (lpum=%x/opm=%x)\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: last path used mask + * @3: online path mask + * Description: + * After a path to the DASD failed, the error recovery procedure of the DASD + * device driver tried but failed to reconnect the DASD through an alternative + * path. + * User action: + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. Check the file systems on the DASD when it is + * accessible again. + */ + +/*? + * Text: "%s: Unable to allocate DCTL-CQR\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an internal error. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Parameter\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A data argument of a command is not valid. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - DPS Installation Check\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This operating system independent message is issued by the storage system + * for one of the following reasons: + * - A 3380 Model D or E DASD does not have the Dynamic Path Selection (DPS) + * feature in the DASD A-unit. + * - The device type of an attached DASD is not supported by the firmware. + * - A type 3390 DASD is attached to a 3 MB channel. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 2 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Drive motor switch is off\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - CCW Count less than required\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The CCW count of a command is less than required. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Channel requested ... %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This is an operating system independent message that is issued by the + * storage system. The possible reason codes indicate the following problems: + * 00 No Message. + * 01 The channel has requested unit check sense data. + * 02 The channel has requested retry and retry is exhausted. + * 03 A SA Check-2 error has occurred. This sense is presented with + * Equipment Check. + * 04 The channel has requested retry and retry is not possible. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Status Not As Required: reason %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This is an operating system independent message that is issued by the + * storage system. There are several potential reasons for this message; + * byte 8 contains the reason code. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device status 1 not valid\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Storage Path Restart\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * An operation for an active channel program was queued in a Storage Control + * when a warm start was received by the path. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Reset Notification\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A system reset or its equivalent was received on an interface. 
The Unit + * Check that generates this sense is posted to the next channel initiated + * selection following the resetting event. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Command Sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * An incorrect sequence of commands has occurred. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Missing device address bit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Subsystem Processing Error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A firmware logic error has been detected. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Seek incomplete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Command\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command was issued that is not in the 2107/1750 command set. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Command Invalid on Secondary Address\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command or order not allowed on a PPRC secondary device has been received + * by the secondary device. This is an operating system independent message + * that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Defective/Alternate Track Pointer\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A defective track has been accessed. The subsystem generates an invalid + * Defective/Alternate Track Pointer as a part of RAID Recovery. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 0 - Channel Returned with Incorrect retry CCW\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command portion of the CCW returned after a command retry sequence does + * not match the command for which retry was signaled. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Diagnostic of Special Command Violates File Mask\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command is not allowed under the Access Authorization specified by the + * File Mask. This is an operating system independent message that is issued + * by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Head address does not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device did not respond to selection\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device check-2 error or Set Sector is not complete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Device Error Source\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The device has completed soft error logging. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Data Pinned for Device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Modified data in cache or in persistent storage exists for the DASD. The + * data cannot be destaged to the device. This track is the first track pinned + * for this device. This is an operating system independent message that is + * issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel C\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 1 - Device Status 1 not as expected\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Device Fenced - device = %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: sense data byte 4 + * Description: + * The device shown in sense byte 4 has been fenced. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Interruption cannot be reset\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Index missing\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - DASD Fast Write inhibited\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * DASD Fast Write is not allowed because of a nonvolatile storage battery + * check condition. This is an operating system independent message that is + * issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid tag-in for an extended command sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Key area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Count area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Track physical address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - 3990 check-2 error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 1 - Offset active cannot be reset\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - RCC 1 and RCC 2 sequences not successful\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in count address area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Data area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel A\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in count address area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the key area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Caching status reset to default\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage director has assigned two new subsystem status devices and + * resets the status to its default value. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the data area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 1 - Device not ready\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in key area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - DASD controller failed to set or reset the long busy latch\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Cylinder address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 3 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in data area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - Support facility errors\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Key area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - End operation with transfer count not zero\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - Microcode detected error %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: error code + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 5 - Data Check in the count area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 3 - Allegiance terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Allegiance terminated because of a Reset Allegiance or an Unconditional + * Reserve command on another channel. This is an operating system independent + * message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Home address area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Count area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid tag-in during selection sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in data area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in home address area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Home address area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Data area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in home address area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 5 - Data Check in the home address area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the home address area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the count area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in key area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid DCC selection response or timeout\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the data area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Operation Terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage system ends an operation related to an active channel program + * when termination and redrive are required and logging is not desired. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel B\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the key area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Volume is suspended duplex\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The duplex pair volume has entered the suspended duplex state because of a + * failure. This is an operating system independent message that is issued by + * the storage system. 
+ * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel D\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - RCC 1 sequence not successful\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel E\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - 3990 microcode time out when stopping selection\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel F\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - RCC initiated by a connection check alert\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel G\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - extra RCC required\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel H\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 8 - Unexpected end operation response code\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Permanent path error (DASD controller not available)\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Missing end operation; device transfer incomplete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Cache or nonvolatile storage equipment failure\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * An equipment failure has occurred in the cache storage or nonvolatile + * storage of the storage system. This is an operating system independent + * message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - DPS cannot be filled\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - Error correction code hardware fault\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Missing end operation; device transfer complete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - DASD controller not available on disconnected command chain\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - No interruption from device during a command chain\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. 
+ */ + +/*? + * Text: "%s: FORMAT 7 - No response to selection after a poll interruption\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Track physical address did not compare while oriented\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Head address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid tag-in for an immediate command sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Cylinder address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - DPS checks after a system reset or selective reset\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Caching reinitiated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Caching has been automatically reinitiated following an error. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - End operation with transfer count zero\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT 8 - Short busy time-out during device selection\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Caching terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage system was unable to initiate caching or had to suspend caching + * for a 3990 control unit. If this problem is caused by a failure condition, + * an additional message will provide more information about the failure. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * Check for additional messages that point out possible failures. For more + * information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Subsystem status cannot be determined\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The status of a DASD Fast Write or PPRC volume cannot be determined. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Nonvolatile storage terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage director has stopped using nonvolatile storage or cannot + * initiate nonvolatile storage. If this problem is caused by a failure, an + * additional message will provide more information about the failure. This is + * an operating system independent message that is issued by the storage system. + * User action: + * Check for additional messages that point out possible failures. For more + * information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: Write inhibited path encountered\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an informational message. + * User action: + * None. + */ + +/*? + * Text: "%s: FORMAT 9 - Device check-2 error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Track format incorrect\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A track format error occurred while data was being written to the DASD or + * while a duplex pair was being established. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? 
+ * Text: "%s: FORMAT F - Cache fast write access not authorized\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A request for Cache Fast Write Data access cannot be satisfied because + * of missing access authorization for the storage system. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: Data recovered during retry with PCI fetch mode active\n" + * Severity: Emerg + * Parameter: + * @1: bus ID of the DASD + * Description: + * A data error has been recovered on the storages system but the Linux file + * system cannot be informed about the data mismatch. To prevent Linux from + * running with incorrect data, the DASD device driver will trigger a kernel + * panic. + * User action: + * Reset your real or virtual hardware and reboot Linux. + */ + +/*? + * Text: "%s: The specified record was not found\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The record to be accessed does not exist. The DASD might be unformatted + * or defect. + * User action: + * Try to format the DASD or replace it. + * ATTENTION: Formatting irreversibly destroys all data on the DASD. + */ + +/*? + * Text: "%s: ERP %p (%02x) refers to %p\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to ERP + * @3: ERP status + * @4: cqr + * Description: + * This message provides debug information for the enhanced error recovery + * procedure (ERP). + * User action: + * If you do not need this information, you can suppress this message by + * switching off ERP logging, for example, by writing '1' to the 'erplog' + * sysfs attribute of the DASD. + */ + +/*? + * Text: "%s: ERP chain at END of ERP-ACTION\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This message provides debug information for the enhanced error recovery + * procedure (ERP). + * User action: + * If you do not need this information, you can suppress this message by + * switching off ERP logging, for example, by writing '1' to the 'erplog' + * sysfs attribute of the DASD. + */ + +/*? + * Text: "%s: The cylinder data for accessing the DASD is inconsistent\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error occurred in the storage system hardware. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: Accessing the DASD failed because of a hardware error\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error occurred in the storage system hardware. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: ERP chain at BEGINNING of ERP-ACTION\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This message provides debug information for the enhanced error recovery + * procedure (ERP). + * User action: + * If you do not need this information, you can suppress this message by + * switching off ERP logging, for example, by writing '1' to the 'erplog' + * sysfs attribute of the DASD. + */ + +/*? 
+ * Text: "%s: ERP %p has run out of retries and failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: ERP pointer + * Description: + * The error recovery procedure (ERP) tried to recover an error but the number + * of retries for the I/O was exceeded before the error could be resolved. + * User action: + * Check for related previous error messages. + */ + +/*? + * Text: "%s: SIM - SRC: %02x%02x%02x%02x\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: sense byte + * @3: sense byte + * @4: sense byte + * @5: sense byte + * Description: + * This error message is a System Information Message (SIM) generated by the + * storage system. The System Reference Code (SRC) defines the error in detail. + * User action: + * Look up the SRC in the storage server documentation. + */ + +/*? + * Text: "%s: log SIM - SRC: %02x%02x%02x%02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: sense byte + * @3: sense byte + * @4: sense byte + * @5: sense byte + * Description: + * This System Information Message (SIM) is generated by the storage system. + * The System Reference Code (SRC) defines the error in detail. + * User action: + * Look up the SRC in the storage server documentation. + */ + +/*? + * Text: "%s: Reading device feature codes failed with rc=%d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * The device feature codes state which advanced features are supported by a + * device. + * Examples for advanced features are PAV or high performance FICON. + * Some early devices do not provide feature codes and no advanced features are + * available on these devices. + * User action: + * None, if the DASD does not provide feature codes. If the DASD provides + * feature codes, make sure that it is working correctly, then set it offline + * and back online. + */ + +/*? + * Text: "%s: A channel path group could not be established\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Initialization of a DASD did not complete because a channel path group + * could not be established. + * User action: + * Make sure that the DASD is working correctly, then try again to set it + * online. If initialization still fails, reboot. + */ + +/*? + * Text: "%s: The DASD is not operating in multipath mode\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD channel path group could not be configured to use multipath mode. + * This might negatively affect I/O performance on this DASD. + * User action: + * Make sure that the DASD is working correctly, then try again to set it + * online. If initialization still fails, reboot. + */ + +/*? + * Text: "%s: Detecting the DASD disk layout failed because of an I/O error\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The disk layout of the DASD could not be detected because of an unexpected + * I/O error. The DASD device driver treats the device like an unformatted DASD, + * and partitions on the device are not accessible. + * User action: + * If the DASD is formatted, make sure that the DASD is working correctly, + * then set it offline and back online. If the DASD is unformatted, format the + * DASD, for example, with dasdfmt. + * ATTENTION: Formatting irreversibly destroys all data on the DASD. + */ + +/*? 
+ * Text: "%s: An I/O request was rejected because writing is inhibited\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An I/O request was returned with an error indication of 'command reject' + * and 'write inhibited'. The most likely reason for this error is a + * failed write request to a device that was attached as read-only in z/VM. + * User action: + * Set the device offline, ensure that the device is configured correctly in + * z/VM, then set the device online again. + */ + +/*? + * Text: "%s: An Alias device was reassigned to a new base device with UID: %s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the alias + * @2: UID of new base device + * Description: + * The alias device with the indicated bus ID has been reassigned. The UID of the new base device is shown in the message. + * User action: + * None. + */ + +/*? + * Text: "%s: Detecting the maximum supported data size for zHPF requests failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * High Performance FICON (zHPF) requests are limited to a hardware-dependent + * maximum data size. The DASD device driver failed to detect this size and zHPF + * is not available for this device. + * User action: + * Set the device offline and online again. If this problem persists, gather + * Linux debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Reading device feature codes failed (rc=%d) for new path %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * @3: path mask + * Description: + * A new path has been made available to the a device. + * A command to read the device feature codes on this device returned an error. + * The new path will not be used for I/O. + * User action: + * Set the new path offline and online again to repeat the path verification. + * Alternatively, set the device offline and online again to + * verify all available paths for this device. + * If this problem persists, gather Linux debug data and report the problem + * to your support organization. + */ + +/*? + * Text: "%s: Detecting the maximum data size for zHPF requests failed (rc=%d) for a new path %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * @3: path mask + * Description: + * High Performance FICON (zHPF) requests are limited to a hardware-dependent + * maximum data size. A command to detect this size for + * a new path returned an error. The new path will not be used for I/O. + * User action: + * Set the new path offline and online again to repeat the path verification. + * Alternatively, set the device offline and online again to + * verify all available paths for this device. + * If this problem persists, gather Linux debug data and report the problem + * to your support organization. + */ + +/*? + * Text: "%s: The maximum data size for zHPF requests %u on a new path %x is below the active maximum %u\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: size in bytes + * @3: path mask + * @4: size in bytes + * Description: + * High Performance FICON (zHPF) requests are limited to a hardware-dependent + * maximum data size. The maximum of the new path is below + * the previously established common maximum for the + * existing paths for this device. This could cause requests on the new + * path to fail. The new path will not be used for I/O. 
+ * User action:
+ * Set the device offline and online again to establish a new common maximum
+ * data size for the device.
+ */
+
+/*?
+ * Text: "%s: The device reservation was lost\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * Description:
+ * This Linux instance has lost its reservation of the device to another
+ * operating system instance. Depending on the reservation policy for the
+ * device, I/O might be blocked until the other operating system instance
+ * surrenders the reservation or all I/O requests might fail until the
+ * device is reset.
+ * User action:
+ * None, if this situation is handled by system automation software.
+ * If this situation is not handled by automation, check the
+ * last_known_reservation_state attribute of the device in sysfs.
+ * If the value is 'lost', verify that the device is no longer reserved
+ * by another operating system instance, then set the device offline and
+ * online again. For any other value of the last_known_reservation_state
+ * no action is required. I/O will resume when the device reservation is
+ * surrendered by the other operating system instance.
+ */
+
+/*?
+ * Text: "%s: The storage server does not support raw-track access\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * Description:
+ * The DASD cannot be accessed in raw-track access mode because the storage
+ * server does not have all required features for this access mode.
+ * In raw-track access mode, the DASD device driver accesses complete ECKD
+ * tracks.
+ * By default, the DASD device driver accesses only the data fields of ECKD
+ * devices and omits the count and key data fields.
+ * User action:
+ * Ensure that the raw_track_access sysfs attribute of the DASD has the value
+ * 0 to access the device in default ECKD mode.
+ */
+
+/*?
+ * Text: "%s: The newly added channel path %02X will not be used because it leads to a different device %s\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: logical path mask
+ * @3: UID
+ * Description:
+ * The newly added channel path has a different UID than the DASD device.
+ * This indicates incorrect cabling. This path will not be used.
+ * User action:
+ * Check the cabling of the DASD device. Disconnect and reconnect the cable.
+ */
+
+/*?
+ * Text: "%s: Not all channel paths lead to the same device, path %02X leads to device %s instead of %s\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the DASD
+ * @2: logical path mask
+ * @3: UID
+ * @4: UID
+ * Description:
+ * Some channel paths have a different UID than others. This indicates
+ * incorrect cabling. The DASD device is not enabled.
+ * User action:
+ * Check the cabling of the DASD device and try again to enable the device.
+ */
+
+/*?
+ * Text: "Service on the storage server caused path %x.%02x to go offline"
+ * Severity: Warning
+ * Parameter:
+ * @1: channel subsystem ID
+ * @2: CHPID
+ * Description:
+ * A channel path to the DASD has been set offline because of
+ * a service action on the storage server. The path will be set back
+ * online automatically when the service action is completed.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Path %x.%02x is back online after service on the storage server"
+ * Severity: Informational
+ * Parameter:
+ * @1: channel subsystem ID
+ * @2: CHPID
+ * Description:
+ * A path had been set offline temporarily because of a service
+ * action on the storage server.
+ * The service action has completed, and the channel path is available
+ * again.
+ * User action: + * None. + */ + +/*? + * Text: "%s: High Performance FICON disabled\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * High Performance FICON (HPF) has been disabled. Either the device + * lost HPF functionality, or none of the remaining channel paths are + * HPF capable. + * User action: + * Report the problem to your support organization. + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. + * Reset the device and channel paths by writing "all" or a logical path mask + * to the path_reset sysfs attribute of the device. + */ + +/*? + * Text: "%s: Channel path %02X lost HPF functionality and is disabled\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: logical path mask + * Description: + * A channel path has lost High Performance FICON (HPF) functionality + * and was removed from regular operations. + * User action: + * Report the problem to your support organization. + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. + * Reset the device and channel paths by writing "all" or a logical path mask + * to the path_reset sysfs attribute of the device. + */ + +/*? + * Text: "%s: Path %x.%02x (pathmask %02x) is disabled - IFCC threshold exceeded\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: cssid + * @3: chpid + * @4: logical path mask + * Description: + * Due to numerous interface or channel control checks (IFCCs), a channel path + * was removed from regular operations to retain good I/O performance. + * User action: + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. + * Reset the device and channel paths by writing "all" or a logical path mask + * to the path_reset sysfs attribute of the device. + * If the problem persists, report it to your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/dasd-fba +++ linux-azure-4.13.0/Documentation/kmsg/s390/dasd-fba @@ -0,0 +1,36 @@ + +/*? + * Text: "%s: New FBA DASD %04X/%02X (CU %04X/%02X) with %d MB and %d B/blk%s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: device type + * @3: device model + * @4: control unit type + * @5: control unit model + * @6: size + * @7: bytes per block + * @8: access mode + * Description: + * A DASD with the shown characteristics has been set online. + * If the DASD is configured as read-only to the real or virtual hardware, + * the message includes an indication of this hardware access mode. The + * hardware access mode is independent from the 'readonly' attribute of + * the device in sysfs. + * User action: + * None. + */ + +/*? + * Text: "%s: Allocating memory for private DASD data failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD device driver maintains data structures for each DASD it manages. + * There is not enough memory to allocate these data structures for one or + * more DASD. + * User action: + * Free some memory and try the operation again. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/dcssblk +++ linux-azure-4.13.0/Documentation/kmsg/s390/dcssblk @@ -0,0 +1,206 @@ +/*? 
+ * Text: "Adjacent DCSSs %s and %s are not contiguous\n" + * Severity: Error + * Parameter: + * @1: name 1 + * @2: name 2 + * Description: + * You can only map a set of two or more DCSSs to a single DCSS device if the + * DCSSs in the set form a contiguous memory space. The DCSS device cannot be + * created because there is a memory gap between two adjacent DCSSs. + * User action: + * Ensure that you have specified all DCSSs that belong to the set. Check the + * definitions of the DCSSs on the z/VM hypervisor to verify that they form + * a contiguous memory space. + */ + +/*? + * Text: "DCSS %s and DCSS %s have incompatible types\n" + * Severity: Error + * Parameter: + * @1: name 1 + * @2: name 2 + * Description: + * You can only map a set of two or more DCSSs to a single DCSS device if + * either all DCSSs in the set have the same type or if the set contains DCSSs + * of the two types EW and EN but no other type. The DCSS device cannot be + * created because at least two of the specified DCSSs are not compatible. + * User action: + * Check the definitions of the DCSSs on the z/VM hypervisor to verify that + * their types are compatible. + */ + +/*? + * Text: "DCSS %s is of type SC and cannot be loaded as exclusive-writable\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * You cannot load a DCSS device in exclusive-writable access mode if the DCSS + * devise maps to one or more DCSSs of type SC. + * User action: + * Load the DCSS in shared access mode. + */ + +/*? + * Text: "DCSS device %s is removed after a failed access mode change\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * To change the access mode of a DCSS device, all DCSSs that map to the device + * were unloaded. Reloading the DCSSs for the new access mode failed and the + * device is removed. + * User action: + * Look for related messages to find out why the DCSSs could not be reloaded. + * If necessary, add the device again. + */ + +/*? + * Text: "All DCSSs that map to device %s are saved\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request has been submitted for the DCSS device. Changes to all DCSSs + * that map to the device are saved permanently. + * User action: + * None. + */ + +/*? + * Text: "Device %s is in use, its DCSSs will be saved when it becomes idle\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request for the device has been deferred until the device becomes + * idle. Then changes to all DCSSs that the device maps to will be saved + * permanently. + * User action: + * None. + */ + +/*? + * Text: "A pending save request for device %s has been canceled\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request for the DCSSs that map to a DCSS device has been pending + * while the device was in use. This save request has been canceled. Changes to + * the DCSSs will not be saved permanently. + * User action: + * None. + */ + +/*? + * Text: "Loaded %s with total size %lu bytes and capacity %lu sectors\n" + * Severity: Informational + * Parameter: + * @1: DCSS names + * @2: total size in bytes + * @3: total size in 512 byte sectors + * Description: + * The listed DCSSs have been verified as contiguous and successfully loaded. + * The displayed sizes are the sums of all DCSSs. + * User action: + * None. + */ + +/*? 
+ * Text: "Device %s cannot be removed because it is not a known device\n" + * Severity: Warning + * Parameter: + * @1: device name + * Description: + * The DCSS device you are trying to remove is not known to the DCSS device + * driver. + * User action: + * List the entries under /sys/devices/dcssblk/ to see the names of the + * existing DCSS devices. + */ + +/*? + * Text: "Device %s cannot be removed while it is in use\n" + * Severity: Warning + * Parameter: + * @1: device name + * Description: + * You are trying to remove a device that is in use. + * User action: + * Make sure that all users of the device close the device before you try to + * remove it. + */ + +/*? + * Text: "Device %s has become idle and is being saved now\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request for the DCSSs that map to a DCSS device has been pending + * while the device was in use. The device has become idle and all changes + * to the DCSSs are now saved permanently. + * User action: + * None. + */ + +/*? + * Text: "Writing to %s failed because it is a read-only device\n" + * Severity: Warning + * Parameter: + * @1: device name + * Description: + * The DCSS device is in shared access mode and cannot be written to. Depending + * on the type of the DCSSs that the device maps to, you might be able to + * change the access mode to exclusive-writable. + * User action: + * If the DCSSs of the device are of type SC, do not attempt to write to the + * device. If the DCSSs of the device are of type ER or SR, change the access + * mode to exclusive-writable before writing to the device. + */ + +/*? + * Text: "The address range of DCSS %s changed while the system was suspended\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * After resuming the system, the start address or end address of a DCSS does + * not match the address when the system was suspended. DCSSs must not be + * changed after the system was suspended. + * This error cannot be recovered. The system is stopped with a kernel panic. + * User action: + * Reboot Linux. + */ + +/*? + * Text: "Suspending the system failed because DCSS device %s is writable\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * A system cannot be suspended if one or more DCSSs are accessed in exclusive- + * writable mode. DCSS segment types EW, SW, and EN are always writable and + * must be removed before a system is suspended. + * User action: + * Remove all DCSSs of segment types EW, SW, and EN by writing the DCSS name to + * the sysfs 'remove' attribute. Set the access mode for all DCSSs of segment + * types SR and ER to read-only by writing 1 to the sysfs 'shared' attribute of + * the DCSS. Then try again to suspend the system. + */ + +/*? + * Text: "DCSS %s is of type SN or EN and cannot be saved\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * Description: + * DCSSs of type SN or EN cannot be saved. + * User action: + * If the DCSS was set up with the intention to prevent the content from being saved, + * no action is necessary. + * To be able to save the content, you must define the DCSS with a type other than SN or EN. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/diag288_wdt +++ linux-azure-4.13.0/Documentation/kmsg/s390/diag288_wdt @@ -0,0 +1,66 @@ +/*? 
+ * Text: "The watchdog cannot be activated\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to activate the diag288 watchdog. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays inactive. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "The watchdog cannot be initialized\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to initialize the diag288 watchdog. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays inactive. + * A possible reason for this error is that your real or virtual hardware does not support + * the diag288 watchdog. + * User action: + * Confirm that the diag288 watchdog is supported in your environment. + * Use a watchdog that is supported in your environment. + */ + +/*? + * Text: "The watchdog cannot be deactivated\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to deactivate the diag288 watchdog. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays active and a watchdog timeout will trigger the configured timeout action. + * The diag288 watchdog device driver might intentionally be configured to prevent deactivation. + * User action: + * You can configure the diag288 watchdog device driver such that it can be deactivated. + * If the diag288 device driver has been compiled as a separate module, diag288_wdt, reload the module + * without specifying the 'nowayout' module parameter. + * If the diag288 device driver has been compiled into your kernel, + * reboot Linux without specifying the 'diag288.nowayout' kernel parameter'. + */ + +/*? + * Text: "The watchdog timer cannot be started or reset\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to start the diag288 watchdog or to set timer back to zero. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays inactive or becomes inactive. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "Linux cannot be suspended while the watchdog is in use\n" + * Severity: Error + * Description: + * The watchdog must not time out while Linux is suspended. + * Therefore, the diag288 watchdog device driver prevents Linux from being suspended + * while the watchdog is in use. + * User action: + * i) Stop the watchdog application. ii) If the problem persists, close the watchdog + * device node by issuing 'echo V > /dev/watchdog'. + * iii) If the device driver still prevents Linux from being suspended, + * contact your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/extmem +++ linux-azure-4.13.0/Documentation/kmsg/s390/extmem @@ -0,0 +1,293 @@ +/*? + * Text: "Querying a DCSS type failed with rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: return code + * Description: + * The DCSS kernel interface used z/VM diagnose call X'64' to query the + * type of a DCSS. z/VM failed to determine the type and returned an error. + * User action: + * Look for related messages to find out which DCSS is affected. + * For details about the return codes see the section about DIAGNOSE Code + * X'64' in "z/VM CP Programming Services". + */ + +/*? 
+ * Text: "Loading DCSS %s failed with rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * @2: return code + * Description: + * The DCSS kernel interface used diagnose call X'64' to load a DCSS. z/VM + * failed to load the DCSS and returned an error. + * User action: + * For details about the return codes see the section about DIAGNOSE Code + * X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "DCSS %s of range %p to %p and type %s loaded as exclusive-writable\n" + * Severity: Informational + * Parameter: + * @1: DCSS name + * @2: starting page address + * @3: ending page address + * @4: DCSS type + * Description: + * The DCSS was loaded successfully in exclusive-writable access mode. + * User action: + * None. + */ + +/*? + * Text: "DCSS %s of range %p to %p and type %s loaded in shared access mode\n" + * Severity: Informational + * Parameter: + * @1: DCSS name + * @2: starting page address + * @3: ending page address + * @4: DCSS type + * Description: + * The DCSS was loaded successfully in shared access mode. + * User action: + * None. + */ + +/*? + * Text: "DCSS %s is already in the requested access mode\n" + * Severity: Informational + * Parameter: + * @1: DCSS name + * Description: + * A request to reload a DCSS with a new access mode has been rejected + * because the new access mode is the same as the current access mode. + * User action: + * None. + */ + +/*? + * Text: "DCSS %s is in use and cannot be reloaded\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * Description: + * Reloading a DCSS in a different access mode has failed because the DCSS is + * being used by one or more device drivers. The DCSS remains loaded with the + * current access mode. + * User action: + * Ensure that the DCSS is not used by any device driver then try again to + * load the DCSS with the new access mode. + */ + +/*? + * Text: "DCSS %s overlaps with used memory resources and cannot be reloaded\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * Description: + * The DCSS has been unloaded and cannot be reloaded because it overlaps with + * another loaded DCSS or with the memory of the z/VM guest virtual machine + * (guest storage). + * User action: + * Ensure that no DCSS is loaded that has overlapping memory resources + * with the DCSS you want to reload. If the DCSS overlaps with guest storage, + * use the DEF STORE CONFIG z/VM CP command to create a sufficient storage gap + * for the DCSS. For details, see the section about the DCSS device driver in + * "Device Drivers, Features, and Commands". + */ + +/*? + * Text: "Reloading DCSS %s failed with rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * @2: return code + * Description: + * The DCSS kernel interface used z/VM diagnose call X'64' to reload a DCSS + * in a different access mode. The DCSS was unloaded but z/VM failed to reload + * the DCSS. + * User action: + * For details about the return codes see the section about DIAGNOSE Code + * X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Unloading unknown DCSS %s failed\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The specified DCSS cannot be unloaded. The DCSS is known to the DCSS device + * driver but not to the DCSS kernel interface. This problem indicates a + * program error in extmem.c. + * User action: + * Report this problem to your support organization. + */ + +/*? 
+ * Text: "Saving unknown DCSS %s failed\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The specified DCSS cannot be saved. The DCSS is known to the DCSS device + * driver but not to the DCSS kernel interface. This problem indicates a + * program error in extmem.c. + * User action: + * Report this problem to your support organization. + */ + +/*? + * Text: "Saving a DCSS failed with DEFSEG response code %i\n" + * Severity: Error + * Parameter: + * @1: response-code + * Description: + * The DEFSEG z/VM CP command failed to permanently save changes to a DCSS. + * User action: + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP DEFSEG command (typically privilege class E). + * Look for related messages to find the cause of this error. See also message + * HCPE in the DEFSEG section of the "z/VM CP Command and + * Utility Reference". + */ + +/*? + * Text: "Saving a DCSS failed with SAVESEG response code %i\n" + * Severity: Error + * Parameter: + * @1: response-code + * Description: + * The SAVESEG z/VM CP command failed to permanently save changes to a DCSS. + * User action: + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP SAVESEG command (typically privilege class E). + * Look for related messages to find the cause of this error. See also message + * HCPE in the SAVESEG section of the "z/VM CP Command and + * Utility Reference". + */ + +/*? + * Text: "DCSS %s cannot be loaded or queried\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You cannot load or query the specified DCSS because it either is not defined + * in the z/VM hypervisor, or it is a class S DCSS, or it is above 2047 MB + * and the Linux system is a 31-bit system. + * User action: + * Use the CP command "QUERY NSS" to find out if the DCSS is a valid + * DCSS that can be loaded. + */ + +/*? + * Text: "DCSS %s cannot be loaded or queried without z/VM\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * A DCSS is a z/VM resource. Your Linux instance is not running as a z/VM + * guest operating system and, therefore, cannot load DCSSs. + * User action: + * Load DCSSs only on Linux instances that run as z/VM guest operating systems. + */ + +/*? + * Text: "Loading or querying DCSS %s resulted in a hardware error\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * Either the z/VM DIAGNOSE X'64' query or load call issued for the DCSS + * returned with an error. + * User action: + * Look for previous extmem message to find the return code from the + * DIAGNOSE X'64' query or load call. For details about the return codes see + * the section about DIAGNOSE Code X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "DCSS %s has multiple page ranges and cannot be loaded or queried\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You can only load or query a DCSS with multiple page ranges if: + * - The DCSS has 6 or fewer page ranges + * - The page ranges form a contiguous address space + * - The page ranges are of type EW or EN + * User action: + * Check the definition of the DCSS to make sure that the conditions for + * DCSSs with multiple page ranges are met. + */ + +/*? 
+ * Text: "%s needs used memory resources and cannot be loaded or queried\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You cannot load or query the DCSS because it overlaps with an already + * loaded DCSS or with the memory of the z/VM guest virtual machine + * (guest storage). + * User action: + * Ensure that no DCSS is loaded that has overlapping memory resources + * with the DCSS you want to load or query. If the DCSS overlaps with guest + * storage, use the DEF STORE CONFIG z/VM CP command to create a sufficient + * storage gap for the DCSS. For details, see the section about the DCSS + * device driver in "Device Drivers, Features, and Commands". + */ + +/*? + * Text: "DCSS %s is already loaded in a different access mode\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The DCSS you are trying to load has already been loaded in a different + * access mode. You cannot simultaneously load the DCSS in different modes. + * User action: + * Reload the DCSS in a different mode or load it with the same mode in which + * it has already been loaded. + */ + +/*? + * Text: "There is not enough memory to load or query DCSS %s\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The available memory is not enough to load or query the DCSS. + * User action: + * Free some memory and repeat the failed operation. + */ + +/*? + * Text: "DCSS %s overlaps with used storage and cannot be loaded\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You cannot load the DCSS because it overlaps with an already loaded DCSS + * or with the memory of the z/VM guest virtual machine (guest storage). + * User action: + * Ensure that no DCSS is loaded that has overlapping memory resources + * with the DCSS you want to load. If the DCSS overlaps with guest storage, + * use the DEF STORE CONFIG z/VM CP command to create a sufficient storage gap + * for the DCSS. For details, see the section about the DCSS device driver in + * "Device Drivers, Features, and Commands". + */ + +/*? + * Text: "DCSS %s exceeds the kernel mapping range (%lu) and cannot be loaded\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * @2: kernel mapping range in bytes + * Description: + * You cannot load the DCSS because it exceeds the kernel mapping range limit. + * User action: + * Ensure that the DCSS range is defined below the kernel mapping range. + */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/hmcdrv +++ linux-azure-4.13.0/Documentation/kmsg/s390/hmcdrv @@ -0,0 +1,22 @@ +/*? + * Text: "Allocating the requested cache size of %zu bytes failed\n" + * Severity: Error + * Parameter: + * @1: size + * Description: + * You cannot use the 'hmcdrv' module. + * Either the cache size that was specified for the 'hmcdrv' module exceeded + * the maximum of 1048576 (1 megabyte), or not enough free memory was + * available. + * If the 'hmcdrv' module was compiled into the kernel, the cache size was + * specified with the 'hmcdrv.cachesize' kernel parameter. + * For a separate 'hmcdrv' module, the cache size was specified with the + * 'cachesize=' module parameter. + * User action: + * Specify a smaller cache size and try again to load the module. + * Do not exceed the maximum specification of 1048576 (1 megabyte). + * If necessary, free some memory and try again. + * If the module is compiled into the kernel, you must reboot Linux to change + * the cache size specification. + */ +/*? 
Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/hugetlb +++ linux-azure-4.13.0/Documentation/kmsg/s390/hugetlb @@ -0,0 +1,13 @@ +/*? + * Text: "hugepagesz= specifies an unsupported page size %s\n" + * Severity: Error + * Parameter: + * @1: size + * Description: + * The hugepagesz= kernel parameter specifies a huge page size + * that is not supported. + * User action: + * Specify "1M" for 1 MB huge pages. These are supported as of z10. + * Specify "2G" for 2 GB huge pages. These are supported as of zEC12 + * and zBC12 machines. + */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/hvc_iucv +++ linux-azure-4.13.0/Documentation/kmsg/s390/hvc_iucv @@ -0,0 +1,123 @@ +/*? + * Text: "The z/VM IUCV HVC device driver cannot be used without z/VM\n" + * Severity: Notice + * Description: + * The z/VM IUCV hypervisor console (HVC) device driver requires the + * z/VM inter-user communication vehicle (IUCV). + * User action: + * Set "hvc_iucv=" to zero in the kernel parameter line and reboot Linux. + */ + +/*? + * Text: "%lu is not a valid value for the hvc_iucv= kernel parameter\n" + * Severity: Error + * Parameter: + * @1: hvc_iucv_devices + * Description: + * The "hvc_iucv=" kernel parameter specifies the number of z/VM IUCV + * hypervisor console (HVC) terminal devices. + * The parameter value ranges from 0 to 8. + * If zero is specified, the z/VM IUCV HVC device driver is disabled + * and no IUCV-based terminal access is available. + * User action: + * Correct the "hvc_iucv=" setting in the kernel parameter line and + * reboot Linux. + */ + +/*? + * Text: "Creating a new HVC terminal device failed with error code=%d\n" + * Severity: Error + * Parameter: + * @1: errno + * Description: + * The device driver initialization failed to allocate a new + * HVC terminal device. + * A possible cause of this problem is memory constraints. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your z/VM guest virtual machine. + */ + +/*? + * Text: "Registering HVC terminal device as Linux console failed\n" + * Severity: Error + * Description: + * The device driver initialization failed to set up the first HVC terminal + * device for use as Linux console. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your z/VM guest virtual machine. + */ + +/*? + * Text: "Registering IUCV handlers failed with error code=%d\n" + * Severity: Error + * Parameter: + * @1: errno + * Description: + * The device driver initialization failed to register with z/VM IUCV to + * handle IUCV connections, as well as sending and receiving of IUCV messages. + * User action: + * Check for related IUCV error messages and see the errno manual page + * to find out what caused the problem. + */ + +/*? + * Text: "Allocating memory failed with reason code=%d\n" + * Severity: Error + * Parameter: + * @1: reason + * Description: + * The z/VM IUCV hypervisor console (HVC) device driver initialization failed, + * because of a general memory allocation failure. The reason code indicates + * the memory operation that has failed: + * kmem_cache (reason code=1), + * mempool (reason code=2), or + * hvc_iucv_allow= (reason code=3) + * User action: + * Consider assigning more memory to your z/VM guest virtual machine. + */ + +/*? 
+ * Text: "hvc_iucv_allow= does not specify a valid z/VM user ID list\n" + * Severity: Error + * Description: + * The "hvc_iucv_allow=" kernel parameter specifies a comma-separated list + * of z/VM user IDs that are permitted to connect to the z/VM IUCV hypervisor + * device driver. + * The z/VM user IDs in the list must not exceed eight characters and must + * not contain spaces. + * User action: + * Correct the "hvc_iucv_allow=" setting in the kernel parameter line and reboot + * Linux. + */ + +/*? + * Text: "hvc_iucv_allow= specifies too many z/VM user IDs\n" + * Severity: Error + * Description: + * The "hvc_iucv_allow=" kernel parameter specifies a comma-separated list + * of z/VM user IDs that are permitted to connect to the z/VM IUCV hypervisor + * device driver. + * The number of z/VM user IDs that are specified with the "hvc_iucv_allow=" + * kernel parameter exceeds the maximum of 500. + * User action: + * Correct the "hvc_iucv_allow=" setting by reducing the z/VM user IDs in + * the list and reboot Linux. + */ + +/*? + * Text: "A connection request from z/VM user ID %s was refused\n" + * Severity: Informational + * Parameter: + * @1: ID + * Description: + * An IUCV connection request from another z/VM guest virtual machine has been + * refused. The request was from a z/VM guest virtual machine that is not + * listed by the "hvc_iucv_allow=" kernel parameter. + * User action: + * Check the "hvc_iucv_allow=" kernel parameter setting. + * Consider adding the z/VM user ID to the "hvc_iucv_allow=" list in the kernel + * parameter line and reboot Linux. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/hypfs +++ linux-azure-4.13.0/Documentation/kmsg/s390/hypfs @@ -0,0 +1,56 @@ +/*? + * Text: "The hardware system does not support hypfs\n" + * Severity: Error + * Description: + * hypfs requires DIAGNOSE Code X'204' but this diagnose code is not available + * on your hardware. You need more recent hardware to use hypfs. + * User action: + * None. + */ + +/*? + * Text: "The hardware system does not provide all functions required by hypfs\n" + * Severity: Error + * Description: + * hypfs requires DIAGNOSE Code X'224' but this diagnode code is not available + * on your hardware. You need more recent hardware to use hypfs. + * User action: + * None. + */ + +/*? + * Text: "Updating the hypfs tree failed\n" + * Severity: Error + * Description: + * There was not enough memory available to update the hypfs tree. + * User action: + * Free some memory and try again to update the hypfs tree. Consider assigning + * more memory to your LPAR or z/VM guest virtual machine. + */ + +/*? + * Text: "%s is not a valid mount option\n" + * Severity: Error + * Parameter: + * @1: mount option + * Description: + * hypfs has detected mount options that are not valid. + * User action: + * See "Device Drivers Features and Commands" for information about valid + * mount options for hypfs. + */ + +/*? + * Text: "Initialization of hypfs failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error code + * Description: + * Initialization of hypfs failed because of resource or hardware constraints. + * Possible reasons for this problem are insufficient free memory or missing + * hardware interfaces. + * User action: + * See errno.h for information about the error codes. + */ + +/*? 
Text: "Hypervisor filesystem mounted\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/iucv +++ linux-azure-4.13.0/Documentation/kmsg/s390/iucv @@ -0,0 +1,33 @@ +/*? + * Text: "Defining an interrupt buffer on CPU %i failed with 0x%02x (%s)\n" + * Severity: Warning + * Parameter: + * @1: CPU number + * @2: hexadecimal error value + * @3: short error code explanation + * Description: + * Defining an interrupt buffer for external interrupts failed. Error + * value 0x03 indicates a problem with the z/VM directory entry of the + * z/VM guest virtual machine. This problem can also be caused by a + * program error. + * User action: + * If the error value is 0x03, examine the z/VM directory entry of your + * z/VM guest virtual machine. If the directory entry is correct or if the + * error value is not 0x03, report this problem to your support organization. + */ + +/*? + * Text: "Suspending Linux did not completely close all IUCV connections\n" + * Severity: Warning + * Description: + * When resuming a suspended Linux instance, the IUCV base code found + * data structures from one or more IUCV connections that existed before the + * Linux instance was suspended. Modules that use IUCV connections must close + * these connections when a Linux instance is suspended. This problem + * indicates an error in a program that used an IUCV connection. + * User action: + * Report this problem to your support organization. + */ + +/*? Text: "iucv_external_interrupt: out of memory\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/lcs +++ linux-azure-4.13.0/Documentation/kmsg/s390/lcs @@ -0,0 +1,169 @@ +/*? + * Text: "%s: Allocating a socket buffer to interface %s failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: network interface + * Description: + * LAN channel station (LCS) devices require a socket buffer (SKB) structure + * for storing incoming data. The LCS device driver failed to allocate an SKB + * structure to the LCS device. A likely cause of this problem is memory + * constraints. + * User action: + * Free some memory and repeat the failed operation. + */ + +/*? + * Text: "%s: Shutting down the LCS device failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * Description: + * A request to shut down a LAN channel station (LCS) device resulted in an + * error. The error is logged in the LCS trace at trace level 4. + * User action: + * Try again to shut down the device. If the error persists, see the LCS trace + * to find out what causes the error. + */ + +/*? + * Text: "%s: Detecting a network adapter for LCS devices failed with rc=%d (0x%x)\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: lcs_detect return code in decimal notation + * @3: lcs_detect return code in hexadecimal notation + * Description: + * The LCS device driver could not initialize a network adapter. + * User action: + * Ensure that the physical connection from the port to the network is + * in place. If the error persists, note the return code from the error + * message and contact IBM support. + */ + +/*? + * Text: "%s: A recovery process has been started for the LCS device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * Description: + * The LAN channel station (LCS) device is shut down and restarted. The recovery + * process might have been initiated by a user or started automatically as a + * response to a device problem. 
+ * User action: + * Wait until a message indicates the completion of the recovery process. + */ + +/*? + * Text: "%s: An I/O-error occurred on the LCS device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * Description: + * The LAN channel station (LCS) device reported a problem that can be recovered + * by the LCS device driver. Repeated occurrences of this problem indicate a + * malfunctioning device. + * User action: + * If this problem occurs frequently, initiate a recovery process for the + * device, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. + */ + +/*? + * Text: "%s: A command timed out on the LCS device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * Description: + * The LAN channel station (LCS) device reported a problem that can be recovered + * by the LCS device driver. Repeated occurrences of this problem indicate a + * malfunctioning device. + * User action: + * If this problem occurs frequently, initiate a recovery process for the + * device, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. + */ + +/*? + * Text: "%s: An error occurred on the LCS device, rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * @2: return code + * Description: + * The LAN channel station (LCS) device reported a problem that can be recovered + * by the LCS device driver. Repeated occurrences of this problem indicate a + * malfunctioning device. + * User action: + * If this problem occurs frequently, initiate a recovery process for the + * device, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. + */ + +/*? + * Text: "%s: The LCS device stopped because of an error, dstat=0x%X, cstat=0x%X \n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * @2: device status + * @3: subchannel status + * Description: + * The LAN channel station (LCS) device reported an error. The LCS device driver + * might start a device recovery process. + * User action: + * If the device driver does not start a recovery process, initiate a recovery + * process, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. If the problem persists, note the status information provided with + * the message and contact IBM support. + */ + +/*? + * Text: "%s: Starting an LCS device resulted in an error, rc=%d!\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: ccw_device_start return code in decimal notation + * Description: + * The LAN channel station (LCS) device driver failed to initialize an LCS + * device. The device is not operational. + * User action: + * Initiate a recovery process, for example, by writing '1' to the 'recover' + * sysfs attribute of the device. If the problem persists, contact IBM support. + */ + +/*? + * Text: "%s: Sending data from the LCS device to the LAN failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: ccw_device_resume return code in decimal notation + * Description: + * The LAN channel station (LCS) device driver could not send data to the LAN + * using the LCS device. This might be a temporary problem. Operations continue + * on the LCS device. + * User action: + * If this problem occurs frequently, initiate a recovery process, for example, + * by writing '1' to the 'recover' sysfs attribute of the device. If the + * problem persists, contact IBM support. + */ + +/*? Text: "Query IPAssist failed. 
Assuming unsupported!\n" */
+/*? Text: "Stoplan for %s initiated by LGW\n" */
+/*? Text: "Not enough memory to add new multicast entry!\n" */
+/*? Text: "Not enough memory for debug facility.\n" */
+/*? Text: "Adding multicast address failed. Table possibly full!\n" */
+/*? Text: "Error in opening device!\n" */
+/*? Text: "LCS device %s %s IPv6 support\n" */
+/*? Text: "Device %s successfully recovered!\n" */
+/*? Text: "LCS device %s %s Multicast support\n" */
+/*? Text: " Initialization failed\n" */
+/*? Text: "Loading %s\n" */
+/*? Text: "Initialization failed\n" */
+/*? Text: "Terminating lcs module.\n" */
+/*? Text: "Device %s could not be recovered!\n" */
+/*? Text: "Initializing the lcs device driver failed\n" */
+/*? Text: "%s: The lcs device driver failed to recover the device\n" */
+/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */
+/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */
+/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/monreader
+++ linux-azure-4.13.0/Documentation/kmsg/s390/monreader
@@ -0,0 +1,128 @@
+/*?
+ * Text: "Reading monitor data failed with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ * @1: return code
+ * Description:
+ * The z/VM *MONITOR record device driver failed to read monitor data
+ * because the IUCV REPLY function failed. The read function against
+ * the monitor record device returns EIO. All monitor data that has been read
+ * since the last read with 0 size is incorrect.
+ * User action:
+ * Disregard all monitor data that has been read since the last read with
+ * 0 size. If the device driver has been compiled as a separate module, unload
+ * and reload the monreader module. If the device driver has been compiled
+ * into the kernel, reboot Linux. For more information about possible causes
+ * of the error see the IUCV section in "z/VM CP Programming Services" and
+ * the *MONITOR section in "z/VM Performance".
+ */
+
+/*?
+ * Text: "z/VM *MONITOR system service disconnected with rc=%i\n"
+ * Severity: Error
+ * Parameter:
+ * @1: IPUSER SEVER return code
+ * Description:
+ * The z/VM *MONITOR record device driver receives monitor records through
+ * an IUCV connection to the z/VM *MONITOR system service. This connection
+ * has been severed and the read function of the z/VM *MONITOR device driver
+ * returns EIO. All data received since the last read with 0 size is incorrect.
+ * User action:
+ * Disregard all monitor data read since the last read with 0 size. Close and
+ * reopen the monitor record device. For information about the IPUSER SEVER
+ * return codes see "z/VM Performance".
+ */
+
+/*?
+ * Text: "The read queue for monitor data is full\n"
+ * Severity: Warning
+ * Description:
+ * The read function of the z/VM *MONITOR device driver returns EOVERFLOW
+ * because not enough monitor data has been read since the monitor device
+ * has been opened. Monitor data already read are valid and subsequent reads
+ * return valid data but some intermediate data might be missing.
+ * User action:
+ * Be aware that monitor data might be missing. Ensure that you regularly
+ * read monitor data after opening the monitor record device.
+ */
+
+/*?
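+ * Editor's note (not an emitted kernel message): a hedged example of the z/VM
+ * directory statement that authorizes a guest to connect to the *MONITOR
+ * system service, as referenced by the message below; the exact statement
+ * for your installation may differ:
+ *
+ *   IUCV *MONITOR
+ */
+
+/*?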
+ * Text: "Connecting to the z/VM *MONITOR system service failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: IUCV CONNECT return code + * Description: + * The z/VM *MONITOR record device driver receives monitor records through + * an IUCV connection to the z/VM *MONITOR system service. This connection + * could not be established when the monitor record device was opened. If + * the return code is 15, your z/VM guest virtual machine is not authorized + * to connect to the *MONITOR system service. + * User action: + * If the return code is 15, ensure that the IUCV *MONITOR statement is + * included in the z/VM directory entry for your z/VM guest virtual machine. + * For other IUCV CONNECT return codes see the IUCV section in "CP Programming + * Services" and the *MONITOR section in "z/VM Performance". + */ + +/*? + * Text: "Disconnecting the z/VM *MONITOR system service failed with rc=%i\n" + * Severity: Warning + * Parameter: + * @1: IUCV SEVER return code + * Description: + * The z/VM *MONITOR record device driver receives monitor data through an + * IUCV connection to the z/VM *MONITOR system service. This connection + * could not be closed when the monitor record device was closed. You might + * not be able to resume monitoring. + * User action: + * No immediate action is necessary. If you cannot open the monitor record + * device in the future, reboot Linux. For information about the IUCV SEVER + * return codes see the IUCV section in "CP Programming Services" and the + * *MONITOR section in "z/VM Performance". + */ + +/*? + * Text: "The z/VM *MONITOR record device driver cannot be loaded without z/VM\n" + * Severity: Error + * Description: + * The z/VM *MONITOR record device driver uses z/VM system services to provide + * monitor data about z/VM guest operating systems to applications on Linux. + * On Linux instances that run in environments other than the z/VM hypervisor, + * the z/VM *MONITOR record device driver does not provide any useful + * function and the corresponding monreader module cannot be loaded. + * User action: + * Load the z/VM *MONITOR record device driver only on Linux instances that run + * as guest operating systems of the z/VM hypervisor. If the z/VM *MONITOR + * record device driver has been compiled into the kernel, ignore this message. + */ + +/*? + * Text: "The z/VM *MONITOR record device driver failed to register with IUCV\n" + * Severity: Error + * Description: + * The z/VM *MONITOR record device driver receives monitor data through an IUCV + * connection and needs to register with the IUCV device driver. This + * registration failed and the z/VM *MONITOR record device driver was not + * loaded. A possible cause of this problem is insufficient memory. + * User action: + * Free some memory and try again to load the module. If the z/VM *MONITOR + * record device driver has been compiled into the kernel, you might have to + * configure more memory and reboot Linux. If you do not want to read monitor + * data, ignore this message. + */ + +/*? + * Text: "The specified *MONITOR DCSS %s does not have the required type SC\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The DCSS that was specified with the monreader.mondcss kernel parameter or + * with the mondcss module parameter cannot be a *MONITOR DCSS because it is + * not of type SC. + * User action: + * Confirm that you are using the name of the DCSS that has been configured as + * the *MONITOR DCSS on the z/VM hypervisor. 
If the default name, MONDCSS, is + * used, omit the monreader.mondcss or mondcss parameter. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/monwriter +++ linux-azure-4.13.0/Documentation/kmsg/s390/monwriter @@ -0,0 +1,17 @@ +/*? + * Text: "Writing monitor data failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The monitor stream application device driver used the z/VM diagnose call + * DIAG X'DC' to start writing monitor data. z/VM returned an error and the + * monitor data cannot be written. If the return code is 5, your z/VM guest + * virtual machine is not authorized to write monitor data. + * User action: + * If the return code is 5, ensure that your z/VM guest virtual machine's + * entry in the z/VM directory includes the OPTION APPLMON statement. + * For other return codes see the section about DIAGNOSE Code X'DC' + * in "z/VM CP Programming Services". + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/netiucv +++ linux-azure-4.13.0/Documentation/kmsg/s390/netiucv @@ -0,0 +1,156 @@ +/*? + * Text: "%s: The peer interface of the IUCV device has closed the connection\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * Description: + * The peer interface on the remote z/VM guest virtual machine has closed the + * connection. Do not expect further packets on this interface. Any packets + * you send to this interface will be dropped. + * User action: + * None. + */ + +/*? + * Text: "%s: The IUCV device failed to connect to z/VM guest %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the IUCV device + * @2: z/VM user ID + * Description: + * The connection cannot be established because the z/VM guest virtual + * machine with the peer interface is not running. + * User action: + * Ensure that the z/VM guest virtual machine with the peer interface is + * running; then try again to establish the connection. + */ + +/*? + * Text: "%s: The IUCV device failed to connect to the peer on z/VM guest %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the IUCV device + * @2: z/VM user ID + * Description: + * The connection cannot be established because the z/VM guest virtual machine + * with the peer interface is not configured for IUCV connections. + * User action: + * Configure the z/VM guest virtual machine with the peer interface for IUCV + * connections; then try again to establish the connection. + */ + +/*? + * Text: "%s: Connecting the IUCV device would exceed the maximum number of IUCV connections\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * Description: + * The connection cannot be established because the maximum number of IUCV + * connections has been reached on the local z/VM guest virtual machine. + * User action: + * Close some of the established IUCV connections on the local z/VM guest + * virtual machine; then try again to establish the connection. + */ + +/*? + * Text: "%s: z/VM guest %s has too many IUCV connections to connect with the IUCV device\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * Connecting to the remote z/VM guest virtual machine failed because the + * maximum number of IUCV connections for the remote z/VM guest virtual + * machine has been reached. 
+ * User action: + * Close some of the established IUCV connections on the remote z/VM guest + * virtual machine; then try again to establish the connection. + */ + +/*? + * Text: "%s: The IUCV device cannot connect to a z/VM guest with no IUCV authorization\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * Description: + * Because the remote z/VM guest virtual machine is not authorized for IUCV + * connections, the connection cannot be established. + * User action: + * Add the statements 'IUCV ALLOW' and 'IUCV ANY' to the z/VM directory + * entry of the remote z/VM guest virtual machine; then try again to + * establish the connection. See "z/VM CP Planning and Administration" + * for details about the IUCV statements. + */ + +/*? + * Text: "%s: Connecting the IUCV device failed with error %d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * @2: error code + * Description: + * The connection cannot be established because of an IUCV CONNECT error. + * User action: + * Report this problem to your support organization. + */ + +/*? + * Text: "%s: The IUCV device has been connected successfully to %s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * The connection has been established and the interface is ready to + * transmit communication packages. + * User action: + * None. + */ + +/*? + * Text: "%s: The IUCV interface to %s has been established successfully\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * The IUCV interface to the remote z/VM guest virtual machine has been + * established and can be activated with "ifconfig up" or an equivalent + * command. + * User action: + * None. + */ + +/*? + * Text: "%s: The IUCV device is connected to %s and cannot be removed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * Removing a connection failed because the interface is active with a peer + * interface on a remote z/VM guest virtual machine. + * User action: + * Deactivate the interface with "ifconfig down" or an equivalent command; + * then try again to remove the interface. + */ + +/*? + * Text: "%s: The peer z/VM guest %s has closed the connection\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * The peer interface is no longer available. + * User action: + * Either deactivate and remove the interface, or wait for the peer + * z/VM guest to re-establish the interface. + */ + +/*? Text: "driver unloaded\n" */ +/*? Text: "driver initialized\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/numa +++ linux-azure-4.13.0/Documentation/kmsg/s390/numa @@ -0,0 +1,11 @@ +/*? + * Text: "NUMA mode: %s\n" + * Severity: Informational + * Parameter: + * @1: mode + * Description: + * Linux started with the specified NUMA mode. + * User action: + * None. + */ +/*? 
Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/numa_emu +++ linux-azure-4.13.0/Documentation/kmsg/s390/numa_emu @@ -0,0 +1,50 @@ +/*? + * Text: "Not enough memory for %d nodes, reducing node count\n" + * Severity: Warning + * Parameter: + * @1: requested number of nodes + * Description: + * Using the requested memory stripe size for emulating the requested number of + * NUMA nodes requires more than the available memory. The number of nodes is + * specified with the emu_nodes= kernel parameter. The memory stripe size to + * be used for distributing the available memory among the nodes is specified + * with the emu_size= kernel parameter. Fewer nodes were created than the + * requested number; each node has one memory stripe of the requested size. + * User action: + * Specify fewer nodes, reduce the memory stripe size, or make more memory + * available to your Linux instance. + */ + +/*? + * Text: "Creating %d nodes with memory stripe size %ld MB\n" + * Severity: Informational + * Parameter: + * @1: number of nodes + * @2: stripe size + * Description: + * NUMA emulation is activated with the reported number of NUMA nodes. + * The specified memory stripe size is used to distribute, in round-robin + * fashion, the available memory among the nodes. + * User action: + * None. + */ + +/*? + * Text: "Increasing memory stripe size from %ld MB to %ld MB\n" + * Severity: Warning + * Parameter: + * @1: requested memory stripe size + * @2: adjusted memory stripe size + * Description: + * NUMA emulation could not use the requested memory stripe size and + * therefore has increased it to the next possible value. + * The requested memory stripe size is a default value or it was specified + * with the emu_size= kernel parameter. + * The memory stripe size must be a multiple of the memory block size that + * can be read in hexadecimal notation from + * /sys/devices/system/memory/block_size_bytes. + * User action: + * To avoid this message in the future, specify a valid memory stripe size + * with the emu_size= kernel parameter. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/os_info +++ linux-azure-4.13.0/Documentation/kmsg/s390/os_info @@ -0,0 +1,36 @@ +/*? + * Text: "entry %i: %s (addr=0x%lx size=%lu)\n" + * Severity: Informational + * Parameter: + * @1: entry ID + * @2: entry state + * @3: entry address + * @4: entry size + * Description: + * Linux is running in kdump mode and reports information defined by the + * previously running production kernel. Possible values for + * "entry state" are: + * + * - copied: The entry has been found, verified, and copied + * + * - not available: The entry has not been defined + * + * - checksum failed: The entry has been found, but it is not valid + * User action: + * If kdump fails, contact your service organization and include this message + * in the error report. + */ + +/*? + * Text: "crashkernel: addr=0x%lx size=%lu\n" + * Severity: Informational + * Parameter: + * @1: address + * @2: size + * Description: + * Linux is running in kdump mode and reports the address and size of + * the memory area that was reserved for kdump by the previously running + * production kernel. + * User action: + * None. + */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/perf +++ linux-azure-4.13.0/Documentation/kmsg/s390/perf @@ -0,0 +1,90 @@ +/*? 
+ * Text: "CPU[%i] CPUM_CF: ver=%u.%u A=%04x E=%04x C=%04x\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: first version number + * @3: second version number + * @4: counter set authorization + * @5: counter set enable controls + * @6: counter set activation controls + * Description: + * This message displays information about the CPU-measurement counter facility + * (CPUM_CF) on a particular CPU. For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "CPU[%i] CPUM_SF: basic=%i diag=%i min=%lu max=%lu cpu_speed=%u\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: authorization status for the basic-sampling function + * @3: authorization status for the diagnostic-sampling function + * @4: minimum sampling interval + * @5: maximum sampling interval + * @6: cpu speed + * Description: + * This message displays generic information about the CPU-measurement sampling + * facility (CPUM_SF) on a particular CPU. For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "CPU[%i] CPUM_SF: Basic-sampling: a=%i e=%i c=%i bsdes=%i tear=%016lx dear=%016lx\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: authorization control + * @3: enable control + * @4: activation control + * @5: basic-sampling-data-entry size + * @6: tear register contents + * @7: dear register contents + * Description: + * This message displays information about the basic-sampling function of the + * CPU-measurement sampling facility (CPUM_SF) on a particular CPU. + * For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "CPU[%i] CPUM_SF: Diagnostic-sampling: a=%i e=%i c=%i dsdes=%i tear=%016lx dear=%016lx\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: authorization control + * @3: enable control + * @4: activation control + * @5: diagnostic-sampling-data-entry size + * @6: tear register contents + * @7: dear register contents + * Description: + * This message displays information about the diagnostic-sampling function of the + * CPU-measurement sampling facility (CPUM_SF) on a particular CPU. + * For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "The sampling facility is already reserved by %p\n" + * Severity: Warning + * Parameter: + * @1: address of perf sampling support owner + * Description: + * A process tried to reserve the sampling facility support, but it was already + * reserved by another process. + * User action: + * Check whether another process, for example, the perf program or OProfile is + * currently active. Retry activating the sampling facility after the other + * process has ended. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/prng +++ linux-azure-4.13.0/Documentation/kmsg/s390/prng @@ -0,0 +1,103 @@ +/* prng */ + +/*? + * Text: "prng runs in TDES mode with chunksize=%d and reseed_limit=%u\n" + * Severity: Informational + * Parameter: + * @1: read chunk size in bytes + * @2: reseed limit + * Description: + * The pseudo-random number device driver started in triple DES mode. 
+ * For IBM mainframes earlier than IBM zEnterprise EC12 (zEC12),
+ * triple DES is the only available mode.
+ * As of zEC12, the preferred mode is SHA-512.
+ * User action:
+ * If triple DES is the expected mode, no action is required.
+ * Otherwise, verify that the prng started with the mode= module or
+ * prng.mode= kernel parameter set to a value other than 1.
+ * The value 1 forces triple DES mode. Also ensure that the mainframe
+ * runs with the latest firmware level.
+ */
+
+/*?
+ * Text: "The prng module stopped after running in triple DES mode\n"
+ * Severity: Informational
+ * Description:
+ * The pseudo-random number device driver was running in triple DES mode.
+ * The device driver module, prng, was unloaded, or it stopped
+ * because Linux shut down.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The prng module cannot start in SHA-512 mode\n"
+ * Severity: Error
+ * Description:
+ * The pseudo-random number device driver was loaded with the mode= module parameter
+ * or the prng.mode= kernel parameter set to 2. This setting forces SHA-512 mode,
+ * but the required support for MSA 5 is not available. This support requires an IBM
+ * zEnterprise EC12 (zEC12) or later mainframe.
+ * User action:
+ * If your mainframe is earlier than zEC12, set the mode= module or
+ * prng.mode= kernel parameter to 0 or 1 to run the
+ * pseudo-random number device driver in triple DES mode.
+ * Otherwise, ensure that MSA 5 support is available.
+ */
+
+/*?
+ * Text: "prng runs in SHA-512 mode with chunksize=%d and reseed_limit=%u\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: read chunk size in bytes
+ * @2: reseed limit
+ * Description:
+ * The pseudo-random number device driver started in SHA-512 mode.
+ * As of IBM zEnterprise EC12, this is the preferred mode.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The prng module stopped after running in SHA-512 mode\n"
+ * Severity: Informational
+ * Description:
+ * The pseudo-random number device driver was running in SHA-512 mode.
+ * The device driver module, prng, was unloaded, or stopped
+ * because Linux shut down.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "The prng self test state test for the SHA-512 mode failed\n"
+ * Severity: Error
+ * Description:
+ * The pseudo-random number device driver is not operational because the self test failed.
+ * After processing a published National Institute of Standards and Technology (NIST) test vector for the
+ * Deterministic Random Bit Generator (DRBG) algorithm, the device driver
+ * was not in the expected working state. This failure might indicate
+ * that the cryptographic software or hardware is not working correctly.
+ * The processed NIST test vector was: Hash Drbg, Sha-512, Count #0.
+ * User action:
+ * Unload and reload the prng module, or
+ * if prng was compiled into the kernel, restart Linux.
+ * If the error persists, contact your support organization.
+ */
+
+/*?
+ * Text: "The prng self test data test for the SHA-512 mode failed\n"
+ * Severity: Error
+ * Description:
+ * The pseudo-random number device driver is not operational because the self test failed.
+ * After processing a published National Institute of Standards and Technology (NIST) test vector for the
+ * Deterministic Random Bit Generator (DRBG) algorithm, the device driver
+ * did not produce the expected pseudo-random data. This failure might indicate
+ * that the cryptographic software or hardware is not working correctly.
+ * The processed NIST test vector was: Hash Drbg, Sha-512, Count #0. 
+ * User action:
+ * Unload and reload the prng module, or
+ * if prng was compiled into the kernel, restart Linux.
+ * If the error persists, contact your support organization.
+ */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/qeth
+++ linux-azure-4.13.0/Documentation/kmsg/s390/qeth
@@ -0,0 +1,929 @@
+/*?
+ * Text: "%s: The LAN is offline\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A start LAN command was sent by the qeth device driver but the physical or
+ * virtual adapter has not started the LAN. The LAN might take a few seconds
+ * to become available.
+ * User action:
+ * Check the status of the qeth device, for example, with the lsqeth command.
+ * If the device does not become operational within a few seconds, initiate a
+ * recovery process, for example, by writing '1' to the 'recover' sysfs
+ * attribute of the device.
+ */
+
+/*?
+ * Text: "%s: A recovery process has been started for the device\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A recovery process was started either by the qeth device driver or through
+ * a user command.
+ * User action:
+ * Wait until a message indicates the completion of the recovery process.
+ */
+
+/*?
+ * Text: "%s: The qeth device driver failed to recover an error on the device\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * The qeth device driver performed an automatic recovery operation to recover
+ * an error on a qeth device. The recovery operation failed.
+ * User action:
+ * Try the following actions in the given order: i) Check the status of the
+ * qeth device, for example, with the lsqeth command. ii) Initiate a recovery
+ * process by writing '1' to the 'recover' sysfs attribute of the device.
+ * iii) Ungroup and regroup the subchannel triplet of the device. iv) Reboot
+ * Linux. v) If the problem persists, gather Linux debug data and report the
+ * problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Device recovery failed to restore all offload features\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * The qeth device driver performed a recovery operation on a qeth device. Part
+ * of the recovery is to restore the offload features that were enabled before
+ * the recovery. At least one of those offload features could not be restored.
+ * User action:
+ * Check which offload features are enabled on the device, for example with
+ * the "ethtool -k" command. Try to explicitly re-enable the missing offload
+ * features for the device, for example with the "ethtool -K" command.
+ */
+
+/*?
+ * Text: "%s: The link for interface %s on CHPID 0x%X failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * @3: CHPID
+ * Description:
+ * A network link failed. A possible reason for this error is that a physical
+ * network cable has been disconnected.
+ * User action:
+ * Ensure that the network cable on the adapter hardware is connected properly.
+ * If the connection is to a guest LAN, ensure that the device is still coupled
+ * to the guest LAN.
+ */
+
+/*?
+ * Text: "%s: The link for %s on CHPID 0x%X has been restored\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * @3: CHPID
+ * Description:
+ * A failed network link has been re-established. 
+ * A device recovery is in progress.
+ * User action:
+ * Wait until a message indicates the completion of the recovery process.
+ */
+
+/*?
+ * Text: "%s: A hardware operation timed out on the device\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A hardware operation timed out on the qeth device.
+ * User action:
+ * Check the status of the qeth device, for example, with the lsqeth command.
+ * If the device is not operational, initiate a recovery process, for example,
+ * by writing '1' to the 'recover' sysfs attribute of the device.
+ */
+
+/*?
+ * Text: "%s: The adapter hardware is of an unknown type\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * The qeth device driver does not recognize the adapter hardware. The cause
+ * of this problem could be a hardware error or a Linux level that does not
+ * support your adapter hardware.
+ * User action:
+ * i) Investigate if your adapter hardware is supported by your Linux level.
+ * Consider using hardware that is supported by your Linux level or upgrading
+ * to a Linux level that supports your hardware. ii) Install the latest
+ * firmware on your adapter hardware. iii) If the problem persists and is not
+ * caused by a version mismatch, contact IBM support.
+ */
+
+/*?
+ * Text: "%s: The adapter is used exclusively by another host\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * The qeth adapter is exclusively used by another host.
+ * User action:
+ * Use another qeth adapter, or configure this one so that it is not
+ * exclusively assigned to a particular host.
+ */
+
+/*?
+ * Text: "%s: QDIO reported an error, rc=%i\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: return code
+ * Description:
+ * The QDIO subsystem reported an error.
+ * User action:
+ * Check for related QDIO errors. Check the status of the qeth device, for
+ * example, with the lsqeth command. If the device is not operational, initiate
+ * a recovery process, for example, by writing '1' to the 'recover' sysfs
+ * attribute of the device.
+ */
+
+/*?
+ * Text: "%s: There is no kernel module to support discipline %d\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: discipline
+ * Description:
+ * The qeth device driver or a user command requested a kernel module for a
+ * particular qeth discipline. Either the discipline is not supported by the
+ * qeth device driver or the requested module is not available to your Linux
+ * system.
+ * User action:
+ * Check if the requested discipline module has been compiled into the kernel
+ * or is present in /lib/modules/<kernel version>/kernel/drivers/s390/net.
+ */
+
+/*?
+ * Text: "Initializing the qeth device driver failed\n"
+ * Severity: Error
+ * Parameter:
+ * Description:
+ * The base module of the qeth device driver could not be initialized.
+ * User action:
+ * See errno.h to determine the reason for the error.
+ * i) Reboot Linux. ii) If the problem persists, gather Linux debug data and
+ * report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Registering IP address %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: IP address
+ * Description:
+ * An IP address could not be registered with the network adapter.
+ * User action:
+ * Check if another operating system instance has already registered the
+ * IP address with the same network adapter or in the same logical IP subnet.
+ */
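+
+/*
+ * A minimal usage sketch for the recovery actions referenced above,
+ * assuming a hypothetical qeth device with bus ID 0.0.a100 (substitute
+ * the bus ID of your device):
+ *
+ *   # check the device status
+ *   lsqeth 0.0.a100
+ *   # initiate a recovery process
+ *   echo 1 > /sys/bus/ccwgroup/devices/0.0.a100/recover
+ */
+
+/*?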
+ * Text: "%s: Reading the adapter MAC address failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth device driver could not read the MAC address from the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting ARP processing support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start ARP support on the network adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting IP fragmentation support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start IP fragmentation support on the + * network adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting VLAN support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start VLAN support on the network adapter. + * User action: + * None if you do not require VLAN support. If you need VLAN support, + * ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting multicast support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start multicast support on the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Activating IPv6 support for %s failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not activate IPv6 support on the network + * adapter. + * User action: + * None if you do not require IPv6 communication. If you need IPv6 support, + * ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Enabling the passthrough mode for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not enable the passthrough mode on the + * network adapter. 
+ * The passthrough mode is required for all network traffic other than IPv4.
+ * In particular, the passthrough mode is required for IPv6 traffic.
+ * User action:
+ * None if all you want to support is IPv4 communication. If you want to
+ * support IPv6 or other network traffic apart from IPv4, ungroup and regroup
+ * the subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Enabling broadcast filtering for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The qeth device driver could not enable broadcast filtering on the network
+ * adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up broadcast filtering for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The qeth device driver could not set up broadcast filtering on the network
+ * adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up broadcast echo filtering for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The qeth device driver could not set up broadcast echo filtering on the
+ * network adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting HW checksumming for %s failed, using SW checksumming\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The network adapter supports hardware checksumming for IP packets
+ * but the qeth device driver could not start hardware checksumming on the
+ * adapter. The qeth device driver continues to use software checksumming for
+ * IP packets.
+ * User action:
+ * None if you do not require hardware checksumming for network
+ * traffic. If you want to enable hardware checksumming, ungroup and regroup
+ * the subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Enabling HW checksumming for %s failed, using SW checksumming\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The network adapter supports hardware checksumming for IP packets
+ * but the qeth device driver could not enable hardware checksumming on the
+ * adapter. The qeth device driver continues to use software checksumming for
+ * IP packets.
+ * User action:
+ * None if you do not require hardware checksumming for network traffic.
+ * If you want to enable hardware checksumming, ungroup and regroup the
+ * subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Starting outbound TCP segmentation offload for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The network adapter supports TCP segmentation offload, but the qeth device
+ * driver could not start this support on the adapter.
+ * User action:
+ * None if you do not require TCP segmentation offload. If you want to
+ * enable TCP segmentation offload, ungroup and regroup the subchannel triplet
+ * of the device. If this does not resolve the problem, reboot Linux. If the
+ * problem persists, gather Linux debug data and report the problem to your
+ * support organization.
+ */
+
+/*?
+ * Text: "%s: The network adapter failed to generate a unique ID\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * In IBM mainframe environments, network interfaces are not identified by
+ * a specific MAC address. Therefore, the network adapters provide the network
+ * interfaces with unique IDs to be used in their IPv6 link local addresses.
+ * Without such a unique ID, duplicate addresses might be assigned in other
+ * LPARs.
+ * User action:
+ * Install the latest firmware on the adapter hardware. Manually, configure
+ * an IPv6 link local address for this device.
+ */
+
+/*?
+ * Text: "There is no IPv6 support for the layer 3 discipline\n"
+ * Severity: Warning
+ * Description:
+ * If you want to use IPv6 with the layer 3 discipline, you need a Linux kernel
+ * with IPv6 support. Because your Linux kernel has not been compiled with
+ * IPv6 support, you cannot use IPv6 with the layer 3 discipline, even if your
+ * adapter supports IPv6.
+ * User action:
+ * Use a Linux kernel that has been compiled to include IPv6 support if you
+ * want to use IPv6 with layer 3 qeth devices.
+ */
+
+/*?
+ * Text: "%s: The qeth device is not configured for the OSI layer required by z/VM\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A qeth device that connects to a virtual network on z/VM must be configured for the
+ * same Open Systems Interconnection (OSI) layer as the virtual network. An ETHERNET
+ * guest LAN or VSWITCH uses the data link layer (layer 2) while an IP guest LAN
+ * or VSWITCH uses the network layer (layer 3).
+ * User action:
+ * If you are connecting to an ETHERNET guest LAN or VSWITCH, set the layer2 sysfs
+ * attribute of the qeth device to 1. If you are connecting to an IP guest LAN or
+ * VSWITCH, set the layer2 sysfs attribute of the qeth device to 0.
+ */
+
+/*?
+ * Text: "%s: Starting source MAC-address support for %s failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * Description:
+ * The qeth device driver could not enable source MAC-address support on the
+ * network adapter.
+ * User action:
+ * Ungroup and regroup the subchannel triplet of the device. If this does not
+ * resolve the problem, reboot Linux. If the problem persists, gather Linux
+ * debug data and report the problem to your support organization.
+ */
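+
+/*
+ * A sketch of the "ungroup and regroup the subchannel triplet" recovery
+ * step that the messages above refer to, assuming hypothetical subchannels
+ * 0.0.a100,0.0.a101,0.0.a102 (substitute your own triplet):
+ *
+ *   echo 0 > /sys/bus/ccwgroup/devices/0.0.a100/online
+ *   echo 1 > /sys/bus/ccwgroup/devices/0.0.a100/ungroup
+ *   echo 0.0.a100,0.0.a101,0.0.a102 > /sys/bus/ccwgroup/drivers/qeth/group
+ *   echo 1 > /sys/bus/ccwgroup/devices/0.0.a100/online
+ */
+
+/*?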
+ * Text: "%s: MAC address %pM already exists\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: MAC-address + * Description: + * Setting the MAC address for the qeth device fails, because this + * MAC address is already defined on the OSA CHPID. + * User action: + * Use a different MAC address for this qeth device. + */ + +/*? + * Text: "%s: MAC address %pM is not authorized\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: MAC-address + * Description: + * This qeth device is a virtual network interface card (NIC), to which z/VM + * has already assigned a MAC address. z/VM MAC address verification does + * not allow you to change this predefined address. + * User action: + * None; use the MAC address that has been assigned by z/VM. + */ + +/*? + * Text: "%s: The HiperSockets network traffic analyzer is activated\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * The corresponding HiperSockets interface has been switched into promiscuous mode. + * As a result, the HiperSockets network traffic analyzer is started on the device. + * User action: + * None. + */ + + /*? + * Text: "%s: The HiperSockets network traffic analyzer is deactivated\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * Promiscuous mode has been switched off for the corresponding HiperSockets interface + * As a result, the HiperSockets network traffic analyzer is stopped on the device. + * User action: + * None. + */ + +/*? + * Text: "%s: The device is not authorized to run as a HiperSockets network traffic analyzer\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * The corresponding HiperSockets interface is switched into promiscuous mode + * but the network traffic analyzer (NTA) rules configured at the Support Element (SE) + * do not allow tracing. Possible reasons are: + * - Tracing is not authorized for all HiperSockets LANs in the mainframe system + * - Tracing is not authorized for this HiperSockets LAN + * - LPAR is not authorized to enable an NTA + * User action: + * Configure appropriate HiperSockets NTA rules at the SE. + */ + +/*? + * Text: "%s: A HiperSockets network traffic analyzer is already active in the HiperSockets LAN\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * The HiperSockets interface is switched into promiscuous mode but another + * HiperSockets device on the same HiperSockets LAN is already running as + * a network traffic analyzer. + * A HiperSockets LAN can only have one active network traffic analyzer. + * User action: + * Do not configure multiple HiperSockets devices in the same HiperSockets LAN as + * tracing devices. + */ + +/*? + * Text: "%s: Enabling HW TX checksumming for %s failed, using SW TX checksumming\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The network adapter supports hardware checksumming for outgoing IP packages + * but the qeth device driver could not enable hardware TX checksumming on the + * adapter. 
+ * The qeth device driver continues to use software checksumming for
+ * outgoing IP packets.
+ * User action:
+ * None if you do not require hardware checksumming for outgoing network
+ * traffic. If you want to enable hardware checksumming, ungroup and regroup
+ * the subchannel triplet of the device. If this does not resolve the problem,
+ * reboot Linux. If the problem persists, gather Linux debug data and report
+ * the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: A connection could not be established because of an OLM limit\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * z/OS has activated Optimized Latency Mode (OLM) for a connection through an
+ * OSA-Express3 adapter. This reduces the maximum number of concurrent
+ * connections per physical port for shared adapters. The new connection would
+ * exceed the maximum. Linux cannot establish further connections using this
+ * adapter.
+ * User action:
+ * If possible, deactivate an existing connection that uses this adapter and
+ * try again to establish the new connection. If you cannot free an existing
+ * connection, use a different adapter for the new connection.
+ */
+
+/*?
+ * Text: "%s: Setting the device online failed because of insufficient authorization\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * The qeth device is configured with OSX CHPIDs. An OSX CHPID cannot be
+ * activated unless the LPAR is explicitly authorized to access it.
+ * For z/VM guest operating systems, the z/VM user ID must be explicitly
+ * authorized in addition to the LPAR.
+ * You grant these authorizations through the Support Element.
+ * User action:
+ * At the Support Element, authorize the LPAR and, if applicable, the z/VM
+ * user ID for using the OSX CHPIDs with which the qeth device has been
+ * configured. Then try again to set the device online.
+ */
+
+/*?
+ * Text: "%s: portname is deprecated and is ignored\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * An OSA-Express port name was required to identify a shared OSA port.
+ * All operating system instances that shared the port had to use the same
+ * port name. This requirement no longer applies, and the specified portname
+ * attribute is ignored.
+ * User action:
+ * For future upgrades, remove OSA port name specifications from your
+ * network configuration.
+ */
+
+/*? Text: "core functions removed\n" */
+/*? Text: "%s: Device is a%s card%s%s%s\nwith link type %s.\n" */
+/*? Text: "%s: issue_next_read failed: no iob available!\n" */
+/*? Text: "%s: Priority Queueing not supported\n" */
+/*? Text: "%s: sense data available. cstat 0x%X dstat 0x%X\n" */
+/*? Text: "loading core functions\n" */
+/*? Text: "%s: MAC address %pM successfully registered on device %s\n" */
+/*? Text: "%s: Device successfully recovered!\n" */
+/*? Text: "register layer 2 discipline\n" */
+/*? Text: "unregister layer 2 discipline\n" */
+/*? Text: "%s: Hardware IP fragmentation not supported on %s\n" */
+/*? Text: "%s: IPv6 not supported on %s\n" */
+/*? Text: "%s: VLAN not supported on %s\n" */
+/*? Text: "%s: Inbound source MAC-address not supported on %s\n" */
+/*? Text: "%s: IPV6 enabled\n" */
+/*? Text: "%s: ARP processing not supported on %s!\n" */
+/*? Text: "%s: Hardware IP fragmentation enabled \n" */
+/*? Text: "%s: set adapter parameters not supported.\n" */
+/*? Text: "%s: VLAN enabled\n" */
+/*? Text: "register layer 3 discipline\n" */
+/*?
Text: "%s: Outbound TSO enabled\n" */ +/*? Text: "%s: Broadcast not supported on %s\n" */ +/*? Text: "%s: Outbound TSO not supported on %s\n" */ +/*? Text: "%s: Inbound HW Checksumming not supported on %s,\ncontinuing using Inbound SW Checksumming\n" */ +/*? Text: "%s: Using no checksumming on %s.\n" */ +/*? Text: "%s: Broadcast enabled\n" */ +/*? Text: "%s: Multicast not supported on %s\n" */ +/*? Text: "%s: Using SW checksumming on %s.\n" */ +/*? Text: "%s: HW Checksumming (%sbound) enabled\n" */ +/*? Text: "unregister layer 3 discipline\n" */ +/*? Text: "%s: Multicast enabled\n" */ +/*? Text: "%s: QDIO data connection isolation is deactivated\n" */ +/*? Text: "%s: QDIO data connection isolation is activated\n" */ +/*? Text: "%s: Adapter does not support QDIO data connection isolation\n" */ +/*? Text: "%s: Adapter is dedicated. QDIO data connection isolation not supported\n" */ +/*? Text: "%s: TSO does not permit QDIO data connection isolation\n" */ +/*? Text: "%s: HW TX Checksumming enabled\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "qeth_l3: ignoring TR device\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ + +/*? + * Text: "%s: Turning off reflective relay mode at the adjacent switch failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The policy for the QDIO data connection isolation was + * changed successfully, and communications are now handled according to the + * new policy. The ISOLATION_FORWARD policy is no longer used, but the qeth + * device driver could not turn off the reflective relay mode on the adjacent + * switch port. + * User action: + * Check the adjacent switch for errors and correct the problem. + */ + +/*? + * Text: "%s: The adjacent switch port does not support reflective relay mode\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The 'isolation' sysfs attribute of the qeth device could not be set to 'forward'. + * This setting selects the ISOLATION_FORWARD policy for the QDIO data connection + * isolation. The ISOLATION_FORWARD policy requires a network adapter in Virtual + * Ethernet Port Aggregator (VEPA) mode with an adjacent switch port in reflective + * relay mode. + * User action: + * Use a switch port that supports reflective relay mode if you want to use the + * ISOLATION_FORWARD policy for the qeth device. + */ + +/*? + * Text: "%s: The reflective relay mode cannot be enabled at the adjacent switch port" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The 'isolation' sysfs attribute of the qeth device could not be set to 'forward'. + * This setting selects the ISOLATION_FORWARD policy for the QDIO data connection + * isolation. The ISOLATION_FORWARD policy requires a network adapter in Virtual + * Ethernet Port Aggregator (VEPA) mode with an adjacent switch port in reflective relay + * mode. The qeth device driver failed to enable the required reflective relay mode on + * the adjacent switch port although the switch port supports this mode. + * User action: + * Enable reflective relay mode on the switch for the adjacent port and try again. + */ + +/*? 
+ * Text: "%s: Interface %s is down because the adjacent port is no longer in reflective relay mode\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * @2: interface name + * Description: + * The ISOLATION_FORWARD policy is active for the QDIO data connection isolation + * of the qeth device. This policy requires a network adapter in Virtual Ethernet + * Port Aggregator (VEPA) mode with an adjacent switch port in reflective relay mode. + * The reflective relay mode on the adjacent switch port was disabled. The qeth device + * was set offline and the interface was deactivated to prevent any unintended network traffic. + * User action: + * Enable the reflective relay mode again on the adjacent port or use the 'isolation' + * sysfs attribute of the qeth device to set a different policy for the QDIO data connection + * isolation. You can then resume operations by setting the qeth device back + * online and activating the interface. + */ + +/*? + * Text: "%s: Failed to create completion queue\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device could not be configured with a completion queue. + * A completion queue is required to operate AF_IUCV communication in an LPAR. + * User action: + * i) Investigate if you have the latest firmware level in place. + * ii) If the problem persists and is not caused by a version mismatch, contact IBM + * support. + */ + +/*? + * Text: "%s: Completion Queueing supported\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device supports completion queueing. This is required to + * set up AF_IUCV communication in an LPAR. + */ + +/*? + * Text: "%s: Completion Queue support enabled" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device is enabled for completion queueing. This is part of + * the process to set up AF_IUCV communication in an LPAR. + */ + +/*? + * Text: "%s: Completion Queue support disabled" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device is disabled for completion queueing. This device + * cannot or no longer be used to set up AF_IUCV communication in an LPAR. + */ + +/*? + * Text: "%s: The device represents a Bridge Capable Port\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * You can configure this device as a Bridge Port. + * User action: + * None. + */ + +/*? + * Text: "%s: The device is not configured as a Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The Bridge Port role cannot be withdrawn from a device + * that is not configured as a Bridge Port. + * User action: + * None. + */ + +/*? + * Text: "%s: The LAN already has a primary Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A LAN can have multiple secondary Bridge Ports, but only + * one primary Bridge Port. Configuring the device as a + * primary Bridge Port failed because another port on the + * LAN has been configured as the primary Bridge Port. + * User action: + * Find out which operating system instance has configured the primary + * Bridge Port. Assure that the primary role for this port is withdrawn + * before trying again to configure your device as the primary Bridge + * Port. 
+ * Alternatively, consider configuring your device as a secondary Bridge
+ * Port.
+ */
+
+/*?
+ * Text: "%s: The device is already a secondary Bridge Port\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A device cannot be configured as a primary or secondary
+ * Bridge Port if it is already configured as a secondary Bridge Port.
+ * User action:
+ * None, if you want the device to be a secondary Bridge Port.
+ * If you want to configure the device as the primary Bridge Port,
+ * withdraw the secondary role by writing 'none' to the 'bridgeport_role'
+ * sysfs attribute of the device. Then try again to configure the
+ * device as the primary Bridge Port.
+ */
+
+/*?
+ * Text: "%s: The LAN cannot have more secondary Bridge Ports\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A LAN can have up to five secondary Bridge Ports.
+ * You cannot configure a further device as a secondary
+ * Bridge Port unless the Bridge Port role is withdrawn from one of
+ * the existing secondary Bridge Ports.
+ * User action:
+ * Ensure that the Bridge Port role is withdrawn from one of the
+ * existing secondary Bridge Ports before trying again to configure your
+ * device as a secondary Bridge Port.
+ */
+
+/*?
+ * Text: "%s: The device is already a primary Bridge Port\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * A device cannot be configured as a primary or secondary
+ * Bridge Port if it is already configured as a primary Bridge Port.
+ * User action:
+ * None, if you want the device to be a primary Bridge Port.
+ * If you want to configure the device as a secondary Bridge Port,
+ * withdraw the primary role by writing 'none' to the 'bridgeport_role'
+ * sysfs attribute of the device. Then try again to configure the
+ * device as the secondary Bridge Port.
+ */
+
+/*?
+ * Text: "%s: The device is not authorized to be a Bridge Port\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * The device cannot be configured as a Bridge Port because
+ * the required authorizations in the hardware are not in place.
+ * User action:
+ * See your hardware documentation about how to authorize
+ * ports for becoming a Bridge Port.
+ */
+
+/*?
+ * Text: "%s: A Bridge Port is already configured by a different operating system\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * Linux instances cannot configure the target port as a Bridge Port.
+ * Another operating system already uses a Bridge Port on the HiperSockets
+ * or on the OSA adapter. For example, a z/VM instance might be using
+ * a port in a VSWITCH configuration. Multiple Bridge Ports on the same
+ * HiperSockets or OSA adapter must be configured by instances of the same
+ * operating system, for example, all Linux or all z/VM.
+ * User action:
+ * Reconsider your network topology. Configure Bridge Ports only for ports
+ * on adapters where any other Bridge Ports are configured by other Linux
+ * instances.
+ */
+
+/*?
+ * Text: "%s: Setting address notification failed\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * Enabling or disabling the address notification feature of a
+ * HiperSockets device failed. The device might not be configured as a
+ * Bridge Port.
+ * User action:
+ * None, unless you need address notifications for this device.
+ * If you need notifications, confirm that your device is attached to a
+ * HiperSockets LAN that supports Bridge Capable Ports and that your
+ * device is configured as a Bridge Port. If the 'bridgeport_role'
+ * sysfs attribute of the device contains one of the values 'primary'
+ * or 'secondary' and you cannot set the address notification, contact
+ * your support organization.
+ */
+
+/*?
+ * Text: "%s: Address notification from the Bridge Port stopped %s (%s)\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * @2: network interface name
+ * @3: error reported by the hardware
+ * Description:
+ * A Bridge Port no longer provides address notifications.
+ * Possible reasons include traffic overflow and that the device is no
+ * longer configured as a Bridge Port. A udev event with
+ * BRIDGEDHOST=abort was emitted to alert applications that rely on the
+ * address notifications.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The qeth driver ran out of channel command buffers\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the qeth device
+ * Description:
+ * Command buffers can temporarily run out during periods of
+ * intense network configuration activities.
+ * The device driver recovers from this condition as outstanding
+ * commands are completed.
+ * User action:
+ * Wait for a short time. If the problem persists,
+ * initiate a recovery process by writing '1' to the 'recover'
+ * sysfs attribute of the device.
+ */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/s390dbf
+++ linux-azure-4.13.0/Documentation/kmsg/s390/s390dbf
@@ -0,0 +1,83 @@
+/*?
+ * Text: "Root becomes the owner of all s390dbf files in sysfs\n"
+ * Severity: Warning
+ * Description:
+ * The S/390 debug feature you are using only supports uid/gid = 0.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "Registering debug feature %s failed\n"
+ * Severity: Error
+ * Parameter:
+ * @1: feature name
+ * Description:
+ * The initialization of an S/390 debug feature failed. A likely cause of this
+ * problem is memory constraints. The system keeps running, but the debug
+ * data for this feature will not be available in sysfs.
+ * User action:
+ * Consider assigning more memory to your LPAR or z/VM guest virtual machine.
+ */
+
+/*?
+ * Text: "Registering view %s/%s would exceed the maximum number of views %i\n"
+ * Severity: Error
+ * Parameter:
+ * @1: feature name
+ * @2: view name
+ * @3: maximum
+ * Description:
+ * The maximum number of allowed debug feature views has been reached. The
+ * view has not been registered. The system keeps running but the new view
+ * will not be available in sysfs. This is a program error.
+ * User action:
+ * Report this problem to your support partner.
+ */
+
+/*?
+ * Text: "%s is not a valid level for a debug feature\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: level
+ * Description:
+ * Setting a new level for a debug feature by using the 'level' sysfs attribute
+ * failed. Valid levels are the minus sign (-) and the integers in the
+ * range 0 to 6. The minus sign switches off the feature. The numbers switch
+ * the feature on, where higher numbers produce more debug output.
+ * User action:
+ * Write a valid value to the 'level' sysfs attribute.
+ */
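+
+/*
+ * A sketch of working with the 'level' attribute described above, assuming
+ * debugfs is mounted at /sys/kernel/debug and a hypothetical debug feature
+ * named qeth_setup:
+ *
+ *   # produce more debug output
+ *   echo 6 > /sys/kernel/debug/s390dbf/qeth_setup/level
+ *   # switch the feature off
+ *   echo - > /sys/kernel/debug/s390dbf/qeth_setup/level
+ */
+
+/*?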
+ * Text: "Flushing debug data failed because %c is not a valid area\n" + * Severity: Informational + * Parameter: + * @1: debug area number + * Description: + * Flushing a debug area by using the 'flush' sysfs attribute failed. Valid + * values are the minus sign (-) for flushing all areas, or the number of the + * respective area for flushing a single area. + * User action: + * Write a valid area number or the minus sign (-) to the 'flush' sysfs + * attribute. + */ + +/*? + * Text: "Allocating memory for %i pages failed\n" + * Severity: Informational + * Parameter: + * @1: number of pages + * Description: + * Setting the debug feature size by using the 'page' sysfs attribute failed. + * Linux did not have enough memory for expanding the debug feature to the + * requested size. + * User action: + * Use a smaller number of pages for the debug feature or allocate more + * memory to your LPAR or z/VM guest virtual machine. + */ + +/*? Text: "%s: set new size (%i pages)\n" */ +/*? Text: "%s: switched off\n" */ +/*? Text: "%s: level %i is out of range (%i - %i)\n" */ +/*? Text: "Registering view %s/%s failed due to out of memory\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/sclp_cmd +++ linux-azure-4.13.0/Documentation/kmsg/s390/sclp_cmd @@ -0,0 +1,44 @@ +/*? Text: "sync request failed (cmd=0x%08x, status=0x%02x)\n" */ +/*? Text: "readcpuinfo failed (response=0x%04x)\n" */ +/*? Text: "configure cpu failed (cmd=0x%08x, response=0x%04x)\n" */ +/*? Text: "configure channel-path failed (cmd=0x%08x, response=0x%04x)\n" */ +/*? Text: "read channel-path info failed (response=0x%04x)\n" */ +/*? Text: "assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n" */ +/*? Text: "configure PCI I/O adapter failed: cmd=0x%08x response=0x%04x\n" */ +/*? Text: "request failed (status=0x%02x)\n" */ +/*? Text: "request failed with response code 0x%x\n" */ + +/*? + * Text: "Memory hotplug state changed, suspend refused.\n" + * Severity: Error + * Description: + * Suspend is refused after a memory hotplug operation was performed. + * User action: + * The system needs to be restarted and no memory hotplug operation must be + * performed in order to allow suspend. + */ + +/*? + * Text: "Standby memory at 0x%llx (%lluM of %lluM usable)\n" + * Severity: Informational + * Parameter: + * @1: start address of standby memory + * @2: usable memory in MB + * @3: total detected memory in MB + * Description: + * Standby memory was detected. It can be used for memory hotplug only + * if it is aligned to the Linux hotplug memory block size. + * If the aligned amount of memory matches the total amount, + * all detected standby memory can be used. Otherwise, some of the detected + * memory is unaligned and cannot be used. + * User action: + * None, if the usable and the total amount of detected standby memory match. + * If the amounts of memory do not match, + * check the memory setup of your guest virtual machine and ensure that + * the standby memory start and end + * address is aligned to the Linux hotplug memory block size. + * On Linux, issue "cat /sys/devices/system/memory/block_size_bytes" + * to find the hotplug memory block size value in hexadecimal notation. + * On z/VM, query your memory setup with "vmcp q v store". + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/sclp_config +++ linux-azure-4.13.0/Documentation/kmsg/s390/sclp_config @@ -0,0 +1,15 @@ +/*? 
+ * Text: "CPU capability may have changed\n" + * Severity: Informational + * Description: + * The capability of the CPUs in the configuration may have been upgraded + * or downgraded. This message may also appear if the capability of the + * CPUs in the configuration did not change. + * For details see the STORE SYSTEM INFORMATION description in the + * "Principles of Operation." + * User action: + * The user can examine /proc/sysinfo for CPU capability values. + */ +/*? Text: "Open for Business request failed with response code 0x%04x\n" */ +/*? Text: "SCLP receiver did not register to receive Configuration Management Data Events.\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/sclp_cpi +++ linux-azure-4.13.0/Documentation/kmsg/s390/sclp_cpi @@ -0,0 +1,3 @@ +/*? Text: "request failed (status=0x%02x)\n" */ +/*? Text: "request failed with response code 0x%x\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/sclp_ocf +++ linux-azure-4.13.0/Documentation/kmsg/s390/sclp_ocf @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/sclp_sdias +++ linux-azure-4.13.0/Documentation/kmsg/s390/sclp_sdias @@ -0,0 +1,4 @@ +/*? Text: "sclp_send failed for get_nr_blocks\n" */ +/*? Text: "SCLP error: %x\n" */ +/*? Text: "sclp_send failed: %x\n" */ +/*? Text: "Error from SCLP while copying hsa. Event status = %x\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/scm_block +++ linux-azure-4.13.0/Documentation/kmsg/s390/scm_block @@ -0,0 +1,51 @@ +/*? + * Text: "%lx: The capabilities of the SCM increment changed\n" + * Severity: Informational + * Parameter: + * @1: start address of the SCM increment + * Description: + * A configuration change is in progress for the storage class memory (SCM) + * increment. + * User action: + * Verify that the capability of the SCM increment is as intended; for + * example, with lsscm. + */ + +/*? + * Text: "An I/O operation to SCM failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * An error occurred during I/O to storage class memory (SCM). The operation + * was repeated, but the maximum number of retries was exceeded before the + * request could be fulfilled. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "%lx: Write access to the SCM increment is suspended\n" + * Severity: Informational + * Parameter: + * @1: start address of the SCM increment + * Description: + * A concurrent firmware upgrade is in progress. For the duration of the + * upgrade, write access to the storage class memory (SCM) increment has been + * suspended. + * User action: + * None. + */ + +/*? + * Text: "%lx: Write access to the SCM increment is restored\n" + * Severity: Informational + * Parameter: + * @1: start address of the SCM increment + * Description: + * Write access to the storage class memory (SCM) increment was restored + * after a temporary suspension during a concurrent firmware upgrade. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/setup +++ linux-azure-4.13.0/Documentation/kmsg/s390/setup @@ -0,0 +1,165 @@ +/*? 
+ * Text: "The initial RAM disk does not fit into the memory\n" + * Severity: Error + * Description: + * The load address and the size of the initial RAM disk specify a memory + * area that is not available. + * User action: + * Lower the load address of the initial RAM disk, reduce the size of the + * initial RAM disk, or increase the size of the system memory to make the + * initial RAM disk fit into the memory. + */ + +/*? + * Text: "The maximum memory size is %luMB\n" + * Severity: Notice + * Parameter: + * @1: size in MB + * Description: + * The system memory size cannot exceed the amount of memory that is + * provided by the real or virtual hardware. It can be further reduced + * through an upper memory address limit that is specified with the + * mem= kernel parameter. + * User action: + * None. + */ + +/*? + * Text: "Linux is running as a z/VM guest operating system in 31-bit mode\n" + * Severity: Informational + * Description: + * The 31-bit Linux kernel detected that it is running as a guest operating + * system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Linux is running natively in 31-bit mode\n" + * Severity: Informational + * Description: + * The 31-bit Linux kernel detected that it is running on an IBM mainframe, + * either as the sole operating system in an LPAR or as the sole operating + * system on the entire mainframe. The Linux kernel is not running as a + * guest operating system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "The hardware system has IEEE compatible floating point units\n" + * Severity: Informational + * Description: + * The Linux kernel detected that it is running on a hardware system with + * CPUs that have IEEE compatible floating point units. + * User action: + * None. + */ + +/*? + * Text: "The hardware system has no IEEE compatible floating point units\n" + * Severity: Informational + * Description: + * The Linux kernel detected that it is running on a hardware system with + * CPUs that do not have IEEE compatible floating point units. + * User action: + * None. + */ + +/*? + * Text: "Linux is running as a z/VM guest operating system in 64-bit mode\n" + * Severity: Informational + * Description: + * The 64-bit Linux kernel detected that it is running as a guest operating + * system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Linux is running under KVM in 64-bit mode\n" + * Severity: Informational + * Description: + * The 64-bit Linux kernel detected that it is running as a guest operating + * system of the KVM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Linux is running natively in 64-bit mode\n" + * Severity: Informational + * Description: + * The 64-bit Linux kernel detected that it is running on an IBM mainframe, + * either as the sole operating system in an LPAR or as the sole operating + * system on the entire mainframe. The Linux kernel is not running as a + * guest operating system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Defining the Linux kernel NSS failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The Linux kernel could not define the named saved system (NSS) with + * the z/VM CP DEFSYS command. The return code represents the numeric + * portion of the CP DEFSYS error message. + * User action: + * For return code 1, the z/VM guest virtual machine is not authorized + * to define named saved systems. 
+ * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP DEFSYS command (typically privilege class E). + * For other return codes, see the help and message documentation for + * the CP DEFSYS command. + */ + +/*? + * Text: "Saving the Linux kernel NSS failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The Linux kernel could not save the named saved system (NSS) with + * the z/VM CP SAVESYS command. The return code represents the numeric + * portion of the CP SAVESYS error message. + * User action: + * For return code 1, the z/VM guest virtual machine is not authorized + * to save named saved systems. + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP SAVESYS command (typically privilege class E). + * For other return codes, see the help and message documentation for + * the CP SAVESYS command. + */ + +/*? + * Text: "crashkernel reservation failed: %s\n" + * Severity: Informational + * Parameter: + * @1: reason string + * Description: + * The memory reservation for the kdump "crashkernel" parameter was not + * successful. The Linux kernel was either not able to find a free memory + * area or an invalid area has been defined. The reason string describes the + * cause of the failure in more detail. + * User action: + * Increase the memory footprint of your virtual machine or adjust the values + * for the "crashkernel" kernel parameter. Then boot your Linux system again. + */ + +/*? + * Text: "Reserving %lluMB of memory at %lluMB for crashkernel (System RAM: %luMB)\n" + * Severity: Informational + * Parameter: + * @1: amount of reserved memory + * @2: storage location of reserved memory + * @3: amount of system RAM + * Description: + * The memory reservation for the kdump "crashkernel" parameter was successful + * and a kdump kernel can now be loaded with the kexec tool. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/smsgiucv +++ linux-azure-4.13.0/Documentation/kmsg/s390/smsgiucv @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/smsgiucv_app +++ linux-azure-4.13.0/Documentation/kmsg/s390/smsgiucv_app @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/tape +++ linux-azure-4.13.0/Documentation/kmsg/s390/tape @@ -0,0 +1,63 @@ +/*? + * Text: "%s: A tape unit was detached while in use\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A tape unit has been detached from the I/O configuration while a tape + * was being accessed. This typically results in I/O error messages and + * potentially in damaged data on the tape. + * User action: + * Check the output of the application that accesses the tape device. + * If this problem occurred during a write-type operation, consider repeating + * the operation after bringing the tape device back online. + */ + +/*? + * Text: "%s: A tape cartridge has been mounted\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the tape device + * Description: + * A tape cartridge has been inserted into the tape unit. The tape in the + * tape unit is ready to be accessed. + * User action: + * None. + */ + +/*? 
+ * Text: "%s: The tape cartridge has been successfully unloaded\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape cartridge has been unloaded from the tape unit. Insert a tape + * cartridge before accessing the tape device. + * User action: + * None. + */ + +/*? + * Text: "A cartridge is loaded in tape device %s, refusing to suspend\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * Description: + * A request to suspend a tape device currently loaded with a cartridge is + * rejected. + * User action: + * Unload the tape device. Then try to suspend the system again. + */ + +/*? + * Text: "Tape device %s is busy, refusing to suspend\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * Description: + * A request to suspend a tape device being currently in use is rejected. + * User action: + * Terminate applications performing tape operations + * and then try to suspend the system again. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/tape_34xx +++ linux-azure-4.13.0/Documentation/kmsg/s390/tape_34xx @@ -0,0 +1,418 @@ +/*? + * Text: "%s: An unexpected condition %d occurred in tape error recovery\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * @2: number + * Description: + * The control unit has reported an error condition that is not recognized by + * the error recovery process of the tape device driver. + * User action: + * Report this problem and the condition number from the message to your + * support organization. + */ + +/*? + * Text: "%s: A data overrun occurred between the control unit and tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A data overrun error has occurred on the connection between the control + * unit and the tape unit. If this problem occurred during a write-type + * operation, the integrity of the data on the tape might be compromised. + * User action: + * Use a faster connection. If this problem occurred during a write-type + * operation, consider repositioning the tape and repeating the operation. + */ + +/*? + * Text: "%s: The block ID sequence on the tape is incorrect\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The control unit has detected an incorrect block ID sequence on the tape. + * This problem typically indicates that the data on the tape is damaged. + * User action: + * If this problem occurred during a write-type operation reposition the tape + * and repeat the operation. + */ + +/*? + * Text: "%s: A read error occurred that cannot be recovered\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A read error has occurred that cannot be recovered. The current tape might + * be damaged. + * User action: + * None. + */ + +/*? + * Text: "%s: A write error on the tape cannot be recovered\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A write error has occurred that could not be recovered by the automatic + * error recovery process. + * User action: + * Use a different tape cartridge. + */ + +/*? + * Text: "%s: Writing the ID-mark failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The ID-mark at the beginning of tape could not be written. The tape medium + * might be write-protected. 
+ * User action:
+ * Try a different tape cartridge. Ensure that the write-protection on the
+ * cartridge is switched off.
+ */
+
+/*?
+ * Text: "%s: Reading the tape beyond the end of the recorded area failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * A read-type operation failed because it extended beyond the end of the
+ * recorded area on the tape medium.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The tape contains an incorrect block ID sequence\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * The control unit has detected an incorrect block ID sequence on the tape.
+ * This problem typically indicates that the data on the tape is damaged.
+ * User action:
+ * If this problem occurred during a write-type operation, reposition the tape
+ * and repeat the operation.
+ */
+
+/*?
+ * Text: "%s: A path equipment check occurred for the tape device\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * A path equipment check has occurred. This check indicates problems with the
+ * connection between the mainframe system and the tape control unit.
+ * User action:
+ * Ensure that the cable connections between the mainframe system and the
+ * control unit are securely in place and not damaged.
+ */
+
+/*?
+ * Text: "%s: The tape unit cannot process the tape format\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * Either the tape unit is not able to read the format ID mark, or the
+ * specified format is not supported by the tape unit.
+ * User action:
+ * If you do not need the data recorded on the current tape, use a different
+ * tape or write a new format ID mark at the beginning of the tape. Be aware
+ * that writing a new ID mark leads to a loss of all data that has been
+ * recorded on the tape. If you need the data on the current tape, use a tape
+ * unit that supports the tape format.
+ */
+
+/*?
+ * Text: "%s: The tape medium is write-protected\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * A write-type operation failed because the tape medium is write-protected.
+ * User action:
+ * Eject the tape cartridge, switch off the write protection on the cartridge,
+ * insert the cartridge, and try the operation again.
+ */
+
+/*?
+ * Text: "%s: The tape does not have the required tape tension\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * The tape does not have the required tape tension.
+ * User action:
+ * Rewind and reposition the tape, then repeat the operation.
+ */
+
+/*?
+ * Text: "%s: The tape unit failed to load the cartridge\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * An error has occurred while loading the tape cartridge.
+ * User action:
+ * Unload the cartridge and load it again.
+ */
+
+/*?
+ * Text: "%s: Automatic unloading of the tape cartridge failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the tape device
+ * Description:
+ * The tape unit failed to unload the cartridge.
+ * User action:
+ * Unload the cartridge manually by using the eject button on the tape unit.
+ */
+
+/*?
+ * Text: "%s: An equipment check has occurred on the tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * Possible reasons for the check condition are a unit adapter error, a buffer + * error on the lower interface, an unusable internal path, or an error that + * has occurred while loading the cartridge. + * User action: + * Examine the tape unit and the cartridge loader. Consult the tape unit + * documentation for details. + */ + +/*? + * Text: "%s: The tape information states an incorrect length\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape is shorter than stated at the beginning of the tape data. A + * possible reason for this problem is that the tape might have been physically + * truncated. Data written to the tape might be incomplete or damaged. + * User action: + * If this problem occurred during a write-type operation, consider repeating + * the operation with a different tape cartridge. + */ + +/*? + * Text: "%s: The tape unit is not ready\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit is online but not ready. + * User action: + * Turn the ready switch on the tape unit to the ready position and try the + * operation again. + */ + +/*? + * Text: "%s: The tape medium has been rewound or unloaded manually\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit rewind button, unload button, or both have been used to + * rewind or unload the tape cartridge. A tape cartridge other than the + * intended cartridge might have been inserted or the tape medium might not + * be at the expected position. + * User action: + * Verify that the correct tape cartridge has been inserted and that the tape + * medium is at the required position before continuing to work with the tape. + */ + +/*? + * Text: "%s: The tape subsystem is running in degraded mode\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape subsystem is not operating at its maximum performance. + * User action: + * Contact your service representative for the tape unit and report this + * problem. + */ + +/*? + * Text: "%s: The tape unit is already assigned\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit is already assigned to another channel path. + * User action: + * Free the tape unit from the operating system instance to which it is + * currently assigned then try again. + */ + +/*? + * Text: "%s: The tape unit is not online\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit is not online to the tape device driver. + * User action: + * Ensure that the tape unit is operational and that the cable connections + * between the control unit and the tape unit are securely in place and not + * damaged. + */ + +/*? + * Text: "%s: The control unit has fenced access to the tape volume\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The control unit fences further access to the current tape volume. The data + * integrity on the tape volume might have been compromised. + * User action: + * Rewind and unload the tape cartridge. + */ + +/*? 
+ * Text: "%s: A parity error occurred on the tape bus\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A data parity check error occurred on the bus. Data that was read or written + * while the error occurred is not valid. + * User action: + * Reposition the tape and repeat the read-type or write-type operation. + */ + +/*? + * Text: "%s: I/O error recovery failed on the tape control unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An I/O error occurred that cannot be recovered by the automatic error + * recovery process of the tape control unit. The application that operates + * the tape unit will receive a return value of -EIO which indicates an + * I/O error. The data on the tape might be damaged. + * User action: + * If this problem occurred during a write-type operation, consider + * repositioning the tape and repeating the operation. + */ + +/*? + * Text: "%s: The tape unit requires a firmware update\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit requires firmware patches from the tape control unit but the + * required patches are not available on the control unit. + * User action: + * Make the require patches available on the control unit then reposition the + * tape and retry the operation. For details about obtaining and installing + * firmware updates see the control unit documentation. + */ + +/*? + * Text: "%s: The maximum block size for buffered mode is exceeded\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The block to be written is larger than allowed for the buffered mode. + * User action: + * Use a smaller block size. + */ + +/*? + * Text: "%s: A channel interface error cannot be recovered\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An error has occurred on the channel interface. This error cannot + * be recovered by the control unit error recovery process. + * User action: + * See the documentation of the control unit. + */ + +/*? + * Text: "%s: A channel protocol error occurred\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An error was detected in the channel protocol. + * User action: + * Reposition the tape and try the operation again. + */ + +/*? + * Text: "%s: The tape unit does not support the compaction algorithm\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit cannot read the current tape. The data on the tape has been + * compressed with an algorithm that is not supported by the tape unit. + * User action: + * Use a tape unit that supports the compaction algorithm used for the + * current tape. + */ + +/*? + * Text: "%s: The tape unit does not support tape format 3480-2 XF\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit does not support tapes recorded in the 3480-2 XF format. + * User action: + * If you do not need the data recorded on the current tape, rewind the tape + * and overwrite it with a supported format. If you need the data on the + * current tape, use a tape unit that supports the tape format. + */ + +/*? + * Text: "%s: The tape unit does not support format 3480 XF\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit does not support tapes recorded in the 3480 XF format. 
+ * User action: + * If you do not need the data recorded on the current tape, rewind the tape + * and overwrite it with a supported format. If you need the data on the + * current tape, use a tape unit that supports the tape format. + */ + +/*? + * Text: "%s: The tape unit does not support the current tape length\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The length of the tape in the cartridge is incompatible with the tape unit. + * User action: + * Either use a different tape unit or use a tape with a supported length. + */ + +/*? + * Text: "%s: The tape unit does not support the tape length\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The length of the tape in the cartridge is incompatible with the tape + * unit. + * User action: + * Either use a different tape unit or use a tape with a supported length. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/tape_3590 +++ linux-azure-4.13.0/Documentation/kmsg/s390/tape_3590 @@ -0,0 +1,183 @@ +/*? + * Text: "%s: The tape medium must be loaded into a different tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape device has indicated an error condition that requires loading + * the tape cartridge into a different tape unit to recover. + * User action: + * Unload the cartridge and use a different tape unit to retry the operation. + */ + +/*? + * Text: "%s: Tape media information: exception %s, service %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: exception + * @3: service + * Description: + * This is an operating system independent tape medium information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: Device subsystem information: exception %s, service %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: exception + * @3: required service action + * Description: + * This is an operating system independent device subsystem information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: I/O subsystem information: exception %s, service %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: exception + * @3: required service action + * Description: + * This is an operating system independent I/O subsystem information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit has issued sense message %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: sense message code + * Description: + * The tape unit has issued an operating system independent sense message. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? 
+ * Text: "%s: The tape unit has issued an unknown sense message code 0x%x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: code + * Description: + * The tape device driver has received an unknown sense message from the + * tape unit. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: MIM SEV=%i, MC=%02x, ES=%x/%x, RC=%02x-%04x-%02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: SEV + * @3: message code + * @4: exception + * @5: required service action + * @6: refcode + * @7: mid + * @8: fid + * Description: + * This is an operating system independent information message that was + * issued by the tape unit. The information in the message is intended for + * the IBM customer engineer. + * User action: + * See to the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: IOSIM SEV=%i, DEVTYPE=3590/%02x, MC=%02x, ES=%x/%x, REF=0x%04x-0x%04x-0x%04x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: SEV + * @3: model + * @4: message code + * @5: exception + * @6: required service action + * @7: refcode1 + * @8: refcode2 + * @9: refcode3 + * Description: + * This is an operating system independent I/O subsystem information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: DEVSIM SEV=%i, DEVTYPE=3590/%02x, MC=%02x, ES=%x/%x, REF=0x%04x-0x%04x-0x%04x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: SEV + * @3: model + * @4: message code + * @5: exception + * @6: required service action + * @7: refcode1 + * @8: refcode2 + * @9: refcode3 + * Description: + * This is an operating system independent device subsystem information message + * issued by the tape unit. The information in the message is intended for + * the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit has issued an unknown sense message code %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: code + * Description: + * The tape device has issued a sense message, that is unknown to the device + * driver. + * User action: + * Use the message code printed as hexadecimal value and see the documentation + * for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit failed to obtain the encryption key from EKM\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit was unable to retrieve the encryption key required to decode + * the data on the tape from the enterprise key manager (EKM). + * User action: + * See the EKM and tape unit documentation for information about how to enable + * the tape unit to retrieve the encryption key. + */ + +/*? + * Text: "%s: A different host has privileged access to the tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * You cannot access the tape unit because a different operating system + * instance has privileged access to the unit. + * User action: + * Unload the current cartridge to solve this problem. + */ +/*? 
Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/time +++ linux-azure-4.13.0/Documentation/kmsg/s390/time @@ -0,0 +1,36 @@ +/*? + * Text: "The ETR interface has adjusted the clock by %li microseconds\n" + * Severity: Notice + * Parameter: + * @1: number of microseconds + * Description: + * The external time reference (ETR) interface has synchronized the system + * clock with the external reference and set it to a new value. The time + * difference between the old and new clock value has been passed to the + * network time protocol (NTP) as a single shot adjustment. + * User action: + * None. + */ + +/*? + * Text: "The real or virtual hardware system does not provide an ETR interface\n" + * Severity: Warning + * Description: + * The 'etr=' parameter has been passed on the kernel parameter line for + * a Linux instance that does not have access to the external time reference + * (ETR) facility. + * User action: + * To avoid this warning remove the 'etr=' kernel parameter. + */ + +/*? + * Text: "The real or virtual hardware system does not provide an STP interface\n" + * Severity: Warning + * Description: + * The 'stp=' parameter has been passed on the kernel parameter line for + * a Linux instance that does not have access to the server time protocol + * (STP) facility. + * User action: + * To avoid this warning remove the 'stp=' kernel parameter. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/vmlogrdr +++ linux-azure-4.13.0/Documentation/kmsg/s390/vmlogrdr @@ -0,0 +1,19 @@ +/*? Text: "vmlogrdr: failed to start recording automatically\n" */ +/*? Text: "vmlogrdr: connection severed with reason %i\n" */ +/*? Text: "vmlogrdr: iucv connection to %s failed with rc %i \n" */ +/*? Text: "vmlogrdr: failed to stop recording automatically\n" */ +/*? Text: "not running under VM, driver not loaded.\n" */ + +/*? + * Text: "vmlogrdr: device %s is busy. Refuse to suspend.\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * Suspending vmlogrdr devices that are in uses is not supported. + * A request to suspend such a device is refused. + * User action: + * Close all applications that use any of the vmlogrdr devices + * and then try to suspend the system again. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/vmur +++ linux-azure-4.13.0/Documentation/kmsg/s390/vmur @@ -0,0 +1,48 @@ +/*? + * Text: "The %s cannot be loaded without z/VM\n" + * Severity: Error + * Parameter: + * @1: z/VM virtual unit record device driver + * Description: + * The z/VM virtual unit record device driver provides Linux with access to + * z/VM virtual unit record devices like punch card readers, card punches, and + * line printers. On Linux instances that run in environments other than the + * z/VM hypervisor, the device driver does not provide any useful function and + * the corresponding vmur module cannot be loaded. + * User action: + * Load the vmur module only on Linux instances that run as guest operating + * systems of the z/VM hypervisor. If the z/VM virtual unit record device + * has been compiled into the kernel, ignore this message. + */ + +/*? 
+ * Text: "Kernel function alloc_chrdev_region failed with error code %d\n" + * Severity: Error + * Parameter: + * @1: error code according to errno definitions + * Description: + * The z/VM virtual unit record device driver (vmur) needs to register a range + * of character device minor numbers from 0x0000 to 0xffff. + * This registration failed, probably because of memory constraints. + * User action: + * Free some memory and reload the vmur module. If the z/VM virtual unit + * record device driver has been compiled into the kernel reboot Linux. + * Consider assigning more memory to your LPAR or z/VM guest virtual machine. + */ + +/*? + * Text: "Unit record device %s is busy, %s refusing to suspend.\n" + * Severity: Error + * Parameter: + * @1: bus ID of the unit record device + * @1: z/VM virtual unit record device driver + * Description: + * Linux cannot be suspended while a unit record device is in use. + * User action: + * Stop all applications that work on z/VM spool file queues, for example, the + * vmur tool. Then try again to suspend Linux. + */ + +/*? Text: "%s loaded.\n" */ +/*? Text: "%s unloaded.\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/xpram +++ linux-azure-4.13.0/Documentation/kmsg/s390/xpram @@ -0,0 +1,74 @@ +/*? + * Text: "%d is not a valid number of XPRAM devices\n" + * Severity: Error + * Parameter: + * @1: number of partitions + * Description: + * The number of XPRAM partitions specified for the 'devs' module parameter + * or with the 'xpram.parts' kernel parameter must be an integer in the + * range 1 to 32. The XPRAM device driver created a maximum of 32 partitions + * that are probably not configured as intended. + * User action: + * If the XPRAM device driver has been compiled as a separate module, + * unload the module and load it again with a correct value for the 'devs' + * module parameter. If the XPRAM device driver has been compiled + * into the kernel, correct the 'xpram.parts' parameter in the kernel + * command line and restart Linux. + */ + +/*? + * Text: "Not enough expanded memory available\n" + * Severity: Error + * Description: + * The amount of expanded memory required to set up your XPRAM partitions + * depends on the 'sizes' parameter specified for the xpram module or on + * the specifications for the 'xpram.parts' parameter if the XPRAM device + * driver has been compiled into the kernel. Your + * current specification exceed the amount of available expanded memory. + * Your XPRAM partitions are probably not configured as intended. + * User action: + * If the XPRAM device driver has been compiled as a separate module, + * unload the xpram module and load it again with an appropriate value + * for the 'sizes' module parameter. If the XPRAM device driver has been + * compiled into the kernel, adjust the 'xpram.parts' parameter in the + * kernel command line and restart Linux. If you need more than the + * available expanded memory, increase the expanded memory allocation for + * your virtual hardware or LPAR. + */ + +/*? + * Text: "No expanded memory available\n" + * Severity: Error + * Description: + * The XPRAM device driver has been loaded in a Linux instance that runs + * in an LPAR or virtual hardware without expanded memory. + * No XPRAM partitions are created. + * User action: + * Allocate expanded memory for your LPAR or virtual hardware or do not + * load the xpram module. You can ignore this message, if you do not want + * to create XPRAM partitions. 
+ */
+
+/*?
+ * Text: "Resuming the system failed: %s\n"
+ * Severity: Error
+ * Parameter:
+ * @1: cause of the failure
+ * Description:
+ * A system cannot be resumed if the expanded memory setup changes
+ * after hibernation. Possible reasons for the failure are:
+ * - Expanded memory was removed after hibernation.
+ * - Size of the expanded memory changed after hibernation.
+ * The system is stopped with a kernel panic.
+ * User action:
+ * Reboot Linux.
+ */
+
+/*? Text: " number of devices (partitions): %d \n" */
+/*? Text: " size of partition %d: %u kB\n" */
+/*? Text: " size of partition %d to be set automatically\n" */
+/*? Text: " memory needed (for sized partitions): %lu kB\n" */
+/*? Text: " partitions to be sized automatically: %d\n" */
+/*? Text: " automatically determined partition size: %lu kB\n" */
+/*? Text: " %u pages expanded memory found (%lu KB).\n" */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/zcrypt
+++ linux-azure-4.13.0/Documentation/kmsg/s390/zcrypt
@@ -0,0 +1,22 @@
+/*?
+ * Text: "Cryptographic device %02x.%04x failed and was set offline\n"
+ * Severity: Error
+ * Parameter:
+ * @1: AP device ID
+ * @2: AP queue
+ * Description:
+ * A cryptographic device failed to process a cryptographic request.
+ * The cryptographic device driver could not correct the error and
+ * set the device offline. The application that issued the
+ * request received an indication that the request has failed.
+ * User action:
+ * Use the lszcrypt command to confirm that the cryptographic
+ * hardware is still configured to your LPAR or z/VM guest virtual
+ * machine. If the device is available to your Linux instance the
+ * command output contains a line that begins with 'card<ID>',
+ * where <ID> is the two-digit decimal number in the message text.
+ * After ensuring that the device is available, use the chzcrypt command to
+ * set it online again.
+ * If the error persists, contact your support organization.
+ */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/zdump
+++ linux-azure-4.13.0/Documentation/kmsg/s390/zdump
@@ -0,0 +1,27 @@
+/*?
+ * Text: "The 32-bit dump tool cannot be used for a 64-bit system\n"
+ * Severity: Alert
+ * Description:
+ * The dump process ends without creating a system dump.
+ * User action:
+ * Use a 64-bit dump tool to obtain a system dump for a 64-bit Linux instance.
+ */
+/*?
+ * Text: "The 64-bit dump tool cannot be used for a 32-bit system\n"
+ * Severity: Alert
+ * Description:
+ * The dump process ends without creating a system dump.
+ * User action:
+ * Use a 32-bit dump tool to obtain a system dump for a 32-bit Linux instance.
+ */
+/*?
+ * Text: "The dump process started for a 64-bit operating system\n"
+ * Severity: Alert
+ * Description:
+ * The SCSI dump process started to create a dump for a 64-bit operating
+ * system instance.
+ * User action:
+ * None.
+ */
+/*? Text: "0x%x is an unknown architecture.\n" */
+/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */
--- linux-azure-4.13.0.orig/Documentation/kmsg/s390/zfcp
+++ linux-azure-4.13.0/Documentation/kmsg/s390/zfcp
@@ -0,0 +1,709 @@
+/*?
+ * Text: "%s is not a valid SCSI device\n"
+ * Severity: Error
+ * Parameter:
+ * @1: device specification
+ * Description:
+ * The specification for an initial SCSI device provided with the 'zfcp.device'
+ * kernel parameter or with the 'device' module parameter is syntactically
+ * incorrect. The specified SCSI device could not be attached to the Linux
+ * system.
+ * User action:
+ * Correct the value for the 'zfcp.device' or 'device' parameter and reboot
+ * Linux. See "Device Drivers, Features, and Commands" for information about
+ * the syntax.
+ */
+
+/*?
+ * Text: "The zfcp device driver could not register with the common I/O layer\n"
+ * Severity: Error
+ * Description:
+ * The device driver initialization failed. A possible cause of this problem is
+ * memory constraints.
+ * User action:
+ * Free some memory and try again to load the zfcp device driver. If the zfcp
+ * device driver has been compiled into the kernel, reboot Linux. Consider
+ * assigning more memory to your LPAR or z/VM guest virtual machine. If the
+ * problem persists, contact your support organization.
+ */
+
+/*?
+ * Text: "%s: Setting up data structures for the FCP adapter failed\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * Description:
+ * The zfcp device driver could not allocate data structures for an FCP adapter.
+ * A possible reason for this problem is memory constraints.
+ * User action:
+ * Set the FCP adapter offline or detach it from the Linux system, free some
+ * memory and set the FCP adapter online again or attach it again. If this
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The FCP device is operational again\n"
+ * Severity: Informational
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * Description:
+ * An FCP device has been unavailable because it had been detached from the
+ * Linux system or because the corresponding CHPID was offline. The FCP device
+ * is now available again and the zfcp device driver resumes all operations to
+ * the FCP device.
+ * User action:
+ * None.
+ */
+
+/*?
+ * Text: "%s: The CHPID for the FCP device is offline\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * Description:
+ * The CHPID for an FCP device has been set offline, either logically in Linux
+ * or on the hardware.
+ * User action:
+ * Find out which CHPID corresponds to the FCP device, for example, with the
+ * lscss command. Check if the CHPID has been set logically offline in sysfs.
+ * Write 'on' to the CHPID's status attribute to set it online. If the CHPID is
+ * online in sysfs, find out if it has been varied offline through a hardware
+ * management interface, for example the service element (SE).
+ */
+
+/*?
+ * Text: "%s: The FCP device has been detached\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * Description:
+ * An FCP device is no longer available to Linux.
+ * User action:
+ * Ensure that the FCP adapter is operational and attached to the LPAR or z/VM
+ * virtual machine.
+ */
+
+/*?
+ * Text: "%s: The FCP device did not respond within the specified time\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * Description:
+ * The common I/O layer waited for a response from the FCP adapter but
+ * no response was received within the specified time limit. This might
+ * indicate a hardware problem.
+ * User action:
+ * Consult your hardware administrator. If this problem persists,
+ * gather Linux debug data, collect the FCP adapter hardware logs, and
+ * report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Registering the FCP device with the SCSI stack failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter could not be registered with the Linux SCSI + * stack. A possible reason for this problem is memory constraints. + * User action: + * Set the FCP adapter offline or detach it from the Linux system, free some + * memory and set the FCP adapter online again or attach it again. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: ERP cannot recover an error on the FCP device\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * An error occurred on an FCP device. The error recovery procedure (ERP) + * could not resolve the error. The FCP device driver cannot use the FCP device. + * User action: + * Check for previous error messages for the same FCP device to find the + * cause of the problem. + */ + +/*? + * Text: "%s: Creating an ERP thread for the FCP device failed.\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The zfcp device driver could not set up error recovery procedure (ERP) + * processing for the FCP device. The FCP device is not available for use + * in Linux. + * User action: + * Free some memory and try again to load the zfcp device driver. If the zfcp + * device driver has been compiled into the kernel, reboot Linux. Consider + * assigning more memory to your LPAR or z/VM guest virtual machine. If the + * problem persists, contact your support organization. + */ + +/*? + * Text: "%s: ERP failed for LUN 0x%016Lx on port 0x%016Lx\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: LUN + * @3: WWPN + * Description: + * An error occurred on the SCSI device at the specified LUN. The error recovery + * procedure (ERP) could not resolve the error. The SCSI device is not + * available. + * User action: + * Verify that the LUN is correct. Check the fibre channel fabric for errors + * related to the specified WWPN and LUN, the storage server, and Linux. + */ + +/*? + * Text: "%s: ERP failed for remote port 0x%016Lx\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: WWPN + * Description: + * An error occurred on a remote port. The error recovery procedure (ERP) + * could not resolve the error. The port is not available. + * User action: + * Verify that the WWPN is correct and check the fibre channel fabric for + * errors related to the WWPN. + */ + +/*? + * Text: "%s: Registering port 0x%016Lx failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: WWPN + * Description: + * The Linux kernel could not allocate enough memory to register the + * remote port with the indicated WWPN with the SCSI stack. The remote + * port is not available. + * User action: + * Free some memory and trigger the rescan for ports. + */ + +/*? + * Text: "%s: A QDIO problem occurred\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * QDIO reported a problem to the zfcp device driver. The zfcp device driver + * tries to recover this problem. + * User action: + * Check for related error messages. If this problem occurs frequently, gather + * Linux debug data and contact your support organization. + */ + +/*? 
+ * Text: "%s: Setting up the QDIO connection to the FCP adapter failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The zfcp device driver failed to establish a QDIO connection with the FCP + * adapter. + * User action: + * Set the FCP adapter offline or detach it from the Linux system, free some + * memory and set the FCP adapter online again or attach it again. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: The FCP adapter reported a problem that cannot be recovered\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter has a problem that cannot be recovered by the zfcp device + * driver. The zfcp device driver stopped using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: There is a wrap plug instead of a fibre channel cable\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter is not physically connected to the fibre channel fabric. + * User action: + * Remove the wrap plug from the FCP adapter and connect the adapter with the + * fibre channel fabric. + */ + +/*? + * Text: "%s: FCP device not operational because of an unsupported FC class\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter hardware does not support the fibre channel service class + * requested by the zfcp device driver. This problem indicates a program error + * in the zfcp device driver. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: 0x%Lx is an ambiguous request identifier\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: request ID + * Description: + * The FCP adapter reported that it received the same request ID twice. This is + * an error. The zfcp device driver stopped using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: QTCB version 0x%x not supported by FCP adapter (0x%x to 0x%x)\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: requested version + * @3: lowest supported version + * @4: highest supported version + * Description: + * See message text. + * The queue transfer control block (QTCB) version requested by the zfcp device + * driver is not supported by the FCP adapter hardware. + * User action: + * If the requested version is higher than the highest version supported by the + * hardware, install more recent firmware on the FCP adapter. If the requested + * version is lower then the lowest version supported by the hardware, upgrade + * to a Linux level with a more recent zfcp device driver. + */ + +/*? + * Text: "%s: The FCP adapter could not log in to the fibre channel fabric\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The fibre channel switch rejected the login request from the FCP adapter. + * User action: + * Check the fibre channel fabric or switch logs for possible errors. + */ + +/*? 
+ * Text: "%s: The FCP device is suspended because of a firmware update\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP device is not available while a firmware update is in progress. This + * problem is temporary. The FCP device will resume operations when the + * firmware update is completed. + * User action: + * Wait 10 seconds and try the operation again. + */ + +/*? + * Text: "%s: All NPIV ports on the FCP adapter have been assigned\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The number of N_Port ID Virtualization (NPIV) ports that can be assigned + * on an FCP adapter is limited. Once assigned, NPIV ports are not released + * automatically but have to be released explicitly through the support + * element (SE). + * User action: + * Identify NPIV ports that have been assigned but are no longer in use and + * release them from the SE. + */ + +/*? + * Text: "%s: The link between the FCP adapter and the FC fabric is down\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter is not usable. Specific error information is not available. + * User action: + * Check the cabling and the fibre channel fabric configuration. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: The QTCB type is not supported by the FCP adapter\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The queue transfer control block (QTCB) type requested by the zfcp device + * driver is not supported by the FCP adapter hardware. + * User action: + * Install the latest firmware on your FCP adapter hardware. If this does not + * resolve the problem, upgrade to a Linux level with a more recent zfcp device + * driver. If the problem persists, contact your support organization. + */ + +/*? + * Text: "%s: The error threshold for checksum statistics has been exceeded\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter has reported a large number of bit errors. This might + * indicate a problem with the physical components of the fibre channel fabric. + * Details about the errors have been written to the HBA trace for the FCP + * adapter. + * User action: + * Check for problems in the fibre channel fabric and ensure that all cables + * are properly plugged. + */ + +/*? + * Text: "%s: The local link has been restored\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * A problem with the connection between the FCP adapter and the adjacent node + * on the fibre channel fabric has been resolved. The FCP adapter is now + * available again. + * User action: + * None. + */ + +/*? + * Text: "%s: The mode table on the FCP adapter has been damaged\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * This is an FCP adapter hardware problem. + * User action: + * Report this problem with FCP hardware logs to IBM support. + */ + +/*? + * Text: "%s: The adjacent fibre channel node does not support FCP\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The fibre channel switch or storage system that is connected to the FCP + * channel does not support the fibre channel protocol (FCP). 
The zfcp + * device driver stopped using the FCP device. + * User action: + * Check the adjacent fibre channel node. + */ + +/*? + * Text: "%s: The FCP adapter does not recognize the command 0x%x\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: command + * Description: + * A command code that was sent from the zfcp device driver to the FCP adapter + * is not valid. The zfcp device driver stopped using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: There is no light signal from the local fibre channel cable\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * There is no signal on the fibre channel cable that connects the FCP adapter + * to the fibre channel fabric. + * User action: + * Ensure that the cable is in place and connected properly to the FCP adapter + * and to the adjacent fibre channel switch or storage system. + */ + +/*? + * Text: "%s: The WWPN assignment file on the FCP adapter has been damaged\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * This is an FCP adapter hardware problem. + * User action: + * Report this problem with FCP hardware logs to IBM support. + */ + +/*? + * Text: "%s: The FCP device detected a WWPN that is duplicate or not valid\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * This condition indicates an error in the FCP adapter hardware or in the z/VM + * hypervisor. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to IBM support. + */ + +/*? + * Text: "%s: The fibre channel fabric does not support NPIV\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter requires N_Port ID Virtualization (NPIV) from the adjacent + * fibre channel node. Either the FCP adapter is connected to a fibre channel + * switch that does not support NPIV or the FCP adapter tries to use NPIV in a + * point-to-point setup. The connection is not operational. + * User action: + * Verify that NPIV is correctly used for this connection. Check the FCP adapter + * configuration and the fibre channel switch configuration. If necessary, + * update the fibre channel switch firmware. + */ + +/*? + * Text: "%s: The FCP adapter cannot support more NPIV ports\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * N_Port ID Virtualization (NPIV) ports consume physical resources on the FCP + * adapter. The FCP adapter resources are exhausted. The connection is not + * operational. + * User action: + * Analyze the number of available NPIV ports and which operating system + * instances use them. If necessary, reconfigure your setup to move some + * NPIV ports to an FCP adapter with free resources. + */ + +/*? + * Text: "%s: The adjacent switch cannot support more NPIV ports\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * N_Port ID Virtualization (NPIV) ports consume physical resources. The + * resources of the fibre channel switch that is connected to the FCP adapter + * are exhausted. The connection is not operational. + * User action: + * Analyze the number of available NPIV ports on the adjacent fibre channel + * switch and how they are used. 
If necessary, reconfigure your fibre channel + * fabric to accommodate the required NPIV ports. + */ + +/*? + * Text: "%s: 0x%x is not a valid transfer protocol status\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: status information + * Description: + * The transfer protocol status information reported by the FCP adapter is not + * a valid status for the zfcp device driver. The zfcp device driver stopped + * using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: Unknown or unsupported arbitrated loop fibre channel topology detected\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP device is connected to a fibre channel arbitrated loop or the FCP adapter + * reported an unknown fibre channel topology. The zfcp device driver supports + * point-to-point connections and switched fibre channel fabrics but not arbitrated + * loop topologies. The FCP device cannot be used. + * User action: + * Check the fibre channel setup and ensure that only supported topologies are + * connected to the FCP adapter. + */ + +/*? + * Text: "%s: FCP adapter maximum QTCB size (%d bytes) is too small\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: maximum supported size + * @3: requested QTCB size + * Description: + * The queue transfer control block (QTCB) size requested by the zfcp + * device driver is not supported by the FCP adapter hardware. + * User action: + * Update the firmware on your FCP adapter hardware to the latest + * available level and update the Linux kernel to the latest supported + * level. If the problem persists, contact your support organization. + */ + +/*? + * Text: "%s: The FCP adapter only supports newer control block versions\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The protocol supported by the FCP adapter is not compatible with the zfcp + * device driver. + * User action: + * Upgrade your Linux kernel to a level that includes a zfcp device driver + * with support for the control block version required by your FCP adapter. + */ + +/*? + * Text: "%s: The FCP adapter only supports older control block versions\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The protocol supported by the FCP adapter is not compatible with the zfcp + * device driver. + * User action: + * Install the latest firmware on your FCP adapter. + */ + +/*? + * Text: "%s: Not enough FCP adapter resources to open remote port 0x%016Lx\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: WWPN + * Description: + * Each port that is opened consumes physical resources of the FCP adapter to + * which it is attached. These resources are exhausted and the specified port + * cannot be opened. + * User action: + * Reduce the total number of remote ports that are attached to the + * FCP adapter. + */ + +/*? + * Text: "%s: LUN 0x%Lx on port 0x%Lx is already in use by CSS%d, MIF Image ID %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: LUN + * @3: remote port WWPN + * @4: channel subsystem ID + * @5: MIF Image ID of the LPAR + * Description: + * The SCSI device at the indicated LUN is already in use by another system. + * Only one system at a time can use the SCSI device. 
+ * User action:
+ * Ensure that the other system stops using the device before trying to use it.
+ */
+
+/*?
+ * Text: "%s: No handle is available for LUN 0x%016Lx on port 0x%016Lx\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * @2: LUN
+ * @3: WWPN
+ * Description:
+ * The FCP adapter can only open a limited number of SCSI devices. This limit
+ * has been reached and the SCSI device at the indicated LUN cannot be opened.
+ * User action:
+ * For FCP subchannels running in non-NPIV mode, check all SCSI
+ * devices opened through the FCP adapter and close some of them. For
+ * FCP subchannels running in NPIV mode, verify the SAN zoning and
+ * host connections on the storage systems. Ensure that the zoning and
+ * host connections only allow access to the required LUNs. As a
+ * workaround, disable the automatic LUN scanning by setting the
+ * zfcp.allow_lun_scan kernel parameter or the allow_lun_scan module
+ * parameter to 0.
+ */
+
+/*?
+ * Text: "%s: Incorrect direction %d, LUN 0x%016Lx on port 0x%016Lx closed\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * @2: value in direction field
+ * @3: LUN
+ * @4: WWPN
+ * Description:
+ * The direction field in a SCSI request contains an incorrect value. The zfcp
+ * device driver closed down the SCSI device at the indicated LUN.
+ * User action:
+ * Gather Linux debug data and report this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Incorrect CDB length %d, LUN 0x%016Lx on port 0x%016Lx closed\n"
+ * Severity: Error
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * @2: value in length field
+ * @3: LUN
+ * @4: WWPN
+ * Description:
+ * The command descriptor block (CDB) length field in a SCSI request is not
+ * valid or too large for the FCP adapter. The zfcp device driver closed down
+ * the SCSI device at the indicated LUN.
+ * User action:
+ * Gather Linux debug data and report this problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: Opening WKA port 0x%x failed\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * @2: destination ID of the WKA port
+ * Description:
+ * The FCP adapter rejected a request to open the specified
+ * well-known address (WKA) port. No retry is possible.
+ * User action:
+ * Verify the setup and check if the maximum number of remote ports
+ * used through this adapter is below the maximum allowed. If the
+ * problem persists, gather Linux debug data, collect the FCP adapter
+ * hardware logs, and report the problem to your support organization.
+ */
+
+/*?
+ * Text: "%s: The name server reported %d words residual data\n"
+ * Severity: Warning
+ * Parameter:
+ * @1: bus ID of the zfcp device
+ * @2: number of words in residual data
+ * Description:
+ * The fibre channel name server sent too much information about remote ports.
+ * The zfcp device driver did not receive sufficient information to attach all
+ * available remote ports in the SAN.
+ * User action:
+ * Verify that you are running the latest firmware level on the FCP
+ * adapter. Check your SAN setup and consider reducing the number of ports
+ * visible to the FCP adapter by using more restrictive zoning in the SAN.
+ */
+
+/*?
+ * Text: "%s: A port opened with WWPN 0x%016Lx returned data that identifies it as WWPN 0x%016Lx\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: expected WWPN + * @3: reported WWPN + * Description: + * A remote port was opened successfully, but it reported an + * unexpected WWPN in the returned port login (PLOGI) data. This + * condition might have been caused by a change applied to the SAN + * configuration while the port was being opened. + * User action: + * If this condition is only temporary and access to the remote port + * is possible, no action is required. If the condition persists, + * identify the storage system with the specified WWPN and contact the + * support organization of the storage system. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/s390/zpci +++ linux-azure-4.13.0/Documentation/kmsg/s390/zpci @@ -0,0 +1,42 @@ +/*? + * Text: "%s: Event 0x%x reconfigured PCI function 0x%x\n" + * Severity: Informational + * Parameter: + * @1: device name of the function + * @2: PCI event code + * @3: function ID + * Description: + * The availability of a PCI function has changed. + * Possible reasons for the change include PCI configuration actions on the + * Hardware Management Console or hypervisor. + * For shared PCI functions, the function might also have been reserved or + * released by another system. + * If the device name of a function is shown as 'n/a', the device registration + * with the PCI device driver has not completed. + * The function ID identifies the function to the I/O configuration (IOCDS). + * The PCI event code can be useful diagnostic information for your support + * organization. + * User action: + * None. + */ + +/*? + * Text: "%s: Event 0x%x reports an error for PCI function 0x%x\n" + * Severity: Error + * Parameter: + * @1: device name of the function + * @2: PCI event code + * @3: function ID + * Description: + * A PCI function entered an error state from which it cannot recover + * automatically. + * User action: + * Trigger a recovery action by writing '1' to the 'recover' sysfs attribute + * of the PCI function. + * In sysfs, PCI functions are represented as /sys/bus/pci/devices/, + * where is the device name of the function. + * If the device name of a function is shown as 'n/a', the device + * registration with the PCI device driver has not completed. + * If the problem persists, contact your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.13.0.orig/Documentation/kmsg/sbp_target +++ linux-azure-4.13.0/Documentation/kmsg/sbp_target @@ -0,0 +1,49 @@ +/*? Text: "ABORT TASK SET not implemented\n" */ +/*? Text: "ABORT TASK not implemented\n" */ +/*? Text: "Cannot change the directory_id on an active target.\n" */ +/*? Text: "Cannot enable a target with no LUNs!\n" */ +/*? Text: "Could not update Config ROM\n" */ +/*? Text: "Ignoring ORB_POINTER write while active.\n" */ +/*? Text: "LOGICAL UNIT RESET not implemented\n" */ +/*? Text: "Node ACL not found for %s\n" */ +/*? Text: "Only one TPG per Unit is possible.\n" */ +/*? Text: "QUERY LOGINS not implemented\n" */ +/*? Text: "Reconnect timer expired for node: %016llx\n" */ +/*? Text: "SET PASSWORD not implemented\n" */ +/*? Text: "TARGET RESET not implemented\n" */ +/*? Text: "Unable to allocate struct sbp_nacl\n" */ +/*? Text: "Unable to allocate struct sbp_tpg\n" */ +/*? Text: "Unable to allocate struct sbp_tport\n" */ +/*? 
Text: "Waiting for reconnect from node: %016llx\n" */ +/*? Text: "cannot find login: %d\n" */ +/*? Text: "failed to allocate login descriptor\n" */ +/*? Text: "failed to allocate login response block\n" */ +/*? Text: "failed to allocate session descriptor\n" */ +/*? Text: "failed to init se_session\n" */ +/*? Text: "failed to map command block handler: %d\n" */ +/*? Text: "failed to read peer GUID: %d\n" */ +/*? Text: "ignoring management request while busy\n" */ +/*? Text: "ignoring request from foreign node (%x != %x)\n" */ +/*? Text: "ignoring request with wrong generation\n" */ +/*? Text: "initiator already logged-in\n" */ +/*? Text: "login to unknown LUN: %d\n" */ +/*? Text: "logout from different node ID\n" */ +/*? Text: "max number of logins reached\n" */ +/*? Text: "mgt_agent LOGIN to LUN %d from %016llx\n" */ +/*? Text: "mgt_agent LOGOUT from LUN %d session %d\n" */ +/*? Text: "mgt_agent RECONNECT from %016llx\n" */ +/*? Text: "mgt_agent RECONNECT login GUID doesn't match\n" */ +/*? Text: "mgt_agent RECONNECT unknown login ID\n" */ +/*? Text: "mgt_orb bad request\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "refusing exclusive login with other active logins\n" */ +/*? Text: "refusing login while another exclusive login present\n" */ +/*? Text: "sbp_run_transaction: page size ignored\n" */ +/*? Text: "sbp_send_sense: unknown sense format: 0x%x\n" */ +/*? Text: "target_fabric_configfs_init() failed\n" */ +/*? Text: "target_fabric_configfs_register() failed for SBP\n" */ +/*? Text: "unknown management function 0x%x\n" */ +/*? Text: "unlink LUN: failed to update unit directory\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ \ No newline at end of file --- linux-azure-4.13.0.orig/Documentation/kmsg/zram +++ linux-azure-4.13.0/Documentation/kmsg/zram @@ -0,0 +1,34 @@ +/*? Text: "Error allocating compressor buffer space\n" */ +/*? Text: "Error allocating memory for compressed page: %u, size=%zu\n" */ +/*? Text: "Error creating memory pool\n" */ +/*? Text: "num_devices not specified. Using default: 1\n" */ +/*? Text: "Error allocating compressor working memory!\n" */ +/*? Text: "Error allocating zram address table\n" */ +/*? Text: "Unable to get major number\n" */ +/*? Text: "Compression failed! err=%d\n" */ +/*? Text: "Decompression failed! err=%d, page=%u\n" */ +/*? Text: "There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1%% of the size of the disk when not in use so a huge zram is wasteful.\n\tMemory Size: %zu kB\n\tSize you selected: %llu kB\nContinuing anyway ...\n" */ +/*? Text: "disk size not provided. You can use disksize_kb module param to specify size.\nUsing default: (%u%% of RAM).\n" */ +/*? Text: "Error creating sysfs group" */ +/*? Text: "Error allocating memory for incompressible page: %u\n" */ +/*? Text: "Creating %u devices ...\n" */ +/*? Text: "Initialization failed: err=%d\n" */ +/*? Text: "Error allocating disk queue for device %d\n" */ +/*? Text: "Error allocating disk structure for device %d\n" */ +/*? Text: "Invalid value for num_devices: %u\n" */ +/*? Text: "Error allocating temp memory!\n" */ +/*? Text: "Unable to allocate temp memory\n" */ +/*? Text: "Created %u device(s) ...\n" */ +/*? 
Text: "There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1%% of the size of the disk when not in use so a huge zram is wasteful.\n\tMemory Size: %lu kB\n\tSize you selected: %llu kB\nContinuing anyway ...\n" */ +/*? Text: "Cannot change disksize for initialized device\n" */ +/*? Text: "Can't change algorithm for initialized device\n" */ +/*? Text: "Cannot initialise %s compressing backend\n" */ +/*? Text: "Cannot change max compression streams\n" */ +/*? Text: "Destroyed %u device(s)\n" */ +/*? Text: "Created %u device(s)\n" */ +/*? Text: "Unable to register zram-control class\n" */ +/*? Text: "Removed device: %s\n" */ +/*? Text: "Added device: %s\n" */ +/*? Text: "Error creating sysfs group for device %d\n" */ +/*? Text: "Error allocating memory for compressed page: %u, size=%u\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ \ No newline at end of file --- linux-azure-4.13.0.orig/Documentation/networking/netvsc.txt +++ linux-azure-4.13.0/Documentation/networking/netvsc.txt @@ -0,0 +1,63 @@ +Hyper-V network driver +====================== + +Compatibility +============= + +This driver is compatible with Windows Server 2012 R2, 2016 and +Windows 10. + +Features +======== + + Checksum offload + ---------------- + The netvsc driver supports checksum offload as long as the + Hyper-V host version does. Windows Server 2016 and Azure + support checksum offload for TCP and UDP for both IPv4 and + IPv6. Windows Server 2012 only supports checksum offload for TCP. + + Receive Side Scaling + -------------------- + Hyper-V supports receive side scaling. For TCP, packets are + distributed among available queues based on IP address and port + number. Current versions of Hyper-V host, only distribute UDP + packets based on the IP source and destination address. + The port number is not used as part of the hash value for UDP. + Fragmented IP packets are not distributed between queues; + all fragmented packets arrive on the first channel. + + Generic Receive Offload, aka GRO + -------------------------------- + The driver supports GRO and it is enabled by default. GRO coalesces + like packets and significantly reduces CPU usage under heavy Rx + load. + + SR-IOV support + -------------- + Hyper-V supports SR-IOV as a hardware acceleration option. If SR-IOV + is enabled in both the vSwitch and the guest configuration, then the + Virtual Function (VF) device is passed to the guest as a PCI + device. In this case, both a synthetic (netvsc) and VF device are + visible in the guest OS and both NIC's have the same MAC address. + + The VF is enslaved by netvsc device. The netvsc driver will transparently + switch the data path to the VF when it is available and up. + Network state (addresses, firewall, etc) should be applied only to the + netvsc device; the slave device should not be accessed directly in + most cases. The exceptions are if some special queue discipline or + flow direction is desired, these should be applied directly to the + VF slave device. + + Receive Buffer + -------------- + Packets are received into a receive area which is created when device + is probed. The receive area is broken into MTU sized chunks and each may + contain one or more packets. The number of receive sections may be changed + via ethtool Rx ring parameters. + + There is a similar send buffer which is used to aggregate packets for sending. 
--- linux-azure-4.13.0.orig/Documentation/printk-formats.txt
+++ linux-azure-4.13.0/Documentation/printk-formats.txt
@@ -441,6 +441,12 @@
 
 	Passed by reference.
 
+Kernel messages:
+
+	%pj	123456
+
+	For generating the jhash of a string, truncated to six digits
+
 If you add other ``%p`` extensions, please extend lib/test_printf.c
 with one or more test cases, if at all feasible.
--- linux-azure-4.13.0.orig/Documentation/sysctl/kernel.txt
+++ linux-azure-4.13.0/Documentation/sysctl/kernel.txt
@@ -74,6 +74,7 @@
 - reboot-cmd                  [ SPARC only ]
 - rtsig-max
 - rtsig-nr
+- seccomp/                    ==> Documentation/userspace-api/seccomp_filter.rst
 - sem
 - sem_next_id                 [ sysv ipc ]
 - sg-big-buff                 [ generic SCSI device (sg) ]
--- linux-azure-4.13.0.orig/Documentation/userspace-api/seccomp_filter.rst
+++ linux-azure-4.13.0/Documentation/userspace-api/seccomp_filter.rst
@@ -141,6 +141,15 @@
 allow use of ptrace, even of other sandboxed processes, without
 extreme care; ptracers can use this mechanism to escape.)
 
+``SECCOMP_RET_LOG``:
+	Results in the system call being executed after it is logged. This
+	should be used by application developers to learn which syscalls their
+	application needs without having to iterate through multiple test and
+	development cycles to build the list.
+
+	This action will only be logged if "log" is present in the
+	actions_logged sysctl string.
+
 ``SECCOMP_RET_ALLOW``:
 	Results in the system call being executed.
 
@@ -169,7 +178,41 @@
 and a more generic example of a higher level macro interface for BPF
 program generation.
 
+Sysctls
+=======
+
+Seccomp's sysctl files can be found in the ``/proc/sys/kernel/seccomp/``
+directory. Here's a description of each file in that directory:
+
+``actions_avail``:
+	A read-only ordered list of seccomp return values (refer to the
+	``SECCOMP_RET_*`` macros above) in string form. The ordering, from
+	left-to-right, is the least permissive return value to the most
+	permissive return value.
+
+	The list represents the set of seccomp return values supported
+	by the kernel. A userspace program may use this list to
+	determine whether the actions found in ``seccomp.h``, when the
+	program was built, differ from the set of actions actually
+	supported in the current running kernel.
+
+``actions_logged``:
+	A read-write ordered list of seccomp return values (refer to the
+	``SECCOMP_RET_*`` macros above) that are allowed to be logged. Writes
+	to the file do not need to be in ordered form but reads from the file
+	will be ordered in the same way as the actions_avail sysctl.
+
+	It is important to note that the value of ``actions_logged`` does not
+	prevent certain actions from being logged when the audit subsystem is
+	configured to audit a task. If the action is not found in the
+	``actions_logged`` list, the final decision on whether to audit the
+	action for that task is left up to the audit subsystem for all seccomp
+	return values other than ``SECCOMP_RET_ALLOW``.
+
+	The ``allow`` string is not accepted in the ``actions_logged`` sysctl
+	as it is not possible to log ``SECCOMP_RET_ALLOW`` actions. Attempting
+	to write ``allow`` to the sysctl will result in an EINVAL being
+	returned.
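+
+As a usage sketch (the action names shown are illustrative; the exact set
+depends on the kernel version and configuration, so check ``actions_avail``
+on your system first)::
+
+	# List the supported actions, ordered least to most permissive.
+	cat /proc/sys/kernel/seccomp/actions_avail
+
+	# Permit logging only for trap and log actions. Writing "allow"
+	# here would fail with EINVAL.
+	echo "trap log" > /proc/sys/kernel/seccomp/actions_logged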
Adding architecture support =========================== --- linux-azure-4.13.0.orig/Documentation/virtual/kvm/devices/arm-vgic.txt +++ linux-azure-4.13.0/Documentation/virtual/kvm/devices/arm-vgic.txt @@ -83,6 +83,11 @@ Bits for undefined preemption levels are RAZ/WI. + Note that this differs from a CPU's view of the APRs on hardware in which + a GIC without the security extensions exposes group 0 and group 1 active + priorities in separate register groups, whereas we show a combined view + similar to GICv2's GICH_APR. + For historical reasons and to provide ABI compatibility with userspace we export the GICC_PMR register in the format of the GICH_VMCR.VMPriMask field in the lower 5 bits of a word, meaning that userspace must always --- linux-azure-4.13.0.orig/Documentation/watchdog/watchdog-parameters.txt +++ linux-azure-4.13.0/Documentation/watchdog/watchdog-parameters.txt @@ -117,7 +117,7 @@ ------------------------------------------------- iTCO_wdt: heartbeat: Watchdog heartbeat in seconds. - (5<=heartbeat<=74 (TCO v1) or 1226 (TCO v2), default=30) + (2<heartbeat<39 (TCO v1) or 613 (TCO v2), default=30) --- linux-azure-4.13.0.orig/Documentation/x86/intel_rdt_ui.txt +++ linux-azure-4.13.0/Documentation/x86/intel_rdt_ui.txt @@ -6,8 +6,8 @@ Tony Luck Vikas Shivappa -This feature is enabled by the CONFIG_INTEL_RDT_A Kconfig and the -X86 /proc/cpuinfo flag bits "rdt", "cat_l3" and "cdp_l3". +This feature is enabled by the CONFIG_INTEL_RDT Kconfig and the +X86 /proc/cpuinfo flag bits "rdt", "cqm", "cat_l3" and "cdp_l3". To use the feature mount the file system: @@ -17,6 +17,13 @@ "cdp": Enable code/data prioritization in L3 cache allocations. +RDT features are orthogonal. A particular system may support only +monitoring, only control, or both monitoring and control. + +The mount succeeds if either allocation or monitoring is present, but +only those files and directories supported by the system will be created. +For more details on the behavior of the interface during monitoring +and allocation, see the "Resource alloc and monitor groups" section. Info directory -------------- @@ -24,7 +31,12 @@ The 'info' directory contains information about the enabled resources. Each resource has its own subdirectory. The subdirectory names reflect the resource names. -Cache resource(L3/L2) subdirectory contains the following files: + +Each subdirectory contains the following files with respect to +allocation: + +Cache resource(L3/L2) subdirectory contains the following files +related to allocation: "num_closids": The number of CLOSIDs which are valid for this resource. The kernel uses the smallest number of @@ -36,7 +48,15 @@ "min_cbm_bits": The minimum number of consecutive bits which must be set when writing a mask. -Memory bandwitdh(MB) subdirectory contains the following files: +"shareable_bits": Bitmask of resource bits that are shareable with + other executing entities (e.g. I/O). The user can use + this when setting up exclusive cache partitions. Note that + some platforms support devices that have their + own settings for cache use which can override + these bits. + +Memory bandwidth(MB) subdirectory contains the following files +with respect to allocation: "min_bandwidth": The minimum memory bandwidth percentage which user can request. @@ -52,48 +72,152 @@ non-linear. This field is purely informational only. -Resource groups ---------------- -Resource groups are represented as directories in the resctrl file -system. The default group is the root directory. Other groups may be -created as desired by the system administrator using the "mkdir(1)" -command, and removed using "rmdir(1)". - -There are three files associated with each group: - -"tasks": A list of tasks that belongs to this group.
Tasks can be - added to a group by writing the task ID to the "tasks" file - (which will automatically remove them from the previous - group to which they belonged). New tasks created by fork(2) - and clone(2) are added to the same group as their parent. - If a pid is not in any sub partition, it is in root partition - (i.e. default partition). - -"cpus": A bitmask of logical CPUs assigned to this group. Writing - a new mask can add/remove CPUs from this group. Added CPUs - are removed from their previous group. Removed ones are - given to the default (root) group. You cannot remove CPUs - from the default group. - -"cpus_list": One or more CPU ranges of logical CPUs assigned to this - group. Same rules apply like for the "cpus" file. - -"schemata": A list of all the resources available to this group. - Each resource has its own line and format - see below for - details. +If RDT monitoring is available there will be an "L3_MON" directory +with the following files: + +"num_rmids": The number of RMIDs available. This is the + upper bound for how many "CTRL_MON" + "MON" + groups can be created. + +"mon_features": Lists the monitoring events if + monitoring is enabled for the resource. + +"max_threshold_occupancy": + Read/write file provides the largest value (in + bytes) at which a previously used LLC_occupancy + counter can be considered for re-use. -When a task is running the following rules define which resources -are available to it: + +Resource alloc and monitor groups +--------------------------------- + +Resource groups are represented as directories in the resctrl file +system. The default group is the root directory which, immediately +after mounting, owns all the tasks and cpus in the system and can make +full use of all resources. + +On a system with RDT control features additional directories can be +created in the root directory that specify different amounts of each +resource (see "schemata" below). The root and these additional top level +directories are referred to as "CTRL_MON" groups below. + +On a system with RDT monitoring the root directory and other top level +directories contain a directory named "mon_groups" in which additional +directories can be created to monitor subsets of tasks in the CTRL_MON +group that is their ancestor. These are called "MON" groups in the rest +of this document. + +Removing a directory will move all tasks and cpus owned by the group it +represents to the parent. Removing one of the created CTRL_MON groups +will automatically remove all MON groups below it. + +All groups contain the following files: + +"tasks": + Reading this file shows the list of all tasks that belong to + this group. Writing a task id to the file will add a task to the + group. If the group is a CTRL_MON group the task is removed from + whichever previous CTRL_MON group owned the task and also from + any MON group that owned the task. If the group is a MON group, + then the task must already belong to the CTRL_MON parent of this + group. The task is removed from any previous MON group. + + +"cpus": + Reading this file shows a bitmask of the logical CPUs owned by + this group. Writing a mask to this file will add and remove + CPUs to/from this group. As with the tasks file a hierarchy is + maintained where MON groups may only include CPUs owned by the + parent CTRL_MON group. + + +"cpus_list": + Just like "cpus", only using ranges of CPUs instead of bitmasks. 
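As an illustration of the files just described (a sketch: the group name "p0" and PID 1234 are hypothetical, and the width of the "cpus" bitmask depends on the number of CPUs in the system):

    # mkdir /sys/fs/resctrl/p0
    # echo 1234 > /sys/fs/resctrl/p0/tasks
    # echo 2-3 > /sys/fs/resctrl/p0/cpus_list
    # cat /sys/fs/resctrl/p0/cpus
    00c

Writing the PID removes task 1234 from whichever group previously owned it, and CPUs 2-3 are taken from the default (root) group, following the ownership rules above.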
+ + +When control is enabled all CTRL_MON groups will also contain: + +"schemata": + A list of all the resources available to this group. + Each resource has its own line and format - see below for details. + +When monitoring is enabled all MON groups will also contain: + +"mon_data": + This contains a set of files organized by L3 domain and by + RDT event. E.g. on a system with two L3 domains there will + be subdirectories "mon_L3_00" and "mon_L3_01". Each of these + directories has one file per event (e.g. "llc_occupancy", + "mbm_total_bytes", and "mbm_local_bytes"). In a MON group these + files provide a readout of the current value of the event for + all tasks in the group. In CTRL_MON groups these files provide + the sum for all tasks in the CTRL_MON group and all tasks in + MON groups. Please see the example section for more details on usage. + +Resource allocation rules +------------------------- +When a task is running the following rules define which resources are +available to it: 1) If the task is a member of a non-default group, then the schemata -for that group is used. + for that group is used. 2) Else if the task belongs to the default group, but is running on a -CPU that is assigned to some specific group, then the schemata for -the CPU's group is used. + CPU that is assigned to some specific group, then the schemata for the + CPU's group is used. 3) Otherwise the schemata for the default group is used. +Resource monitoring rules +------------------------- +1) If a task is a member of a MON group, or a non-default CTRL_MON group, + then RDT events for the task will be reported in that group. + +2) If a task is a member of the default CTRL_MON group, but is running + on a CPU that is assigned to some specific group, then the RDT events + for the task will be reported in that group. + +3) Otherwise RDT events for the task will be reported in the root level + "mon_data" group. + + +Notes on cache occupancy monitoring and control +----------------------------------------------- +When moving a task from one group to another you should remember that +this only affects *new* cache allocations by the task. E.g. you may have +a task in a monitor group showing 3 MB of cache occupancy. If you move +it to a new group and immediately check the occupancy of the old and new +groups you will likely see that the old group is still showing 3 MB and +the new group zero. When the task accesses locations still in cache from +before the move, the h/w does not update any counters. On a busy system +you will likely see the occupancy in the old group go down as cache lines +are evicted and re-used while the occupancy in the new group rises as +the task accesses memory and loads into the cache are counted based on +membership in the new group. + +The same applies to cache allocation control. Moving a task to a group +with a smaller cache partition will not evict any cache lines. The +process may continue to use them from the old partition. + +Hardware uses a CLOSID (Class of Service ID) and an RMID (Resource Monitoring ID) +to identify a control group and a monitoring group respectively. Each of +the resource groups is mapped to these IDs based on the kind of group. The +number of CLOSIDs and RMIDs is limited by the hardware, and hence the creation +of a "CTRL_MON" directory may fail if we run out of either CLOSIDs or RMIDs, +and creation of a "MON" group may fail if we run out of RMIDs.
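As a sketch of how these limits surface in practice (the directory name and the CLOSID count below are hypothetical and hardware dependent; exhaustion is typically reported as ENOSPC from mkdir):

    # cat /sys/fs/resctrl/info/L3/num_closids
    16
    # mkdir /sys/fs/resctrl/p16
    mkdir: cannot create directory '/sys/fs/resctrl/p16': No space left on device

The limbo-RMID case discussed in the next section can additionally surface as an EBUSY error.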
+ +max_threshold_occupancy - generic concepts +------------------------------------------ + +Note that an RMID, once freed, may not be immediately available for use, as +the RMID is still tagged to the cache lines of its previous user. +Hence such RMIDs are placed on a limbo list and periodically checked until +the cache occupancy has gone down. If at some point the system has a lot of +limbo RMIDs that are not yet ready to be used, the user may see an -EBUSY +during mkdir. + +max_threshold_occupancy is a user configurable value to determine the +occupancy at which an RMID can be freed. Schemata files - general concepts --------------------------------- @@ -143,22 +267,22 @@ sharing a core will result in both threads being throttled to use the low bandwidth. -L3 details (code and data prioritization disabled) -------------------------------------------------- +L3 schemata file details (code and data prioritization disabled) +---------------------------------------------------------------- With CDP disabled the L3 schemata format is: L3:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... -L3 details (CDP enabled via mount option to resctrl) ---------------------------------------------------- +L3 schemata file details (CDP enabled via mount option to resctrl) +------------------------------------------------------------------ When CDP is enabled L3 control is split into two separate resources so you can specify independent masks for code and data like this: L3data:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... L3code:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... -L2 details ---------- +L2 schemata file details +------------------------ L2 cache does not support code and data prioritization, so the schemata format is always: L2:<cache_id0>=<cbm>;<cache_id1>=<cbm>;... @@ -185,6 +309,8 @@ L3DATA:0=fffff;1=fffff;2=3c0;3=fffff L3CODE:0=fffff;1=fffff;2=fffff;3=fffff +Examples for RDT allocation usage: + Example 1 --------- On a two socket machine (one L3 cache per socket) with just four bits @@ -410,3 +536,124 @@ /* code to read and write directory contents */ resctrl_release_lock(fd); } + +Examples for RDT Monitoring along with allocation usage: + +Reading monitored data +---------------------- +Reading an event file (for example: mon_data/mon_L3_00/llc_occupancy) would +show the current snapshot of LLC occupancy of the corresponding MON +group or CTRL_MON group. + + +Example 1 (Monitor CTRL_MON group and subset of tasks in CTRL_MON group) +--------- +On a two socket machine (one L3 cache per socket) with just four bits +for cache bit masks + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl +# mkdir p0 p1 +# echo "L3:0=3;1=c" > /sys/fs/resctrl/p0/schemata +# echo "L3:0=3;1=3" > /sys/fs/resctrl/p1/schemata +# echo 5678 > p1/tasks +# echo 5679 > p1/tasks + +The default resource group is unmodified, so we have access to all parts +of all caches (its schemata file reads "L3:0=f;1=f"). + +Tasks that are under the control of group "p0" may only allocate from the +"lower" 50% on cache ID 0, and the "upper" 50% of cache ID 1. +Tasks in group "p1" use the "lower" 50% of cache on both sockets. + +Create monitor groups and assign a subset of tasks to each monitor group. + +# cd /sys/fs/resctrl/p1/mon_groups +# mkdir m11 m12 +# echo 5678 > m11/tasks +# echo 5679 > m12/tasks + +Fetch data (shown in bytes): + +# cat m11/mon_data/mon_L3_00/llc_occupancy +16234000 +# cat m11/mon_data/mon_L3_01/llc_occupancy +14789000 +# cat m12/mon_data/mon_L3_00/llc_occupancy +16789000 + +The parent ctrl_mon group shows the aggregated data.
+ +# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy +31234000 + +Example 2 (Monitor a task from its creation) +--------- +On a two socket machine (one L3 cache per socket) + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl +# mkdir p0 p1 + +An RMID is allocated to the group once it is created, and hence the task +below is monitored from its creation. + +# echo $$ > /sys/fs/resctrl/p1/tasks +# + +Fetch the data + +# cat /sys/fs/resctrl/p1/mon_data/mon_l3_00/llc_occupancy +31789000 + +Example 3 (Monitor without CAT support or before creating CAT groups) +--------- + +Assume a system like HSW that has only CQM and no CAT support. In this case +resctrl will still mount but cannot create CTRL_MON directories. +But the user can create different MON groups within the root group and is +thereby able to monitor all tasks, including kernel threads. + +This can also be used to profile a job's cache size footprint before being +able to allocate it to different allocation groups. + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl +# mkdir mon_groups/m01 +# mkdir mon_groups/m02 + +# echo 3478 > /sys/fs/resctrl/mon_groups/m01/tasks +# echo 2467 > /sys/fs/resctrl/mon_groups/m02/tasks + +Monitor the groups separately and also get per domain data. From the +output below it is apparent that the tasks are mostly doing work on +domain (socket) 0. + +# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_00/llc_occupancy +31234000 +# cat /sys/fs/resctrl/mon_groups/m01/mon_L3_01/llc_occupancy +34555 +# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_00/llc_occupancy +31234000 +# cat /sys/fs/resctrl/mon_groups/m02/mon_L3_01/llc_occupancy +32789 + + +Example 4 (Monitor real time tasks) +----------------------------------- + +Consider a single socket system with real-time tasks running on cores 4-7 +and non-real-time tasks on the other CPUs. We want to monitor the cache +occupancy of the real-time threads on these cores. + +# mount -t resctrl resctrl /sys/fs/resctrl +# cd /sys/fs/resctrl +# mkdir p1 + +Move the cpus 4-7 over to p1 +# echo f0 > p1/cpus + +View the llc occupancy snapshot + +# cat /sys/fs/resctrl/p1/mon_data/mon_L3_00/llc_occupancy +11234000
Measurements by Mel +Gorman [1] have shown a slowdown of 5-10% for some workloads. + +In contrast, the ORC unwinder has no effect on text size or runtime +performance, because the debuginfo is out of band. So if you disable +frame pointers and enable the ORC unwinder, you get a nice performance +improvement across the board, and still have reliable stack traces. + +Ingo Molnar says: + + "Note that it's not just a performance improvement, but also an + instruction cache locality improvement: 3.2% .text savings almost + directly transform into a similarly sized reduction in cache + footprint. That can transform to even higher speedups for workloads + whose cache locality is borderline." + +Another benefit of ORC compared to frame pointers is that it can +reliably unwind across interrupts and exceptions. Frame pointer based +unwinds can sometimes skip the caller of the interrupted function, if it +was a leaf function or if the interrupt hit before the frame pointer was +saved. + +The main disadvantage of the ORC unwinder compared to frame pointers is +that it needs more memory to store the ORC unwind tables: roughly 2-4MB +depending on the kernel config. + + +ORC vs DWARF +------------ + +ORC debuginfo's advantage over DWARF itself is that it's much simpler. +It gets rid of the complex DWARF CFI state machine and also gets rid of +the tracking of unnecessary registers. This allows the unwinder to be +much simpler, meaning fewer bugs, which is especially important for +mission critical oops code. + +The simpler debuginfo format also enables the unwinder to be much faster +than DWARF, which is important for perf and lockdep. In a basic +performance test by Jiri Slaby [2], the ORC unwinder was about 20x +faster than an out-of-tree DWARF unwinder. (Note: That measurement was +taken before some performance tweaks were added, which doubled +performance, so the speedup over DWARF may be closer to 40x.) + +The ORC data format does have a few downsides compared to DWARF. ORC +unwind tables take up ~50% more RAM (+1.3MB on an x86 defconfig kernel) +than DWARF-based eh_frame tables. + +Another potential downside is that, as GCC evolves, it's conceivable +that the ORC data may end up being *too* simple to describe the state of +the stack for certain optimizations. But IMO this is unlikely because +GCC saves the frame pointer for any unusual stack adjustments it does, +so I suspect we'll really only ever need to keep track of the stack +pointer and the frame pointer between call frames. But even if we do +end up having to track all the registers DWARF tracks, at least we will +still be able to control the format, e.g. no complex state machines. + + +ORC unwind table generation +--------------------------- + +The ORC data is generated by objtool. With the existing compile-time +stack metadata validation feature, objtool already follows all code +paths, and so it already has all the information it needs to be able to +generate ORC data from scratch. So it's an easy step to go from stack +validation to ORC data generation. + +It should be possible to instead generate the ORC data with a simple +tool which converts DWARF to ORC data. However, such a solution would +be incomplete due to the kernel's extensive use of asm, inline asm, and +special sections like exception tables. + +That could be rectified by manually annotating those special code paths +using GNU assembler .cfi annotations in .S files, and homegrown +annotations for inline asm in .c files. 
But asm annotations were tried +in the past and were found to be unmaintainable. They were often +incorrect/incomplete and made the code harder to read and keep updated. +And based on looking at glibc code, annotating inline asm in .c files +might be even worse. + +Objtool still needs a few annotations, but only in code which does +unusual things to the stack like entry code. And even then, far fewer +annotations are needed than what DWARF would need, so they're much more +maintainable than DWARF CFI annotations. + +So the advantages of using objtool to generate ORC data are that it +gives more accurate debuginfo, with very few annotations. It also +insulates the kernel from toolchain bugs which can be very painful to +deal with in the kernel since we often have to work around issues in +older versions of the toolchain for years. + +The downside is that the unwinder now becomes dependent on objtool's +ability to reverse engineer GCC code flow. If GCC optimizations become +too complicated for objtool to follow, the ORC data generation might +stop working or become incomplete. (It's worth noting that livepatch +already has such a dependency on objtool's ability to follow GCC code +flow.) + +If newer versions of GCC come up with some optimizations which break +objtool, we may need to revisit the current implementation. Some +possible solutions would be asking GCC to make the optimizations more +palatable, or having objtool use DWARF as an additional input, or +creating a GCC plugin to assist objtool with its analysis. But for now, +objtool follows GCC code quite well. + + +Unwinder implementation details +------------------------------- + +Objtool generates the ORC data by integrating with the compile-time +stack metadata validation feature, which is described in detail in +tools/objtool/Documentation/stack-validation.txt. After analyzing all +the code paths of a .o file, it creates an array of orc_entry structs, +and a parallel array of instruction addresses associated with those +structs, and writes them to the .orc_unwind and .orc_unwind_ip sections +respectively. + +The ORC data is split into the two arrays for performance reasons, to +make the searchable part of the data (.orc_unwind_ip) more compact. The +arrays are sorted in parallel at boot time. + +Performance is further improved by the use of a fast lookup table which +is created at runtime. The fast lookup table associates a given address +with a range of indices for the .orc_unwind table, so that only a small +subset of the table needs to be searched. + + +Etymology +--------- + +Orcs, fearsome creatures of medieval folklore, are the Dwarves' natural +enemies. Similarly, the ORC unwinder was created in opposition to the +complexity and slowness of DWARF. + +"Although Orcs rarely consider multiple solutions to a problem, they do +excel at getting things done because they are creatures of action, not +thought." [3] Similarly, unlike the esoteric DWARF unwinder, the +veracious ORC unwinder wastes no time or siloconic effort decoding +variable-length zero-extended unsigned-integer byte-coded +state-machine-based debug information entries. + +Similar to how Orcs frequently unravel the well-intentioned plans of +their adversaries, the ORC unwinder frequently unravels stacks with +brutal, unyielding efficiency. + +ORC stands for Oops Rewind Capability.
+ + +[1] https://lkml.kernel.org/r/20170602104048.jkkzssljsompjdwy@suse.de +[2] https://lkml.kernel.org/r/d2ca5435-6386-29b8-db87-7f227c2b713a@suse.cz +[3] http://dustin.wikidot.com/half-orcs-and-orcs --- linux-azure-4.13.0.orig/Documentation/x86/pti.txt +++ linux-azure-4.13.0/Documentation/x86/pti.txt @@ -0,0 +1,186 @@ +Overview +======== + +Page Table Isolation (pti, previously known as KAISER[1]) is a +countermeasure against attacks on the shared user/kernel address +space such as the "Meltdown" approach[2]. + +To mitigate this class of attacks, we create an independent set of +page tables for use only when running userspace applications. When +the kernel is entered via syscalls, interrupts or exceptions, the +page tables are switched to the full "kernel" copy. When the system +switches back to user mode, the user copy is used again. + +The userspace page tables contain only a minimal amount of kernel +data: only what is needed to enter/exit the kernel such as the +entry/exit functions themselves and the interrupt descriptor table +(IDT). There are a few strictly unnecessary things that get mapped +such as the first C function when entering an interrupt (see +comments in pti.c). + +This approach helps to ensure that side-channel attacks leveraging +the paging structures do not function when PTI is enabled. It can be +enabled by setting CONFIG_PAGE_TABLE_ISOLATION=y at compile time. +Once enabled at compile-time, it can be disabled at boot with the +'nopti' or 'pti=' kernel parameters (see kernel-parameters.txt). + +Page Table Management +===================== + +When PTI is enabled, the kernel manages two sets of page tables. +The first set is very similar to the single set which is present in +kernels without PTI. This includes a complete mapping of userspace +that the kernel can use for things like copy_to_user(). + +Although _complete_, the user portion of the kernel page tables is +crippled by setting the NX bit in the top level. This ensures +that any missed kernel->user CR3 switch will immediately crash +userspace upon executing its first instruction. + +The userspace page tables map only the kernel data needed to enter +and exit the kernel. This data is entirely contained in the 'struct +cpu_entry_area' structure which is placed in the fixmap which gives +each CPU's copy of the area a compile-time-fixed virtual address. + +For new userspace mappings, the kernel makes the entries in its +page tables like normal. The only difference is when the kernel +makes entries in the top (PGD) level. In addition to setting the +entry in the main kernel PGD, a copy of the entry is made in the +userspace page tables' PGD. + +This sharing at the PGD level also inherently shares all the lower +layers of the page tables. This leaves a single, shared set of +userspace page tables to manage. One PTE to lock, one set of +accessed bits, dirty bits, etc... + +Overhead +======== + +Protection against side-channel attacks is important. But, +this protection comes at a cost: + +1. Increased Memory Use + a. Each process now needs an order-1 PGD instead of order-0. + (Consumes an additional 4k per process). + b. The 'cpu_entry_area' structure must be 2MB in size and 2MB + aligned so that it can be mapped by setting a single PMD + entry. This consumes nearly 2MB of RAM once the kernel + is decompressed, but no space in the kernel image itself. + +2. Runtime Cost + a. 
CR3 manipulation to switch between the page table copies + must be done at interrupt, syscall, and exception entry + and exit (it can be skipped when the kernel is interrupted, + though.) Moves to CR3 are on the order of a hundred + cycles, and are required at every entry and exit. + b. A "trampoline" must be used for SYSCALL entry. This + trampoline depends on a smaller set of resources than the + non-PTI SYSCALL entry code, so requires mapping fewer + things into the userspace page tables. The downside is + that stacks must be switched at entry time. + c. Global pages are disabled for all kernel structures not + mapped into both kernel and userspace page tables. This + feature of the MMU allows different processes to share TLB + entries mapping the kernel. Losing the feature means more + TLB misses after a context switch. The actual loss of + performance is very small, however, never exceeding 1%. + d. Process Context IDentifiers (PCID) is a CPU feature that + allows us to skip flushing the entire TLB when switching page + tables by setting a special bit in CR3 when the page tables + are changed. This makes switching the page tables (at context + switch, or kernel entry/exit) cheaper. But, on systems with + PCID support, the context switch code must flush both the user + and kernel entries out of the TLB. The user PCID TLB flush is + deferred until the exit to userspace, minimizing the cost. + See intel.com/sdm for the gory PCID/INVPCID details. + e. The userspace page tables must be populated for each new + process. Even without PTI, the shared kernel mappings + are created by copying top-level (PGD) entries into each + new process. But, with PTI, there are now *two* kernel + mappings: one in the kernel page tables that maps everything + and one for the entry/exit structures. At fork(), we need to + copy both. + f. In addition to the fork()-time copying, there must also + be an update to the userspace PGD any time a set_pgd() is done + on a PGD used to map userspace. This ensures that the kernel + and userspace copies always map the same userspace + memory. + g. On systems without PCID support, each CR3 write flushes + the entire TLB. That means that each syscall, interrupt + or exception flushes the TLB. + h. INVPCID is a TLB-flushing instruction which allows flushing + of TLB entries for non-current PCIDs. Some systems support + PCIDs, but do not support INVPCID. On these systems, addresses + can only be flushed from the TLB for the current PCID. When + flushing a kernel address, we need to flush all PCIDs, so a + single kernel address flush will require a TLB-flushing CR3 + write upon the next use of every PCID. + +Possible Future Work +==================== +1. We can be more careful about not actually writing to CR3 + unless its value is actually changed. +2. Allow PTI to be enabled/disabled at runtime in addition to the + boot-time switching. + +Testing +======== + +To test stability of PTI, the following test procedure is recommended, +ideally doing all of these in parallel: + +1. Set CONFIG_DEBUG_ENTRY=y +2. Run several copies of all of the tools/testing/selftests/x86/ tests + (excluding MPX and protection_keys) in a loop on multiple CPUs for + several minutes. These tests frequently uncover corner cases in the + kernel entry code. In general, old kernels might cause these tests + themselves to crash, but they should never crash the kernel. +3.
Run the 'perf' tool in a mode (top or record) that generates many + frequent performance monitoring non-maskable interrupts (see "NMI" + in /proc/interrupts). This exercises the NMI entry/exit code which + is known to trigger bugs in code paths that did not expect to be + interrupted, including nested NMIs. Using "-c" boosts the rate of + NMIs, and using two -c with separate counters encourages nested NMIs + and less deterministic behavior. + + while true; do perf record -c 10000 -e instructions,cycles -a sleep 10; done + +4. Launch a KVM virtual machine. +5. Run 32-bit binaries on systems supporting the SYSCALL instruction. + This has been a lightly-tested code path and needs extra scrutiny. + +Debugging +========= + +Bugs in PTI cause a few different signatures of crashes +that are worth noting here. + + * Failures of the selftests/x86 code. Usually a bug in one of the + more obscure corners of entry_64.S + * Crashes in early boot, especially around CPU bringup. Bugs + in the trampoline code or mappings cause these. + * Crashes at the first interrupt. Caused by bugs in entry_64.S, + like screwing up a page table switch. Also caused by + incorrectly mapping the IRQ handler entry code. + * Crashes at the first NMI. The NMI code is separate from main + interrupt handlers and can have bugs that do not affect + normal interrupts. Also caused by incorrectly mapping NMI + code. NMIs that interrupt the entry code must be very + careful and can be the cause of crashes that show up when + running perf. + * Kernel crashes at the first exit to userspace. entry_64.S + bugs, or failing to map some of the exit code. + * Crashes at first interrupt that interrupts userspace. The paths + in entry_64.S that return to userspace are sometimes separate + from the ones that return to the kernel. + * Double faults: overflowing the kernel stack because of page + faults upon page faults. Caused by touching non-pti-mapped + data in the entry code, or forgetting to switch to kernel + CR3 before calling into C functions which are not pti-mapped. + * Userspace segfaults early in boot, sometimes manifesting + as mount(8) failing to mount the rootfs. These have + tended to be TLB invalidation issues. Usually invalidating + the wrong PCID, or otherwise missing an invalidation. + +1. https://gruss.cc/files/kaiser.pdf +2. https://meltdownattack.com/meltdown.pdf --- linux-azure-4.13.0.orig/Documentation/x86/x86_64/mm.txt +++ linux-azure-4.13.0/Documentation/x86/x86_64/mm.txt @@ -1,6 +1,4 @@ - - Virtual memory map with 4 level page tables: 0000000000000000 - 00007fffffffffff (=47 bits) user space, different per mm @@ -14,13 +12,17 @@ ... unused hole ... ffffec0000000000 - fffffbffffffffff (=44 bits) kasan shadow memory (16TB) ... unused hole ... + vaddr_end for KASLR +fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping +fffffe8000000000 - fffffeffffffffff (=39 bits) LDT remap for PTI ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ... unused hole ... 
ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space (variable) -ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls +ffffffffa0000000 - [fixmap start] (~1526 MB) module mapping space (variable) +[fixmap start] - ffffffffff5fffff kernel-internal fixmap range +ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole Virtual memory map with 5 level page tables: @@ -29,26 +31,31 @@ hole caused by [56:63] sign extension ff00000000000000 - ff0fffffffffffff (=52 bits) guard hole, reserved for hypervisor ff10000000000000 - ff8fffffffffffff (=55 bits) direct mapping of all phys. memory -ff90000000000000 - ff91ffffffffffff (=49 bits) hole -ff92000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space +ff90000000000000 - ff9fffffffffffff (=52 bits) LDT remap for PTI +ffa0000000000000 - ffd1ffffffffffff (=54 bits) vmalloc/ioremap space (12800 TB) ffd2000000000000 - ffd3ffffffffffff (=49 bits) hole ffd4000000000000 - ffd5ffffffffffff (=49 bits) virtual memory map (512TB) ... unused hole ... -ffd8000000000000 - fff7ffffffffffff (=53 bits) kasan shadow memory (8PB) +ffdf000000000000 - fffffc0000000000 (=53 bits) kasan shadow memory (8PB) +... unused hole ... + vaddr_end for KASLR +fffffe0000000000 - fffffe7fffffffff (=39 bits) cpu_entry_area mapping ... unused hole ... ffffff0000000000 - ffffff7fffffffff (=39 bits) %esp fixup stacks ... unused hole ... ffffffef00000000 - fffffffeffffffff (=64 GB) EFI region mapping space ... unused hole ... ffffffff80000000 - ffffffff9fffffff (=512 MB) kernel text mapping, from phys 0 -ffffffffa0000000 - ffffffffff5fffff (=1526 MB) module mapping space -ffffffffff600000 - ffffffffffdfffff (=8 MB) vsyscalls +ffffffffa0000000 - fffffffffeffffff (1520 MB) module mapping space +[fixmap start] - ffffffffff5fffff kernel-internal fixmap range +ffffffffff600000 - ffffffffff600fff (=4 kB) legacy vsyscall ABI ffffffffffe00000 - ffffffffffffffff (=2 MB) unused hole Architecture defines a 64-bit virtual address. Implementations can support less. Currently supported are 48- and 57-bit virtual addresses. Bits 63 -through to the most-significant implemented bit are set to either all ones -or all zero. This causes hole between user space and kernel addresses. +through to the most-significant implemented bit are sign extended. +This causes hole between user space and kernel addresses if you interpret them +as unsigned. The direct mapping covers all memory in the system up to the highest memory address (this means in some cases it can also include PCI memory @@ -58,19 +65,15 @@ the processes using the page fault handler, with init_top_pgt as reference. -Current X86-64 implementations support up to 46 bits of address space (64 TB), -which is our current limit. This expands into MBZ space in the page tables. - We map EFI runtime services in the 'efi_pgd' PGD in a 64Gb large virtual memory window (this size is arbitrary, it can be raised later if needed). The mappings are not part of any other kernel PGD and are only available during EFI runtime calls. -The module mapping space size changes based on the CONFIG requirements for the -following fixmap section. - Note that if CONFIG_RANDOMIZE_MEMORY is enabled, the direct mapping of all physical memory, vmalloc/ioremap space and virtual memory map are randomized. Their order is preserved but their base will be offset early at boot time. --Andi Kleen, Jul 2004 +Be very careful vs. 
KASLR when changing anything here. The KASLR address +range must not overlap with anything except the KASAN shadow area, which is +correct as KASAN disables KASLR. --- linux-azure-4.13.0.orig/MAINTAINERS +++ linux-azure-4.13.0/MAINTAINERS @@ -2392,6 +2392,19 @@ F: include/uapi/linux/audit.h F: kernel/audit* +AUFS (advanced multi layered unification filesystem) FILESYSTEM +M: "J. R. Okajima" +L: linux-unionfs@vger.kernel.org +L: aufs-users@lists.sourceforge.net (members only) +W: http://aufs.sourceforge.net +T: git://github.com/sfjro/aufs4-linux.git +S: Supported +F: Documentation/filesystems/aufs/ +F: Documentation/ABI/testing/debugfs-aufs +F: Documentation/ABI/testing/sysfs-aufs +F: fs/aufs/ +F: include/uapi/linux/aufs_type.h + AUXILIARY DISPLAY DRIVERS M: Miguel Ojeda Sandonis W: http://miguelojeda.es/auxdisplay.htm @@ -6138,6 +6151,14 @@ F: net/802/hippi.c F: drivers/net/hippi/ +HISILICON LPC BUS DRIVER +M: Zhichang Yuan +L: linux-arm-kernel@lists.infradead.org +W: http://www.hisilicon.com +S: Maintained +F: drivers/bus/hisi_lpc.c +F: Documentation/devicetree/bindings/arm/hisilicon/hisilicon-low-pin-count.txt + HISILICON NETWORK SUBSYSTEM DRIVER M: Yisen Zhuang M: Salil Mehta @@ -6257,7 +6278,9 @@ M: Stephen Hemminger L: devel@linuxdriverproject.org S: Maintained +F: Documentation/networking/netvsc.txt F: arch/x86/include/asm/mshyperv.h +F: arch/x86/include/asm/trace/hyperv.h F: arch/x86/include/uapi/asm/hyperv.h F: arch/x86/kernel/cpu/mshyperv.c F: arch/x86/hyperv @@ -6269,6 +6292,7 @@ F: drivers/scsi/storvsc_drv.c F: drivers/uio/uio_hv_generic.c F: drivers/video/fbdev/hyperv_fb.c +F: net/vmw_vsock/hyperv_transport.c F: include/linux/hyperv.h F: tools/hv/ F: Documentation/ABI/stable/sysfs-bus-vmbus @@ -11110,7 +11134,7 @@ L: linux-kernel@vger.kernel.org S: Supported F: arch/x86/kernel/cpu/intel_rdt* -F: arch/x86/include/asm/intel_rdt* +F: arch/x86/include/asm/intel_rdt_sched.h F: Documentation/x86/intel_rdt* READ-COPY UPDATE (RCU) --- linux-azure-4.13.0.orig/Makefile +++ linux-azure-4.13.0/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 13 -SUBLEVEL = 0 +SUBLEVEL = 16 EXTRAVERSION = NAME = Fearless Coyote @@ -175,6 +175,20 @@ KBUILD_CHECKSRC = 0 endif +# Call message checker as part of the C compilation +# +# Use 'make D=1' to enable checking +# Use 'make D=2' to create the message catalog + +ifdef D + ifeq ("$(origin D)", "command line") + KBUILD_KMSG_CHECK = $(D) + endif +endif +ifndef KBUILD_KMSG_CHECK + KBUILD_KMSG_CHECK = 0 +endif + # Use make M=dir to specify directory of external module to build # Old syntax make ... SUBDIRS=$PWD is still supported # Setting the environment variable KBUILD_EXTMOD take precedence @@ -365,6 +379,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ -Wbitwise -Wno-return-void $(CF) +KMSG_CHECK = $(srctree)/scripts/kmsg-doc NOSTDINC_FLAGS = CFLAGS_MODULE = AFLAGS_MODULE = @@ -375,6 +390,12 @@ CFLAGS_GCOV := -fprofile-arcs -ftest-coverage -fno-tree-loop-im $(call cc-disable-warning,maybe-uninitialized,) CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) +# Prefer linux-backports-modules +ifneq ($(KBUILD_SRC),) +ifneq ($(shell if test -e $(KBUILD_OUTPUT)/ubuntu-build; then echo yes; fi),yes) +UBUNTUINCLUDE := -I/usr/src/linux-headers-lbm-$(KERNELRELEASE) +endif +endif # Use USERINCLUDE when you must reference the UAPI directories only. USERINCLUDE := \ @@ -387,12 +408,16 @@ # Use LINUXINCLUDE when you must reference the include/ directory. 
# Needed to be compatible with the O= option LINUXINCLUDE := \ + $(UBUNTUINCLUDE) \ -I$(srctree)/arch/$(hdr-arch)/include \ -I$(objtree)/arch/$(hdr-arch)/include/generated \ $(if $(KBUILD_SRC), -I$(srctree)/include) \ -I$(objtree)/include \ $(USERINCLUDE) +# UBUNTU: Include our third party driver stuff too +LINUXINCLUDE += -Iubuntu/include $(if $(KBUILD_SRC),-I$(srctree)/ubuntu/include) + KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ @@ -422,6 +447,7 @@ export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE +export KBUILD_KMSG_CHECK KMSG_CHECK export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS @@ -562,7 +588,7 @@ # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ firmware/ +drivers-y := drivers/ sound/ firmware/ ubuntu/ net-y := net/ libs-y := lib/ core-y := usr/ @@ -621,6 +647,12 @@ # Defaults to vmlinux, but the arch makefile usually adds further targets all: vmlinux +# force no-pie for distro compilers that enable pie by default +KBUILD_CFLAGS += $(call cc-option, -fno-pie) +KBUILD_CFLAGS += $(call cc-option, -no-pie) +KBUILD_AFLAGS += $(call cc-option, -fno-pie) +KBUILD_CPPFLAGS += $(call cc-option, -fno-pie) + # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default # values of the respective KBUILD_* variables ARCH_CPPFLAGS := @@ -933,7 +965,11 @@ ifeq ($(has_libelf),1) objtool_target := tools/objtool FORCE else - $(warning "Cannot use CONFIG_STACK_VALIDATION, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + ifdef CONFIG_UNWINDER_ORC + $(error "Cannot generate ORC metadata for CONFIG_UNWINDER_ORC=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + else + $(warning "Cannot use CONFIG_STACK_VALIDATION=y, please install libelf-dev, libelf-devel or elfutils-libelf-devel") + endif SKIP_STACK_VALIDATION := 1 export SKIP_STACK_VALIDATION endif @@ -1167,6 +1203,7 @@ $(error Headers not exportable for the $(SRCARCH) architecture)) $(Q)$(MAKE) $(hdr-inst)=include/uapi dst=include $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi $(hdr-dst) + $(Q)$(MAKE) $(hdr-inst)=ubuntu/include dst=include oldheaders= PHONY += headers_check_all headers_check_all: headers_install_all @@ -1176,6 +1213,7 @@ headers_check: headers_install $(Q)$(MAKE) $(hdr-inst)=include/uapi dst=include HDRCHECK=1 $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi $(hdr-dst) HDRCHECK=1 + $(Q)$(MAKE) $(hdr-inst)=ubuntu/include dst=include oldheaders= HDRCHECK=1 # --------------------------------------------------------------------------- # Kernel selftest --- linux-azure-4.13.0.orig/arch/Kconfig +++ linux-azure-4.13.0/arch/Kconfig @@ -928,9 +928,6 @@ and non-text memory will be made non-executable. This provides protection against certain security exploits (e.g. 
writing to text) -config ARCH_WANT_RELAX_ORDER - bool - config REFCOUNT_FULL bool "Perform full reference count validation at the expense of speed" help --- linux-azure-4.13.0.orig/arch/alpha/include/asm/futex.h +++ linux-azure-4.13.0/arch/alpha/include/asm/futex.h @@ -25,18 +25,10 @@ : "r" (uaddr), "r"(oparg) \ : "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -62,17 +54,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/arc/include/asm/futex.h +++ linux-azure-4.13.0/arch/arc/include/asm/futex.h @@ -73,20 +73,11 @@ #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; - #ifndef CONFIG_ARC_HAS_LLSC preempt_disable(); /* to guarantee atomic r-m-w of futex op */ #endif @@ -118,30 +109,9 @@ preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/arc/kernel/entry.S +++ linux-azure-4.13.0/arch/arc/kernel/entry.S @@ -92,6 +92,12 @@ lr r0, [efa] mov r1, sp + ; hardware auto-disables MMU, re-enable it to allow kernel vaddr + ; access for say stack unwinding of modules for crash dumps + lr r3, [ARC_REG_PID] + or r3, r3, MMU_ENABLE + sr r3, [ARC_REG_PID] + lsr r3, r2, 8 bmsk r3, r3, 7 brne r3, ECR_C_MCHK_DUP_TLB, 1f --- linux-azure-4.13.0.orig/arch/arc/mm/tlb.c +++ linux-azure-4.13.0/arch/arc/mm/tlb.c @@ -908,9 +908,6 @@ local_irq_save(flags); - /* re-enable the MMU */ - write_aux_reg(ARC_REG_PID, MMU_ENABLE | read_aux_reg(ARC_REG_PID)); - /* loop thru all sets of TLB */ for (set = 0; set < mmu->sets; set++) { --- linux-azure-4.13.0.orig/arch/arm/Kconfig +++ linux-azure-4.13.0/arch/arm/Kconfig @@ -2200,6 +2200,7 @@ source "drivers/Kconfig" source "drivers/firmware/Kconfig" +source "ubuntu/Kconfig" source "fs/Kconfig" --- 
linux-azure-4.13.0.orig/arch/arm/boot/dts/armada-375.dtsi +++ linux-azure-4.13.0/arch/arm/boot/dts/armada-375.dtsi @@ -178,9 +178,9 @@ reg = <0x8000 0x1000>; cache-unified; cache-level = <2>; - arm,double-linefill-incr = <1>; + arm,double-linefill-incr = <0>; arm,double-linefill-wrap = <0>; - arm,double-linefill = <1>; + arm,double-linefill = <0>; prefetch-data = <1>; }; --- linux-azure-4.13.0.orig/arch/arm/boot/dts/armada-38x.dtsi +++ linux-azure-4.13.0/arch/arm/boot/dts/armada-38x.dtsi @@ -143,9 +143,9 @@ reg = <0x8000 0x1000>; cache-unified; cache-level = <2>; - arm,double-linefill-incr = <1>; + arm,double-linefill-incr = <0>; arm,double-linefill-wrap = <0>; - arm,double-linefill = <1>; + arm,double-linefill = <0>; prefetch-data = <1>; }; --- linux-azure-4.13.0.orig/arch/arm/boot/dts/armada-39x.dtsi +++ linux-azure-4.13.0/arch/arm/boot/dts/armada-39x.dtsi @@ -111,9 +111,9 @@ reg = <0x8000 0x1000>; cache-unified; cache-level = <2>; - arm,double-linefill-incr = <1>; + arm,double-linefill-incr = <0>; arm,double-linefill-wrap = <0>; - arm,double-linefill = <1>; + arm,double-linefill = <0>; prefetch-data = <1>; }; --- linux-azure-4.13.0.orig/arch/arm/boot/dts/sun6i-a31.dtsi +++ linux-azure-4.13.0/arch/arm/boot/dts/sun6i-a31.dtsi @@ -311,8 +311,8 @@ #size-cells = <0>; reg = <0>; - tcon1_in_drc1: endpoint@0 { - reg = <0>; + tcon1_in_drc1: endpoint@1 { + reg = <1>; remote-endpoint = <&drc1_out_tcon1>; }; }; @@ -1012,8 +1012,8 @@ #size-cells = <0>; reg = <1>; - be1_out_drc1: endpoint@0 { - reg = <0>; + be1_out_drc1: endpoint@1 { + reg = <1>; remote-endpoint = <&drc1_in_be1>; }; }; @@ -1042,8 +1042,8 @@ #size-cells = <0>; reg = <0>; - drc1_in_be1: endpoint@0 { - reg = <0>; + drc1_in_be1: endpoint@1 { + reg = <1>; remote-endpoint = <&be1_out_drc1>; }; }; @@ -1053,8 +1053,8 @@ #size-cells = <0>; reg = <1>; - drc1_out_tcon1: endpoint@0 { - reg = <0>; + drc1_out_tcon1: endpoint@1 { + reg = <1>; remote-endpoint = <&tcon1_in_drc1>; }; }; --- linux-azure-4.13.0.orig/arch/arm/include/asm/Kbuild +++ linux-azure-4.13.0/arch/arm/include/asm/Kbuild @@ -20,7 +20,6 @@ generic-y += sizes.h generic-y += timex.h generic-y += trace_clock.h -generic-y += unaligned.h generated-y += mach-types.h generated-y += unistd-nr.h --- linux-azure-4.13.0.orig/arch/arm/include/asm/futex.h +++ linux-azure-4.13.0/arch/arm/include/asm/futex.h @@ -128,20 +128,10 @@ #endif /* !SMP */ static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - #ifndef CONFIG_SMP preempt_disable(); #endif @@ -172,17 +162,9 @@ preempt_enable(); #endif - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/arm/include/asm/kvm_arm.h +++ linux-azure-4.13.0/arch/arm/include/asm/kvm_arm.h @@ -227,7 +227,6 @@ #define 
HSR_DABT_S1PTW (_AC(1, UL) << 7) #define HSR_DABT_CM (_AC(1, UL) << 8) -#define HSR_DABT_EA (_AC(1, UL) << 9) #define kvm_arm_exception_type \ {0, "RESET" }, \ --- linux-azure-4.13.0.orig/arch/arm/include/asm/kvm_emulate.h +++ linux-azure-4.13.0/arch/arm/include/asm/kvm_emulate.h @@ -149,11 +149,6 @@ return (kvm_vcpu_get_hsr(vcpu) & HSR_SRT_MASK) >> HSR_SRT_SHIFT; } -static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) -{ - return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_EA; -} - static inline bool kvm_vcpu_dabt_iss1tw(struct kvm_vcpu *vcpu) { return kvm_vcpu_get_hsr(vcpu) & HSR_DABT_S1PTW; @@ -206,6 +201,25 @@ return kvm_vcpu_get_hsr(vcpu) & HSR_FSC_TYPE; } +static inline bool kvm_vcpu_dabt_isextabt(struct kvm_vcpu *vcpu) +{ + switch (kvm_vcpu_trap_get_fault_type(vcpu)) { + case FSC_SEA: + case FSC_SEA_TTW0: + case FSC_SEA_TTW1: + case FSC_SEA_TTW2: + case FSC_SEA_TTW3: + case FSC_SECC: + case FSC_SECC_TTW0: + case FSC_SECC_TTW1: + case FSC_SECC_TTW2: + case FSC_SECC_TTW3: + return true; + default: + return false; + } +} + static inline u32 kvm_vcpu_hvc_get_imm(struct kvm_vcpu *vcpu) { return kvm_vcpu_get_hsr(vcpu) & HSR_HVC_IMM_MASK; --- linux-azure-4.13.0.orig/arch/arm/include/asm/kvm_host.h +++ linux-azure-4.13.0/arch/arm/include/asm/kvm_host.h @@ -293,4 +293,10 @@ int kvm_arm_vcpu_arch_has_attr(struct kvm_vcpu *vcpu, struct kvm_device_attr *attr); +static inline bool kvm_arm_harden_branch_predictor(void) +{ + /* No way to detect it yet, pretend it is not there. */ + return false; +} + #endif /* __ARM_KVM_HOST_H__ */ --- linux-azure-4.13.0.orig/arch/arm/include/asm/kvm_mmu.h +++ linux-azure-4.13.0/arch/arm/include/asm/kvm_mmu.h @@ -221,6 +221,16 @@ return 8; } +static inline void *kvm_get_hyp_vector(void) +{ + return kvm_ksym_ref(__kvm_hyp_vector); +} + +static inline int kvm_map_vectors(void) +{ + return 0; +} + #endif /* !__ASSEMBLY__ */ #endif /* __ARM_KVM_MMU_H__ */ --- linux-azure-4.13.0.orig/arch/arm/include/asm/traps.h +++ linux-azure-4.13.0/arch/arm/include/asm/traps.h @@ -18,7 +18,6 @@ void register_undef_hook(struct undef_hook *hook); void unregister_undef_hook(struct undef_hook *hook); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { extern char __irqentry_text_start[]; @@ -27,12 +26,6 @@ return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { --- linux-azure-4.13.0.orig/arch/arm/include/asm/unaligned.h +++ linux-azure-4.13.0/arch/arm/include/asm/unaligned.h @@ -0,0 +1,27 @@ +#ifndef __ASM_ARM_UNALIGNED_H +#define __ASM_ARM_UNALIGNED_H + +/* + * We generally want to set CONFIG_HAVE_EFFICIENT_UNALIGNED_ACCESS on ARMv6+, + * but we don't want to use linux/unaligned/access_ok.h since that can lead + * to traps on unaligned stm/ldm or strd/ldrd. 
+ */ +#include + +#if defined(__LITTLE_ENDIAN) +# include +# include +# include +# define get_unaligned __get_unaligned_le +# define put_unaligned __put_unaligned_le +#elif defined(__BIG_ENDIAN) +# include +# include +# include +# define get_unaligned __get_unaligned_be +# define put_unaligned __put_unaligned_be +#else +# error need to define endianess +#endif + +#endif /* __ASM_ARM_UNALIGNED_H */ --- linux-azure-4.13.0.orig/arch/arm/kernel/entry-common.S +++ linux-azure-4.13.0/arch/arm/kernel/entry-common.S @@ -12,6 +12,7 @@ #include #include #include +#include #ifdef CONFIG_AEABI #include #endif @@ -40,10 +41,14 @@ UNWIND(.fnstart ) UNWIND(.cantunwind ) disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK bne fast_work_pending + /* perform architecture specific actions before user return */ arch_ret_to_user r1, lr @@ -66,6 +71,9 @@ UNWIND(.cantunwind ) str r0, [sp, #S_R0 + S_OFF]! @ save returned r0 disable_irq_notrace @ disable interrupts + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] @ re-check for syscall tracing tst r1, #_TIF_SYSCALL_WORK | _TIF_WORK_MASK beq no_work_pending @@ -98,6 +106,9 @@ ret_slow_syscall: disable_irq_notrace @ disable interrupts ENTRY(ret_to_user_from_irq) + ldr r2, [tsk, #TI_ADDR_LIMIT] + cmp r2, #TASK_SIZE + blne addr_limit_check_failed ldr r1, [tsk, #TI_FLAGS] tst r1, #_TIF_WORK_MASK bne slow_work_pending --- linux-azure-4.13.0.orig/arch/arm/kernel/signal.c +++ linux-azure-4.13.0/arch/arm/kernel/signal.c @@ -14,6 +14,7 @@ #include #include #include +#include #include #include @@ -673,3 +674,9 @@ return page; } + +/* Defer to generic check */ +asmlinkage void addr_limit_check_failed(void) +{ + addr_limit_user_check(); +} --- linux-azure-4.13.0.orig/arch/arm/kernel/traps.c +++ linux-azure-4.13.0/arch/arm/kernel/traps.c @@ -154,30 +154,26 @@ set_fs(fs); } -static void dump_instr(const char *lvl, struct pt_regs *regs) +static void __dump_instr(const char *lvl, struct pt_regs *regs) { unsigned long addr = instruction_pointer(regs); const int thumb = thumb_mode(regs); const int width = thumb ? 4 : 8; - mm_segment_t fs; char str[sizeof("00000000 ") * 5 + 2 + 1], *p = str; int i; /* - * We need to switch to kernel mode so that we can use __get_user - * to safely read from kernel space. Note that we now dump the - * code first, just in case the backtrace kills us. + * Note that we now dump the code first, just in case the backtrace + * kills us. */ - fs = get_fs(); - set_fs(KERNEL_DS); for (i = -4; i < 1 + !!thumb; i++) { unsigned int val, bad; if (thumb) - bad = __get_user(val, &((u16 *)addr)[i]); + bad = get_user(val, &((u16 *)addr)[i]); else - bad = __get_user(val, &((u32 *)addr)[i]); + bad = get_user(val, &((u32 *)addr)[i]); if (!bad) p += sprintf(p, i == 0 ? "(%0*x) " : "%0*x ", @@ -188,8 +184,20 @@ } } printk("%sCode: %s\n", lvl, str); +} - set_fs(fs); +static void dump_instr(const char *lvl, struct pt_regs *regs) +{ + mm_segment_t fs; + + if (!user_mode(regs)) { + fs = get_fs(); + set_fs(KERNEL_DS); + __dump_instr(lvl, regs); + set_fs(fs); + } else { + __dump_instr(lvl, regs); + } } #ifdef CONFIG_ARM_UNWIND --- linux-azure-4.13.0.orig/arch/arm/kvm/emulate.c +++ linux-azure-4.13.0/arch/arm/kvm/emulate.c @@ -227,7 +227,7 @@ u32 return_offset = (is_thumb) ? 
--- linux-azure-4.13.0.orig/arch/arm/kvm/emulate.c +++ linux-azure-4.13.0/arch/arm/kvm/emulate.c @@ -227,7 +227,7 @@ u32 return_offset = (is_thumb) ? 2 : 4; kvm_update_psr(vcpu, UND_MODE); - *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) - return_offset; + *vcpu_reg(vcpu, 14) = *vcpu_pc(vcpu) + return_offset; /* Branch to exception vector */ *vcpu_pc(vcpu) = exc_vector_base(vcpu) + vect_offset; @@ -239,10 +239,8 @@ */ static void inject_abt(struct kvm_vcpu *vcpu, bool is_pabt, unsigned long addr) { - unsigned long cpsr = *vcpu_cpsr(vcpu); - bool is_thumb = (cpsr & PSR_T_BIT); u32 vect_offset; - u32 return_offset = (is_thumb) ? 4 : 0; + u32 return_offset = (is_pabt) ? 4 : 8; bool is_lpae; kvm_update_psr(vcpu, ABT_MODE); --- linux-azure-4.13.0.orig/arch/arm/kvm/handle_exit.c +++ linux-azure-4.13.0/arch/arm/kvm/handle_exit.c @@ -21,7 +21,7 @@ #include <asm/kvm_emulate.h> #include <asm/kvm_coproc.h> #include <asm/kvm_mmu.h> -#include <asm/kvm_psci.h> +#include <kvm/arm_psci.h> #include <trace/events/kvm.h> #include "trace.h" @@ -36,7 +36,7 @@ kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; - ret = kvm_psci_call(vcpu); + ret = kvm_hvc_call_handler(vcpu); if (ret < 0) { kvm_inject_undefined(vcpu); return 1; --- linux-azure-4.13.0.orig/arch/arm/kvm/hyp/Makefile +++ linux-azure-4.13.0/arch/arm/kvm/hyp/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # -ccflags-y += -fno-stack-protector +ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING KVM=../../../../virt/kvm --- linux-azure-4.13.0.orig/arch/arm/mach-highbank/Makefile +++ linux-azure-4.13.0/arch/arm/mach-highbank/Makefile @@ -1,3 +1,5 @@ +KBUILD_CFLAGS += -I$(srctree)/arch/arm/mach-highbank/include + obj-y := highbank.o system.o smc.o plus_sec := $(call as-instr,.arch_extension sec,+sec) --- linux-azure-4.13.0.orig/arch/arm/mm/fault.c +++ linux-azure-4.13.0/arch/arm/mm/fault.c @@ -315,8 +315,11 @@ * signal first. We do not need to release the mmap_sem because * it would already be released in __lock_page_or_retry in * mm/filemap.c.
*/ - if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) + if ((fault & VM_FAULT_RETRY) && fatal_signal_pending(current)) { + if (!user_mode(regs)) + goto no_context; return 0; + } /* * Major/minor page fault accounting is only done on the --- linux-azure-4.13.0.orig/arch/arm64/Kconfig +++ linux-azure-4.13.0/arch/arm64/Kconfig @@ -22,7 +22,24 @@ select ARCH_HAS_STRICT_MODULE_RWX select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_HAVE_NMI_SAFE_CMPXCHG if ACPI_APEI_SEA + select ARCH_INLINE_READ_LOCK if !PREEMPT + select ARCH_INLINE_READ_LOCK_BH if !PREEMPT + select ARCH_INLINE_READ_LOCK_IRQ if !PREEMPT + select ARCH_INLINE_READ_LOCK_IRQSAVE if !PREEMPT + select ARCH_INLINE_READ_UNLOCK if !PREEMPT + select ARCH_INLINE_READ_UNLOCK_BH if !PREEMPT + select ARCH_INLINE_READ_UNLOCK_IRQ if !PREEMPT + select ARCH_INLINE_READ_UNLOCK_IRQRESTORE if !PREEMPT + select ARCH_INLINE_WRITE_LOCK if !PREEMPT + select ARCH_INLINE_WRITE_LOCK_BH if !PREEMPT + select ARCH_INLINE_WRITE_LOCK_IRQ if !PREEMPT + select ARCH_INLINE_WRITE_LOCK_IRQSAVE if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK_BH if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK_IRQ if !PREEMPT + select ARCH_INLINE_WRITE_UNLOCK_IRQRESTORE if !PREEMPT select ARCH_USE_CMPXCHG_LOCKREF + select ARCH_USE_QUEUED_RWLOCKS select ARCH_SUPPORTS_MEMORY_FAILURE select ARCH_SUPPORTS_ATOMIC_RMW select ARCH_SUPPORTS_NUMA_BALANCING @@ -75,6 +92,7 @@ select HAVE_ARCH_SECCOMP_FILTER select HAVE_ARCH_TRACEHOOK select HAVE_ARCH_TRANSPARENT_HUGEPAGE + select HAVE_ARCH_VMAP_STACK select HAVE_ARM_SMCCC select HAVE_EBPF_JIT select HAVE_C_RECORDMCOUNT @@ -503,20 +521,13 @@ config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y - select ARM64_PAN if ARM64_SW_TTBR0_PAN help On Falkor v1, an incorrect ASID may be cached in the TLB when ASID - and BADDR are changed together in TTBRx_EL1. The workaround for this - issue is to use a reserved ASID in cpu_do_switch_mm() before - switching to the new ASID. Saying Y here selects ARM64_PAN if - ARM64_SW_TTBR0_PAN is selected. This is done because implementing and - maintaining the E1003 workaround in the software PAN emulation code - would be an unnecessary complication. The affected Falkor v1 CPU - implements ARMv8.1 hardware PAN support and using hardware PAN - support versus software PAN emulation is mutually exclusive at - runtime. - - If unsure, say Y. + and BADDR are changed together in TTBRx_EL1. Since we keep the ASID + in TTBR1_EL1, this situation only occurs in the entry trampoline and + then only for entries in the walk cache, since the leaf translation + is unchanged. Work around the erratum by invalidating the walk cache + entries for the trampoline before entering the kernel proper. config QCOM_FALKOR_ERRATUM_1009 bool "Falkor E1009: Prematurely complete a DSB after a TLBI" @@ -538,6 +549,16 @@ If unsure, say Y. +config QCOM_FALKOR_ERRATUM_E1041 + bool "Falkor E1041: Speculative instruction fetches might cause errant memory access" + default y + help + Falkor CPU may speculatively fetch instructions from an improper + memory location when MMU translation is changed from SCTLR_ELn[M]=1 + to SCTLR_ELn[M]=0. Prefix an ISB instruction to fix the problem. + + If unsure, say Y. 
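The E1041 workaround itself is a single ISB in front of the offending MSR; the pre_disable_mmu_workaround macro added to assembler.h later in this patch emits exactly that for assembly callers. In C the equivalent sequence would look roughly like this (hypothetical sketch using the sysreg accessors, not code from this patch):

	#include <asm/barrier.h>
	#include <asm/sysreg.h>

	static inline void example_mmu_off_el1(void)
	{
		u64 sctlr = read_sysreg(sctlr_el1);

		isb();	/* E1041: fence speculative fetches before M goes 1 -> 0 */
		write_sysreg(sctlr & ~SCTLR_ELx_M, sctlr_el1);
		isb();
	}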
+ endmenu @@ -782,6 +803,7 @@ config FORCE_MAX_ZONEORDER int default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE) + default "13" if (ARCH_THUNDER && ARM64_4K_PAGES) default "12" if (ARM64_16K_PAGES && TRANSPARENT_HUGEPAGE) default "11" help @@ -802,6 +824,35 @@ However for 4K, we choose a higher default value, 11 as opposed to 10, giving us 4M allocations matching the default size used by generic code. +config UNMAP_KERNEL_AT_EL0 + bool "Unmap kernel when running in userspace (aka \"KAISER\")" if EXPERT + default y + help + Speculation attacks against some high-performance processors can + be used to bypass MMU permission checks and leak kernel data to + userspace. This can be defended against by unmapping the kernel + when running in userspace, mapping it back in on exception entry + via a trampoline page in the vector table. + + If unsure, say Y. + +config HARDEN_BRANCH_PREDICTOR + bool "Harden the branch predictor against aliasing attacks" if EXPERT + default y + help + Speculation attacks against some high-performance processors rely on + being able to manipulate the branch predictor for a victim context by + executing aliasing branches in the attacker context. Such attacks + can be partially mitigated against by clearing internal branch + predictor state and limiting the prediction logic in some situations. + + This config option will take CPU-specific actions to harden the + branch predictor against aliasing attacks and may rely on specific + instruction sequences or control bits being set by the system + firmware. + + If unsure, say Y. + menuconfig ARMV8_DEPRECATED bool "Emulate deprecated/obsolete ARMv8 instructions" depends on COMPAT @@ -960,6 +1011,18 @@ regular load/store instructions if the cpu does not implement the feature. +config ARM64_PMEM + bool "Enable support for persistent memory" + select ARCH_HAS_PMEM_API + select ARCH_HAS_UACCESS_FLUSHCACHE + help + Say Y to enable support for the persistent memory API based on the + ARMv8.2 DCPoP feature. + + The feature is detected at runtime, and the kernel will use DC CVAC + operations if DC CVAP is not supported (following the behaviour of + DC CVAP itself if the system does not define a point of persistence). 
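Concretely, ARCH_HAS_PMEM_API routes the pmem driver's write-back requests into the new point-of-persistence cache maintenance declared in cacheflush.h further below (__clean_dcache_area_pop). A sketch of the arch hook, assuming it mirrors the upstream DCPoP patch this is based on:

	#include <asm/barrier.h>
	#include <asm/cacheflush.h>

	void arch_wb_cache_pmem(void *addr, size_t size)
	{
		/* Ensure order against any prior non-cacheable writes. */
		dmb(osh);
		__clean_dcache_area_pop(addr, size);
	}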
+ endmenu config ARM64_MODULE_CMODEL_LARGE @@ -1135,6 +1198,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "drivers/firmware/Kconfig" source "drivers/acpi/Kconfig" --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/hisilicon/hip06-d03.dts +++ linux-azure-4.13.0/arch/arm64/boot/dts/hisilicon/hip06-d03.dts @@ -52,3 +52,7 @@ &usb_ehci { status = "ok"; }; + +&ipmi0 { + status = "ok"; +}; --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/hisilicon/hip06.dtsi +++ linux-azure-4.13.0/arch/arm64/boot/dts/hisilicon/hip06.dtsi @@ -318,6 +318,20 @@ #size-cells = <2>; ranges; + isa@a01b0000 { + compatible = "hisilicon,hip06-lpc"; + #size-cells = <1>; + #address-cells = <2>; + reg = <0x0 0xa01b0000 0x0 0x1000>; + + ipmi0: bt@e4 { + compatible = "ipmi-bt"; + device_type = "ipmi"; + reg = <0x01 0xe4 0x04>; + status = "disabled"; + }; + }; + refclk: refclk { compatible = "fixed-clock"; clock-frequency = <50000000>; --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/hisilicon/hip07-d05.dts +++ linux-azure-4.13.0/arch/arm64/boot/dts/hisilicon/hip07-d05.dts @@ -84,3 +84,7 @@ &sas1 { status = "ok"; }; + +&ipmi0 { + status = "ok"; +}; --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/hisilicon/hip07.dtsi +++ linux-azure-4.13.0/arch/arm64/boot/dts/hisilicon/hip07.dtsi @@ -1089,6 +1089,20 @@ #size-cells = <2>; ranges; + isa@a01b0000 { + compatible = "hisilicon,hip07-lpc"; + #size-cells = <1>; + #address-cells = <2>; + reg = <0x0 0xa01b0000 0x0 0x1000>; + + ipmi0: bt@e4 { + compatible = "ipmi-bt"; + device_type = "ipmi"; + reg = <0x01 0xe4 0x04>; + status = "disabled"; + }; + }; + uart0: uart@602b0000 { compatible = "arm,sbsa-uart"; reg = <0x0 0x602b0000 0x0 0x1000>; --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ linux-azure-4.13.0/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -323,6 +323,7 @@ interrupt-controller; reg = <0x1d00000 0x10000>, /* GICD */ <0x1d40000 0x40000>; /* GICR */ + interrupts = <GIC_PPI 9 IRQ_TYPE_LEVEL_HIGH>; }; }; --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/marvell/armada-ap806.dtsi +++ linux-azure-4.13.0/arch/arm64/boot/dts/marvell/armada-ap806.dtsi @@ -254,7 +254,7 @@ ap_syscon: system-controller@6f4000 { compatible = "syscon", "simple-mfd"; - reg = <0x6f4000 0x1000>; + reg = <0x6f4000 0x2000>; ap_clk: clock { compatible = "marvell,ap806-clock"; @@ -265,7 +265,7 @@ compatible = "marvell,ap806-pinctrl"; }; - ap_gpio: gpio { + ap_gpio: gpio@1040 { compatible = "marvell,armada-8k-gpio"; offset = <0x1040>; ngpios = <20>; --- linux-azure-4.13.0.orig/arch/arm64/boot/dts/rockchip/rk3399-firefly.dts +++ linux-azure-4.13.0/arch/arm64/boot/dts/rockchip/rk3399-firefly.dts @@ -370,10 +370,10 @@ regulator-always-on; regulator-boot-on; regulator-min-microvolt = <1800000>; - regulator-max-microvolt = <3300000>; + regulator-max-microvolt = <3000000>; regulator-state-mem { regulator-on-in-suspend; - regulator-suspend-microvolt = <3300000>; + regulator-suspend-microvolt = <3000000>; }; }; --- linux-azure-4.13.0.orig/arch/arm64/include/asm/Kbuild +++ linux-azure-4.13.0/arch/arm64/include/asm/Kbuild @@ -16,6 +16,7 @@ generic-y += mm-arch-hooks.h generic-y += msi.h generic-y += preempt.h +generic-y += qrwlock.h generic-y += rwsem.h generic-y += segment.h generic-y += serial.h
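Together with the ARCH_USE_QUEUED_RWLOCKS select in the Kconfig hunk earlier, the generic-y qrwlock.h line retires the hand-rolled LL/SC read/write locks (deleted from spinlock.h near the end of this series) in favour of the fair, queued generic implementation. Callers need no changes; the standard API now compiles down to qrwlock primitives. A minimal, hypothetical usage sketch:

	#include <linux/spinlock.h>

	static DEFINE_RWLOCK(example_lock);
	static int example_value;

	int example_get(void)
	{
		int v;

		read_lock(&example_lock);	/* queued and writer-fair on arm64 now */
		v = example_value;
		read_unlock(&example_lock);
		return v;
	}

	void example_set(int v)
	{
		write_lock(&example_lock);
		example_value = v;
		write_unlock(&example_lock);
	}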
--- linux-azure-4.13.0.orig/arch/arm64/include/asm/asm-bug.h +++ linux-azure-4.13.0/arch/arm64/include/asm/asm-bug.h @@ -0,0 +1,54 @@ +#ifndef __ASM_ASM_BUG_H +/* + * Copyright (C) 2017 ARM Limited + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>. + */ +#define __ASM_ASM_BUG_H + +#include <asm/brk-imm.h> + +#ifdef CONFIG_DEBUG_BUGVERBOSE +#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) +#define __BUGVERBOSE_LOCATION(file, line) \ + .pushsection .rodata.str,"aMS",@progbits,1; \ + 2: .string file; \ + .popsection; \ + \ + .long 2b - 0b; \ + .short line; +#else +#define _BUGVERBOSE_LOCATION(file, line) +#endif + +#ifdef CONFIG_GENERIC_BUG + +#define __BUG_ENTRY(flags) \ + .pushsection __bug_table,"aw"; \ + .align 2; \ + 0: .long 1f - 0b; \ +_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ + .short flags; \ + .popsection; \ + 1: +#else +#define __BUG_ENTRY(flags) +#endif + +#define ASM_BUG_FLAGS(flags) \ + __BUG_ENTRY(flags) \ + brk BUG_BRK_IMM + +#define ASM_BUG() ASM_BUG_FLAGS(0) + +#endif /* __ASM_ASM_BUG_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/asm-uaccess.h +++ linux-azure-4.13.0/arch/arm64/include/asm/asm-uaccess.h @@ -3,6 +3,7 @@ #include <asm/alternative.h> #include <asm/kernel-pgtable.h> +#include <asm/mmu.h> #include <asm/sysreg.h> #include <asm/assembler.h> @@ -12,51 +13,62 @@ #ifdef CONFIG_ARM64_SW_TTBR0_PAN .macro __uaccess_ttbr0_disable, tmp1 mrs \tmp1, ttbr1_el1 // swapper_pg_dir + bic \tmp1, \tmp1, #TTBR_ASID_MASK add \tmp1, \tmp1, #SWAPPER_DIR_SIZE // reserved_ttbr0 at the end of swapper_pg_dir msr ttbr0_el1, \tmp1 // set reserved TTBR0_EL1 isb + sub \tmp1, \tmp1, #SWAPPER_DIR_SIZE + msr ttbr1_el1, \tmp1 // set reserved ASID + isb .endm - .macro __uaccess_ttbr0_enable, tmp1 + .macro __uaccess_ttbr0_enable, tmp1, tmp2 get_thread_info \tmp1 ldr \tmp1, [\tmp1, #TSK_TI_TTBR0] // load saved TTBR0_EL1 + mrs \tmp2, ttbr1_el1 + extr \tmp2, \tmp2, \tmp1, #48 + ror \tmp2, \tmp2, #16 + msr ttbr1_el1, \tmp2 // set the active ASID + isb msr ttbr0_el1, \tmp1 // set the non-PAN TTBR0_EL1 isb .endm - .macro uaccess_ttbr0_disable, tmp1 + .macro uaccess_ttbr0_disable, tmp1, tmp2 alternative_if_not ARM64_HAS_PAN + save_and_disable_irq \tmp2 // avoid preemption __uaccess_ttbr0_disable \tmp1 + restore_irq \tmp2 alternative_else_nop_endif .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 alternative_if_not ARM64_HAS_PAN - save_and_disable_irq \tmp2 // avoid preemption - __uaccess_ttbr0_enable \tmp1 - restore_irq \tmp2 + save_and_disable_irq \tmp3 // avoid preemption + __uaccess_ttbr0_enable \tmp1, \tmp2 + restore_irq \tmp3 alternative_else_nop_endif .endm #else - .macro uaccess_ttbr0_disable, tmp1 + .macro uaccess_ttbr0_disable, tmp1, tmp2 .endm - .macro uaccess_ttbr0_enable, tmp1, tmp2 + .macro uaccess_ttbr0_enable, tmp1, tmp2, tmp3 .endm #endif /* * These macros are no-ops when UAO is present.
*/ - .macro uaccess_disable_not_uao, tmp1 - uaccess_ttbr0_disable \tmp1 + .macro uaccess_disable_not_uao, tmp1, tmp2 + uaccess_ttbr0_disable \tmp1, \tmp2 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(1) alternative_else_nop_endif .endm - .macro uaccess_enable_not_uao, tmp1, tmp2 - uaccess_ttbr0_enable \tmp1, \tmp2 + .macro uaccess_enable_not_uao, tmp1, tmp2, tmp3 + uaccess_ttbr0_enable \tmp1, \tmp2, \tmp3 alternative_if ARM64_ALT_PAN_NOT_UAO SET_PSTATE_PAN(0) alternative_else_nop_endif --- linux-azure-4.13.0.orig/arch/arm64/include/asm/assembler.h +++ linux-azure-4.13.0/arch/arm64/include/asm/assembler.h @@ -25,7 +25,6 @@ #include <asm/asm-offsets.h> #include <asm/cpufeature.h> -#include <asm/mmu_context.h> #include <asm/page.h> #include <asm/pgtable-hwdef.h> #include <asm/ptrace.h> @@ -97,6 +96,24 @@ .endm /* + * Value prediction barrier + */ + .macro csdb + hint #20 + .endm + +/* + * Sanitise a 64-bit bounded index wrt speculation, returning zero if out + * of bounds. + */ + .macro mask_nospec64, idx, limit, tmp + sub \tmp, \idx, \limit + bic \tmp, \tmp, \idx + and \idx, \idx, \tmp, asr #63 + csdb + .endm + +/* * NOP sequence */ .macro nops, num @@ -230,12 +247,18 @@ .endm /* - * @dst: Result of per_cpu(sym, smp_processor_id()) + * @dst: Result of per_cpu(sym, smp_processor_id()), can be SP for + * non-module code * @sym: The name of the per-cpu variable * @tmp: scratch register */ .macro adr_this_cpu, dst, sym, tmp +#ifndef MODULE + adrp \tmp, \sym + add \dst, \tmp, #:lo12:\sym +#else adr_l \dst, \sym +#endif mrs \tmp, tpidr_el1 add \dst, \dst, \tmp .endm @@ -353,6 +376,12 @@ alternative_else dc civac, \kaddr alternative_endif + .elseif (\op == cvap) +alternative_if ARM64_HAS_DCPOP + sys 3, c7, c12, 1, \kaddr // dc cvap +alternative_else + dc cvac, \kaddr +alternative_endif .else dc \op, \kaddr .endif @@ -403,6 +432,17 @@ .size __pi_##x, . - x; \ ENDPROC(x) +/* + * Annotate a function as being unsuitable for kprobes. + */ +#ifdef CONFIG_KPROBES +#define NOKPROBE(x) \ + .pushsection "_kprobe_blacklist", "aw"; \ + .quad x; \ + .popsection; +#else +#define NOKPROBE(x) +#endif /* * Emit a 64-bit absolute little endian symbol reference in a way that * ensures that it will be resolved at build time, even when building a @@ -441,39 +481,18 @@ mrs \rd, sp_el0 .endm -/* - * Errata workaround prior to TTBR0_EL1 update - * - * val: TTBR value with new BADDR, preserved - * tmp0: temporary register, clobbered - * tmp1: other temporary register, clobbered - */ - .macro pre_ttbr0_update_workaround, val, tmp0, tmp1 -#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 -alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003 - mrs \tmp0, ttbr0_el1 - mov \tmp1, #FALKOR_RESERVED_ASID - bfi \tmp0, \tmp1, #48, #16 // reserved ASID + old BADDR - msr ttbr0_el1, \tmp0 - isb - bfi \tmp0, \val, #0, #48 // reserved ASID + new BADDR - msr ttbr0_el1, \tmp0 +/* + * Errata workaround prior to disabling the MMU. Insert an ISB immediately + * prior to executing the MSR that will change SCTLR_ELn[M] from 1 to 0. + */ + .macro pre_disable_mmu_workaround +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_E1041 isb -alternative_else_nop_endif #endif .endm -/* - * Errata workaround post TTBR0_EL1 update. - */ - .macro post_ttbr0_update_workaround -#ifdef CONFIG_CAVIUM_ERRATUM_27456 -alternative_if ARM64_WORKAROUND_CAVIUM_27456 - ic iallu - dsb nsh - isb -alternative_else_nop_endif -#endif + .macro pte_to_phys, phys, pte + and \phys, \pte, #(((1 << (48 - PAGE_SHIFT)) - 1) << PAGE_SHIFT) .endm #endif /* __ASM_ASSEMBLER_H */
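The csdb barrier and mask_nospec64 macro above are the assembly half of the Spectre-v1 defence; the barrier.h hunk below adds the C-level array_index_mask_nospec() that backs the generic array_index_nospec() helper. A brief usage sketch (hypothetical driver code, assuming the generic wrapper from linux/nospec.h that this series also provides):

	#include <linux/kernel.h>
	#include <linux/nospec.h>

	static int example_table[16];

	int example_read_entry(unsigned long idx)
	{
		if (idx >= ARRAY_SIZE(example_table))
			return -EINVAL;
		/*
		 * Even if the CPU speculates past the bounds check above,
		 * array_index_nospec() clamps idx to 0 on the mispredicted
		 * path via the arch's array_index_mask_nospec().
		 */
		idx = array_index_nospec(idx, ARRAY_SIZE(example_table));
		return example_table[idx];
	}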
--- linux-azure-4.13.0.orig/arch/arm64/include/asm/barrier.h +++ linux-azure-4.13.0/arch/arm64/include/asm/barrier.h @@ -31,6 +31,8 @@ #define dmb(opt) asm volatile("dmb " #opt : : : "memory") #define dsb(opt) asm volatile("dsb " #opt : : : "memory") +#define csdb() asm volatile("hint #20" : : : "memory") + #define mb() dsb(sy) #define rmb() dsb(ld) #define wmb() dsb(st) @@ -38,6 +40,27 @@ #define dma_rmb() dmb(oshld) #define dma_wmb() dmb(oshst) +/* + * Generate a mask for array_index_nospec() that is ~0UL when 0 <= idx < sz + * and 0 otherwise. + */ +#define array_index_mask_nospec array_index_mask_nospec +static inline unsigned long array_index_mask_nospec(unsigned long idx, + unsigned long sz) +{ + unsigned long mask; + + asm volatile( + " cmp %1, %2\n" + " sbc %0, xzr, xzr\n" + : "=r" (mask) + : "r" (idx), "Ir" (sz) + : "cc"); + + csdb(); + return mask; +} + #define __smp_mb() dmb(ish) #define __smp_rmb() dmb(ishld) #define __smp_wmb() dmb(ishst) --- linux-azure-4.13.0.orig/arch/arm64/include/asm/bug.h +++ linux-azure-4.13.0/arch/arm64/include/asm/bug.h @@ -18,41 +18,12 @@ #ifndef _ARCH_ARM64_ASM_BUG_H #define _ARCH_ARM64_ASM_BUG_H -#include <asm/brk-imm.h> +#include <linux/stringify.h> -#ifdef CONFIG_DEBUG_BUGVERBOSE -#define _BUGVERBOSE_LOCATION(file, line) __BUGVERBOSE_LOCATION(file, line) -#define __BUGVERBOSE_LOCATION(file, line) \ - ".pushsection .rodata.str,\"aMS\",@progbits,1\n" \ - "2: .string \"" file "\"\n\t" \ - ".popsection\n\t" \ - \ - ".long 2b - 0b\n\t" \ - ".short " #line "\n\t" -#else -#define _BUGVERBOSE_LOCATION(file, line) -#endif - -#ifdef CONFIG_GENERIC_BUG - -#define __BUG_ENTRY(flags) \ - ".pushsection __bug_table,\"aw\"\n\t" \ - ".align 2\n\t" \ - "0: .long 1f - 0b\n\t" \ -_BUGVERBOSE_LOCATION(__FILE__, __LINE__) \ - ".short " #flags "\n\t" \ - ".popsection\n" \ - "1: " -#else -#define __BUG_ENTRY(flags) "" -#endif +#include <asm/asm-bug.h> #define __BUG_FLAGS(flags) \ - asm volatile ( \ - __BUG_ENTRY(flags) \ - "brk %[imm]" :: [imm] "i" (BUG_BRK_IMM) \ - ); - + asm volatile (__stringify(ASM_BUG_FLAGS(flags))); #define BUG() do { \ __BUG_FLAGS(0); \ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/cacheflush.h +++ linux-azure-4.13.0/arch/arm64/include/asm/cacheflush.h @@ -67,7 +67,9 @@ */ extern void flush_icache_range(unsigned long start, unsigned long end); extern void __flush_dcache_area(void *addr, size_t len); +extern void __inval_dcache_area(void *addr, size_t len); extern void __clean_dcache_area_poc(void *addr, size_t len); +extern void __clean_dcache_area_pop(void *addr, size_t len); extern void __clean_dcache_area_pou(void *addr, size_t len); extern long __flush_cache_user_range(unsigned long start, unsigned long end); extern void sync_icache_aliases(void *kaddr, unsigned long len); @@ -150,6 +152,6 @@ { } -int set_memory_valid(unsigned long addr, unsigned long size, int enable); +int set_memory_valid(unsigned long addr, int numpages, int enable); #endif --- linux-azure-4.13.0.orig/arch/arm64/include/asm/cpucaps.h +++ linux-azure-4.13.0/arch/arm64/include/asm/cpucaps.h @@ -39,7 +39,11 @@ #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 #define ARM64_WORKAROUND_858921 19 #define ARM64_WORKAROUND_CAVIUM_30115 20 +#define ARM64_HAS_DCPOP 21 +#define ARM64_UNMAP_KERNEL_AT_EL0 23
+#define ARM64_HARDEN_BRANCH_PREDICTOR 24 +#define ARM64_HARDEN_BP_POST_GUEST_EXIT 25 -#define ARM64_NCAPS 21 +#define ARM64_NCAPS 26 #endif /* __ASM_CPUCAPS_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/cputype.h +++ linux-azure-4.13.0/arch/arm64/include/asm/cputype.h @@ -79,26 +79,37 @@ #define ARM_CPU_PART_AEM_V8 0xD0F #define ARM_CPU_PART_FOUNDATION 0xD00 #define ARM_CPU_PART_CORTEX_A57 0xD07 +#define ARM_CPU_PART_CORTEX_A72 0xD08 #define ARM_CPU_PART_CORTEX_A53 0xD03 #define ARM_CPU_PART_CORTEX_A73 0xD09 +#define ARM_CPU_PART_CORTEX_A75 0xD0A #define APM_CPU_PART_POTENZA 0x000 #define CAVIUM_CPU_PART_THUNDERX 0x0A1 #define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 #define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 +#define CAVIUM_CPU_PART_THUNDERX2 0x0AF #define BRCM_CPU_PART_VULCAN 0x516 #define QCOM_CPU_PART_FALKOR_V1 0x800 +#define QCOM_CPU_PART_FALKOR 0xC00 +#define QCOM_CPU_PART_KRYO 0x200 #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_CORTEX_A72 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A72) #define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) +#define MIDR_CORTEX_A75 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A75) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) #define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) +#define MIDR_CAVIUM_THUNDERX2 MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX2) +#define MIDR_BRCM_VULCAN MIDR_CPU_MODEL(ARM_CPU_IMP_BRCM, BRCM_CPU_PART_VULCAN) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) +#define MIDR_QCOM_FALKOR MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR) +#define MIDR_QCOM_KRYO MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_KRYO) #ifndef __ASSEMBLY__ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/efi.h +++ linux-azure-4.13.0/arch/arm64/include/asm/efi.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -48,6 +49,13 @@ */ #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ +/* + * In some configurations (e.g. VMAP_STACK && 64K pages), stacks built into the + * kernel need greater alignment than we require the segments to be padded to. + */ +#define EFI_KIMG_ALIGN \ + (SEGMENT_ALIGN > THREAD_ALIGN ? SEGMENT_ALIGN : THREAD_ALIGN) + /* on arm64, the FDT may be located anywhere in system RAM */ static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) { @@ -108,22 +116,22 @@ if (mm != current->active_mm) { /* * Update the current thread's saved ttbr0 since it is - * restored as part of a return from exception. Set - * the hardware TTBR0_EL1 using cpu_switch_mm() - * directly to enable potential errata workarounds. + * restored as part of a return from exception. Enable + * access to the valid TTBR0_EL1 and invoke the errata + * workaround directly since there is no return from + * exception when invoking the EFI run-time services. */ update_saved_ttbr0(current, mm); - cpu_switch_mm(mm->pgd, mm); + uaccess_ttbr0_enable(); + post_ttbr_update_workaround(); } else { /* * Defer the switch to the current thread's TTBR0_EL1 * until uaccess_enable(). Restore the current * thread's saved ttbr0 corresponding to its active_mm - * (if different from init_mm). 
*/ - cpu_set_reserved_ttbr0(); - if (current->active_mm != &init_mm) - update_saved_ttbr0(current, current->active_mm); + uaccess_ttbr0_disable(); + update_saved_ttbr0(current, current->active_mm); } } } --- linux-azure-4.13.0.orig/arch/arm64/include/asm/elf.h +++ linux-azure-4.13.0/arch/arm64/include/asm/elf.h @@ -139,7 +139,6 @@ #define SET_PERSONALITY(ex) \ ({ \ - clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ clear_thread_flag(TIF_32BIT); \ current->personality &= ~READ_IMPLIES_EXEC; \ }) @@ -195,7 +194,6 @@ */ #define COMPAT_SET_PERSONALITY(ex) \ ({ \ - set_bit(TIF_32BIT, ¤t->mm->context.flags); \ set_thread_flag(TIF_32BIT); \ }) #define COMPAT_ARCH_DLINFO --- linux-azure-4.13.0.orig/arch/arm64/include/asm/esr.h +++ linux-azure-4.13.0/arch/arm64/include/asm/esr.h @@ -157,9 +157,10 @@ /* * User space cache operations have the following sysreg encoding * in System instructions. - * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 14 }, WRITE (L=0) + * op0=1, op1=3, op2=1, crn=7, crm={ 5, 10, 11, 12, 14 }, WRITE (L=0) */ #define ESR_ELx_SYS64_ISS_CRM_DC_CIVAC 14 +#define ESR_ELx_SYS64_ISS_CRM_DC_CVAP 12 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAU 11 #define ESR_ELx_SYS64_ISS_CRM_DC_CVAC 10 #define ESR_ELx_SYS64_ISS_CRM_IC_IVAU 5 --- linux-azure-4.13.0.orig/arch/arm64/include/asm/fixmap.h +++ linux-azure-4.13.0/arch/arm64/include/asm/fixmap.h @@ -51,6 +51,18 @@ FIX_EARLYCON_MEM_BASE, FIX_TEXT_POKE0, + +#ifdef CONFIG_ACPI_APEI_GHES + /* Used for GHES mapping from assorted contexts */ + FIX_APEI_GHES_IRQ, + FIX_APEI_GHES_NMI, +#endif /* CONFIG_ACPI_APEI_GHES */ + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + FIX_ENTRY_TRAMP_DATA, + FIX_ENTRY_TRAMP_TEXT, +#define TRAMP_VALIAS (__fix_to_virt(FIX_ENTRY_TRAMP_TEXT)) +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ __end_of_permanent_fixed_addresses, /* --- linux-azure-4.13.0.orig/arch/arm64/include/asm/futex.h +++ linux-azure-4.13.0/arch/arm64/include/asm/futex.h @@ -48,19 +48,10 @@ } while (0) static inline int -futex_atomic_op_inuser(unsigned int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *_uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (int)(encoded_op << 8) >> 20; - int cmparg = (int)(encoded_op << 20) >> 20; int oldval = 0, ret, tmp; - - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1U << (oparg & 0x1f); - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; + u32 __user *uaddr = __uaccess_mask_ptr(_uaddr); pagefault_disable(); @@ -91,30 +82,24 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } static inline int -futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, +futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *_uaddr, u32 oldval, u32 newval) { int ret = 0; u32 val, tmp; + u32 __user *uaddr; - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) + if (!access_ok(VERIFY_WRITE, _uaddr, sizeof(u32))) return -EFAULT; + uaddr = __uaccess_mask_ptr(_uaddr); uaccess_enable(); asm volatile("// futex_atomic_cmpxchg_inatomic\n" " prfm pstl1strm, %2\n" --- linux-azure-4.13.0.orig/arch/arm64/include/asm/irq.h +++ 
linux-azure-4.13.0/arch/arm64/include/asm/irq.h @@ -1,45 +1,12 @@ #ifndef __ASM_IRQ_H #define __ASM_IRQ_H -#define IRQ_STACK_SIZE THREAD_SIZE -#define IRQ_STACK_START_SP THREAD_START_SP - #ifndef __ASSEMBLER__ -#include <linux/percpu.h> - #include <asm-generic/irq.h> -#include <asm/thread_info.h> struct pt_regs; -DECLARE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); - -/* - * The highest address on the stack, and the first to be used. Used to - * find the dummy-stack frame put down by el?_irq() in entry.S, which - * is structured as follows: - * - * ------------ - * | | <- irq_stack_ptr - * top ------------ - * | x19 | <- irq_stack_ptr - 0x08 - * ------------ - * | x29 | <- irq_stack_ptr - 0x10 - * ------------ - * - * where x19 holds a copy of the task stack pointer where the struct pt_regs - * from kernel_entry can be found. - * - */ -#define IRQ_STACK_PTR(cpu) ((unsigned long)per_cpu(irq_stack, cpu) + IRQ_STACK_START_SP) - -/* - * The offset from irq_stack_ptr where entry.S will store the original - * stack pointer. Used by unwind_frame() and dump_backtrace(). - */ -#define IRQ_STACK_TO_TASK_STACK(ptr) (*((unsigned long *)((ptr) - 0x08))) - extern void set_handle_irq(void (*handle_irq)(struct pt_regs *)); static inline int nr_legacy_irqs(void) @@ -47,14 +14,5 @@ return 0; } -static inline bool on_irq_stack(unsigned long sp, int cpu) -{ - /* variable names the same as kernel/stacktrace.c */ - unsigned long low = (unsigned long)per_cpu(irq_stack, cpu); - unsigned long high = low + IRQ_STACK_START_SP; - - return (low <= sp && sp <= high); -} - #endif /* !__ASSEMBLER__ */ #endif --- linux-azure-4.13.0.orig/arch/arm64/include/asm/kvm_asm.h +++ linux-azure-4.13.0/arch/arm64/include/asm/kvm_asm.h @@ -66,6 +66,8 @@ extern u32 __init_stage2_translation(void); +extern void __qcom_hyp_sanitize_btac_predictors(void); + #endif #endif /* __ARM_KVM_ASM_H__ */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/kvm_emulate.h +++ linux-azure-4.13.0/arch/arm64/include/asm/kvm_emulate.h @@ -188,11 +188,6 @@ return (kvm_vcpu_get_hsr(vcpu) & ESR_ELx_SRT_MASK) >> ESR_ELx_SRT_SHIFT; } -static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu) -{ - return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_EA); -} - static inline bool kvm_vcpu_dabt_iss1tw(const struct kvm_vcpu *vcpu) { return !!(kvm_vcpu_get_hsr(vcpu) & ESR_ELx_S1PTW); @@ -240,6 +235,25 @@ return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE; } +static inline bool kvm_vcpu_dabt_isextabt(const struct kvm_vcpu *vcpu) +{ + switch (kvm_vcpu_trap_get_fault_type(vcpu)) { + case FSC_SEA: + case FSC_SEA_TTW0: + case FSC_SEA_TTW1: + case FSC_SEA_TTW2: + case FSC_SEA_TTW3: + case FSC_SECC: + case FSC_SECC_TTW0: + case FSC_SECC_TTW1: + case FSC_SECC_TTW2: + case FSC_SECC_TTW3: + return true; + default: + return false; + } +} + static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) { u32 esr = kvm_vcpu_get_hsr(vcpu); --- linux-azure-4.13.0.orig/arch/arm64/include/asm/kvm_host.h +++ linux-azure-4.13.0/arch/arm64/include/asm/kvm_host.h @@ -384,4 +384,9 @@ "PARange is %d bits, unsupported configuration!", parange); } +static inline bool kvm_arm_harden_branch_predictor(void) +{ + return cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR); +} + #endif /* __ARM64_KVM_HOST_H__ */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/kvm_mmu.h +++ linux-azure-4.13.0/arch/arm64/include/asm/kvm_mmu.h @@ -175,18 +175,15 @@ static inline void kvm_set_s2pte_readonly(pte_t *pte) { - pteval_t pteval; - unsigned long tmp; + pteval_t old_pteval, pteval; - asm volatile("// kvm_set_s2pte_readonly\n" - " prfm pstl1strm, %2\n"
- "1: ldxr %0, %2\n" - " and %0, %0, %3 // clear PTE_S2_RDWR\n" - " orr %0, %0, %4 // set PTE_S2_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*pte)) - : "L" (~PTE_S2_RDWR), "L" (PTE_S2_RDONLY)); + pteval = READ_ONCE(pte_val(*pte)); + do { + old_pteval = pteval; + pteval &= ~PTE_S2_RDWR; + pteval |= PTE_S2_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*pte), old_pteval, pteval); + } while (pteval != old_pteval); } static inline bool kvm_s2pte_readonly(pte_t *pte) @@ -312,5 +309,43 @@ return (cpuid_feature_extract_unsigned_field(reg, ID_AA64MMFR1_VMIDBITS_SHIFT) == 2) ? 16 : 8; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include <asm/mmu.h> + +static inline void *kvm_get_hyp_vector(void) +{ + struct bp_hardening_data *data = arm64_get_bp_hardening_data(); + void *vect = kvm_ksym_ref(__kvm_hyp_vector); + + if (data->fn) { + vect = __bp_harden_hyp_vecs_start + + data->hyp_vectors_slot * SZ_2K; + + if (!has_vhe()) + vect = lm_alias(vect); + } + + return vect; +} + +static inline int kvm_map_vectors(void) +{ + return create_hyp_mappings(kvm_ksym_ref(__bp_harden_hyp_vecs_start), + kvm_ksym_ref(__bp_harden_hyp_vecs_end), + PAGE_HYP_EXEC); +} + +#else +static inline void *kvm_get_hyp_vector(void) +{ + return kvm_ksym_ref(__kvm_hyp_vector); +} + +static inline int kvm_map_vectors(void) +{ + return 0; +} +#endif + #endif /* __ASSEMBLY__ */ #endif /* __ARM64_KVM_MMU_H__ */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/memory.h +++ linux-azure-4.13.0/arch/arm64/include/asm/memory.h @@ -25,6 +25,7 @@ #include #include #include +#include <asm/page-def.h> #include /* @@ -60,8 +61,6 @@ * KIMAGE_VADDR - the virtual address of the start of the kernel image * VA_BITS - the maximum number of bits for virtual addresses. * VA_START - the first kernel virtual address. - * TASK_SIZE - the maximum size of a user space task. - * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. */ #define VA_BITS (CONFIG_ARM64_VA_BITS) #define VA_START (UL(0xffffffffffffffff) - \ @@ -76,31 +75,73 @@ #define PCI_IO_END (VMEMMAP_START - SZ_2M) #define PCI_IO_START (PCI_IO_END - PCI_IO_SIZE) #define FIXADDR_TOP (PCI_IO_START - SZ_2M) -#define TASK_SIZE_64 (UL(1) << VA_BITS) - -#ifdef CONFIG_COMPAT -#define TASK_SIZE_32 UL(0x100000000) -#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ - TASK_SIZE_32 : TASK_SIZE_64) -#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? \ - TASK_SIZE_32 : TASK_SIZE_64) -#else -#define TASK_SIZE TASK_SIZE_64 -#endif /* CONFIG_COMPAT */ - -#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) #define KERNEL_START _text #define KERNEL_END _end /* - * The size of the KASAN shadow region. This should be 1/8th of the - * size of the entire kernel virtual address space. + * KASAN requires 1/8th of the kernel virtual address space for the shadow + * region. KASAN can bloat the stack significantly, so double the (minimum) + * stack size when KASAN is in use. */ #ifdef CONFIG_KASAN #define KASAN_SHADOW_SIZE (UL(1) << (VA_BITS - 3)) +#define KASAN_THREAD_SHIFT 1 #else #define KASAN_SHADOW_SIZE (0) +#define KASAN_THREAD_SHIFT 0 +#endif + +#define MIN_THREAD_SHIFT (14 + KASAN_THREAD_SHIFT) + +/* + * VMAP'd stacks are allocated at page granularity, so we must ensure that such + * stacks are a multiple of page size.
+ */ +#if defined(CONFIG_VMAP_STACK) && (MIN_THREAD_SHIFT < PAGE_SHIFT) +#define THREAD_SHIFT PAGE_SHIFT +#else +#define THREAD_SHIFT MIN_THREAD_SHIFT +#endif + +#if THREAD_SHIFT >= PAGE_SHIFT +#define THREAD_SIZE_ORDER (THREAD_SHIFT - PAGE_SHIFT) +#endif + +#define THREAD_SIZE (UL(1) << THREAD_SHIFT) + +/* + * By aligning VMAP'd stacks to 2 * THREAD_SIZE, we can detect overflow by + * checking sp & (1 << THREAD_SHIFT), which we can do cheaply in the entry + * assembly. + */ +#ifdef CONFIG_VMAP_STACK +#define THREAD_ALIGN (2 * THREAD_SIZE) +#else +#define THREAD_ALIGN THREAD_SIZE +#endif + +#define IRQ_STACK_SIZE THREAD_SIZE + +#define OVERFLOW_STACK_SIZE SZ_4K + +/* + * Alignment of kernel segments (e.g. .text, .data). + */ +#if defined(CONFIG_DEBUG_ALIGN_RODATA) +/* + * 4 KB granule: 1 level 2 entry + * 16 KB granule: 128 level 3 entries, with contiguous bit + * 64 KB granule: 32 level 3 entries, with contiguous bit + */ +#define SEGMENT_ALIGN SZ_2M +#else +/* + * 4 KB granule: 16 level 3 entries, with contiguous bit + * 16 KB granule: 4 level 3 entries, without contiguous bit + * 64 KB granule: 1 level 3 entry + */ +#define SEGMENT_ALIGN SZ_64K #endif /* --- linux-azure-4.13.0.orig/arch/arm64/include/asm/mmu.h +++ linux-azure-4.13.0/arch/arm64/include/asm/mmu.h @@ -16,6 +16,12 @@ #ifndef __ASM_MMU_H #define __ASM_MMU_H +#define MMCF_AARCH32 0x1 /* mm context flag for AArch32 executables */ +#define USER_ASID_FLAG (UL(1) << 48) +#define TTBR_ASID_MASK (UL(0xffff) << 48) + +#ifndef __ASSEMBLY__ + typedef struct { atomic64_t id; void *vdso; @@ -29,6 +35,49 @@ */ #define ASID(mm) ((mm)->context.id.counter & 0xffff) +static inline bool arm64_kernel_unmapped_at_el0(void) +{ + return IS_ENABLED(CONFIG_UNMAP_KERNEL_AT_EL0) && + cpus_have_const_cap(ARM64_UNMAP_KERNEL_AT_EL0); +} + +typedef void (*bp_hardening_cb_t)(void); + +struct bp_hardening_data { + int hyp_vectors_slot; + bp_hardening_cb_t fn; +}; + +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +extern char __bp_harden_hyp_vecs_start[], __bp_harden_hyp_vecs_end[]; + +DECLARE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) +{ + return this_cpu_ptr(&bp_hardening_data); +} + +static inline void arm64_apply_bp_hardening(void) +{ + struct bp_hardening_data *d; + + if (!cpus_have_const_cap(ARM64_HARDEN_BRANCH_PREDICTOR)) + return; + + d = arm64_get_bp_hardening_data(); + if (d->fn) + d->fn(); +} +#else +static inline struct bp_hardening_data *arm64_get_bp_hardening_data(void) +{ + return NULL; +} + +static inline void arm64_apply_bp_hardening(void) { } +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + extern void paging_init(void); extern void bootmem_init(void); extern void __iomem *early_io_map(phys_addr_t phys, unsigned long virt); @@ -39,4 +88,5 @@ extern void *fixmap_remap_fdt(phys_addr_t dt_phys); extern void mark_linear_text_alias_ro(void); +#endif /* !__ASSEMBLY__ */ #endif --- linux-azure-4.13.0.orig/arch/arm64/include/asm/mmu_context.h +++ linux-azure-4.13.0/arch/arm64/include/asm/mmu_context.h @@ -19,8 +19,6 @@ #ifndef __ASM_MMU_CONTEXT_H #define __ASM_MMU_CONTEXT_H -#define FALKOR_RESERVED_ASID 1 - #ifndef __ASSEMBLY__ #include @@ -57,6 +55,13 @@ isb(); } +static inline void cpu_switch_mm(pgd_t *pgd, struct mm_struct *mm) +{ + BUG_ON(pgd == swapper_pg_dir); + cpu_set_reserved_ttbr0(); + cpu_do_switch_mm(virt_to_phys(pgd),mm); +} + /* * TCR.T0SZ value to use when the ID map is active. 
Usually equals * TCR_T0SZ(VA_BITS), unless system RAM is positioned very high in @@ -156,29 +161,21 @@ #define init_new_context(tsk,mm) ({ atomic64_set(&(mm)->context.id, 0); 0; }) -/* - * This is called when "tsk" is about to enter lazy TLB mode. - * - * mm: describes the currently active mm context - * tsk: task which is entering lazy tlb - * cpu: cpu number which is entering lazy tlb - * - * tsk->mm will be NULL - */ -static inline void -enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ -} - #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void update_saved_ttbr0(struct task_struct *tsk, struct mm_struct *mm) { - if (system_uses_ttbr0_pan()) { - BUG_ON(mm->pgd == swapper_pg_dir); - task_thread_info(tsk)->ttbr0 = - virt_to_phys(mm->pgd) | ASID(mm) << 48; - } + u64 ttbr; + + if (!system_uses_ttbr0_pan()) + return; + + if (mm == &init_mm) + ttbr = __pa_symbol(empty_zero_page); + else + ttbr = virt_to_phys(mm->pgd) | ASID(mm) << 48; + + WRITE_ONCE(task_thread_info(tsk)->ttbr0, ttbr); } #else static inline void update_saved_ttbr0(struct task_struct *tsk, @@ -187,6 +184,16 @@ } #endif +static inline void +enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) +{ + /* + * We don't actually care about the ttbr0 mapping, so point it at the + * zero page. + */ + update_saved_ttbr0(tsk, &init_mm); +} + static inline void __switch_mm(struct mm_struct *next) { unsigned int cpu = smp_processor_id(); @@ -214,17 +221,16 @@ * Update the saved TTBR0_EL1 of the scheduled-in task as the previous * value may have not been initialised yet (activate_mm caller) or the * ASID has changed since the last run (following the context switch - * of another thread of the same process). Avoid setting the reserved - * TTBR0_EL1 to swapper_pg_dir (init_mm; e.g. via idle_task_exit). + * of another thread of the same process). */ - if (next != &init_mm) - update_saved_ttbr0(tsk, next); + update_saved_ttbr0(tsk, next); } #define deactivate_mm(tsk,mm) do { } while (0) #define activate_mm(prev,next) switch_mm(prev, next, current) void verify_cpu_asid_bits(void); +void post_ttbr_update_workaround(void); #endif /* !__ASSEMBLY__ */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/page-def.h +++ linux-azure-4.13.0/arch/arm64/include/asm/page-def.h @@ -0,0 +1,34 @@ +/* + * Based on arch/arm/include/asm/page.h + * + * Copyright (C) 1995-2003 Russell King + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see <http://www.gnu.org/licenses/>.
+ */ +#ifndef __ASM_PAGE_DEF_H +#define __ASM_PAGE_DEF_H + +#include <linux/const.h> + +/* PAGE_SHIFT determines the page size */ +/* CONT_SHIFT determines the number of pages which can be tracked together */ +#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT +#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT +#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) +#define PAGE_MASK (~(PAGE_SIZE-1)) + +#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) +#define CONT_MASK (~(CONT_SIZE-1)) + +#endif /* __ASM_PAGE_DEF_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/page.h +++ linux-azure-4.13.0/arch/arm64/include/asm/page.h @@ -19,17 +19,7 @@ #ifndef __ASM_PAGE_H #define __ASM_PAGE_H -#include <linux/const.h> - -/* PAGE_SHIFT determines the page size */ -/* CONT_SHIFT determines the number of pages which can be tracked together */ -#define PAGE_SHIFT CONFIG_ARM64_PAGE_SHIFT -#define CONT_SHIFT CONFIG_ARM64_CONT_SHIFT -#define PAGE_SIZE (_AC(1, UL) << PAGE_SHIFT) -#define PAGE_MASK (~(PAGE_SIZE-1)) - -#define CONT_SIZE (_AC(1, UL) << (CONT_SHIFT + PAGE_SHIFT)) -#define CONT_MASK (~(CONT_SIZE-1)) +#include <asm/page-def.h> #ifndef __ASSEMBLY__ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/pgtable-hwdef.h +++ linux-azure-4.13.0/arch/arm64/include/asm/pgtable-hwdef.h @@ -272,6 +272,7 @@ #define TCR_TG1_4K (UL(2) << TCR_TG1_SHIFT) #define TCR_TG1_64K (UL(3) << TCR_TG1_SHIFT) +#define TCR_A1 (UL(1) << 22) #define TCR_ASID16 (UL(1) << 36) #define TCR_TBI0 (UL(1) << 37) #define TCR_HA (UL(1) << 39) --- linux-azure-4.13.0.orig/arch/arm64/include/asm/pgtable-prot.h +++ linux-azure-4.13.0/arch/arm64/include/asm/pgtable-prot.h @@ -34,8 +34,14 @@ #include <linux/const.h> -#define PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) -#define PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) +#define _PROT_DEFAULT (PTE_TYPE_PAGE | PTE_AF | PTE_SHARED) +#define _PROT_SECT_DEFAULT (PMD_TYPE_SECT | PMD_SECT_AF | PMD_SECT_S) + +#define PTE_MAYBE_NG (arm64_kernel_unmapped_at_el0() ? PTE_NG : 0) +#define PMD_MAYBE_NG (arm64_kernel_unmapped_at_el0() ?
PMD_SECT_NG : 0) + +#define PROT_DEFAULT (_PROT_DEFAULT | PTE_MAYBE_NG) +#define PROT_SECT_DEFAULT (_PROT_SECT_DEFAULT | PMD_MAYBE_NG) #define PROT_DEVICE_nGnRnE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRnE)) #define PROT_DEVICE_nGnRE (PROT_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_ATTRINDX(MT_DEVICE_nGnRE)) @@ -47,39 +53,38 @@ #define PROT_SECT_NORMAL (PROT_SECT_DEFAULT | PMD_SECT_PXN | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) #define PROT_SECT_NORMAL_EXEC (PROT_SECT_DEFAULT | PMD_SECT_UXN | PMD_ATTRINDX(MT_NORMAL)) -#define _PAGE_DEFAULT (PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) +#define _PAGE_DEFAULT (_PROT_DEFAULT | PTE_ATTRINDX(MT_NORMAL)) +#define _HYP_PAGE_DEFAULT _PAGE_DEFAULT -#define PAGE_KERNEL __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_WRITE) -#define PAGE_KERNEL_RO __pgprot(_PAGE_DEFAULT | PTE_PXN | PTE_UXN | PTE_DIRTY | PTE_RDONLY) -#define PAGE_KERNEL_ROX __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_RDONLY) -#define PAGE_KERNEL_EXEC __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE) -#define PAGE_KERNEL_EXEC_CONT __pgprot(_PAGE_DEFAULT | PTE_UXN | PTE_DIRTY | PTE_WRITE | PTE_CONT) - -#define PAGE_HYP __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) -#define PAGE_HYP_EXEC __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) -#define PAGE_HYP_RO __pgprot(_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) +#define PAGE_KERNEL __pgprot(PROT_NORMAL) +#define PAGE_KERNEL_RO __pgprot((PROT_NORMAL & ~PTE_WRITE) | PTE_RDONLY) +#define PAGE_KERNEL_ROX __pgprot((PROT_NORMAL & ~(PTE_WRITE | PTE_PXN)) | PTE_RDONLY) +#define PAGE_KERNEL_EXEC __pgprot(PROT_NORMAL & ~PTE_PXN) +#define PAGE_KERNEL_EXEC_CONT __pgprot((PROT_NORMAL & ~PTE_PXN) | PTE_CONT) + +#define PAGE_HYP __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_HYP_XN) +#define PAGE_HYP_EXEC __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY) +#define PAGE_HYP_RO __pgprot(_HYP_PAGE_DEFAULT | PTE_HYP | PTE_RDONLY | PTE_HYP_XN) #define PAGE_HYP_DEVICE __pgprot(PROT_DEVICE_nGnRE | PTE_HYP) -#define PAGE_S2 __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) -#define PAGE_S2_DEVICE __pgprot(PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) +#define PAGE_S2 __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_NORMAL) | PTE_S2_RDONLY) +#define PAGE_S2_DEVICE __pgprot(_PROT_DEFAULT | PTE_S2_MEMATTR(MT_S2_DEVICE_nGnRE) | PTE_S2_RDONLY | PTE_UXN) -#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_PXN | PTE_UXN) +#define PAGE_NONE __pgprot(((_PAGE_DEFAULT) & ~PTE_VALID) | PTE_PROT_NONE | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) #define PAGE_SHARED __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN | PTE_WRITE) #define PAGE_SHARED_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_WRITE) -#define PAGE_COPY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_COPY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN | PTE_UXN) -#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_NG | PTE_PXN) -#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_NG | PTE_PXN) +#define PAGE_READONLY __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN | PTE_UXN) +#define PAGE_READONLY_EXEC __pgprot(_PAGE_DEFAULT | PTE_USER | PTE_RDONLY | PTE_NG | PTE_PXN) +#define PAGE_EXECONLY __pgprot(_PAGE_DEFAULT | PTE_RDONLY | PTE_NG | PTE_PXN) #define __P000 PAGE_NONE #define 
__P001 PAGE_READONLY -#define __P010 PAGE_COPY -#define __P011 PAGE_COPY +#define __P010 PAGE_READONLY +#define __P011 PAGE_READONLY #define __P100 PAGE_EXECONLY #define __P101 PAGE_READONLY_EXEC -#define __P110 PAGE_COPY_EXEC -#define __P111 PAGE_COPY_EXEC +#define __P110 PAGE_READONLY_EXEC +#define __P111 PAGE_READONLY_EXEC #define __S000 PAGE_NONE #define __S001 PAGE_READONLY --- linux-azure-4.13.0.orig/arch/arm64/include/asm/pgtable.h +++ linux-azure-4.13.0/arch/arm64/include/asm/pgtable.h @@ -39,6 +39,7 @@ #ifndef __ASSEMBLY__ +#include <asm/cmpxchg.h> #include <asm/fixmap.h> #include <linux/mmdebug.h> @@ -84,11 +85,7 @@ (__boundary - 1 < (end) - 1) ? __boundary : (end); \ }) -#ifdef CONFIG_ARM64_HW_AFDBM #define pte_hw_dirty(pte) (pte_write(pte) && !(pte_val(pte) & PTE_RDONLY)) -#else -#define pte_hw_dirty(pte) (0) -#endif #define pte_sw_dirty(pte) (!!(pte_val(pte) & PTE_DIRTY)) #define pte_dirty(pte) (pte_sw_dirty(pte) || pte_hw_dirty(pte)) @@ -124,12 +121,16 @@ static inline pte_t pte_wrprotect(pte_t pte) { - return clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkwrite(pte_t pte) { - return set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = set_pte_bit(pte, __pgprot(PTE_WRITE)); + pte = clear_pte_bit(pte, __pgprot(PTE_RDONLY)); + return pte; } static inline pte_t pte_mkclean(pte_t pte) @@ -168,11 +169,6 @@ return clear_pte_bit(pte, __pgprot(PTE_CONT)); } -static inline pte_t pte_clear_rdonly(pte_t pte) -{ - return clear_pte_bit(pte, __pgprot(PTE_RDONLY)); -} - static inline pte_t pte_mkpresent(pte_t pte) { return set_pte_bit(pte, __pgprot(PTE_VALID)); @@ -220,22 +216,15 @@ static inline void set_pte_at(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pte) { - if (pte_present(pte)) { - if (pte_sw_dirty(pte) && pte_write(pte)) - pte_val(pte) &= ~PTE_RDONLY; - else - pte_val(pte) |= PTE_RDONLY; - if (pte_user_exec(pte) && !pte_special(pte)) - __sync_icache_dcache(pte, addr); - } + if (pte_present(pte) && pte_user_exec(pte) && !pte_special(pte)) + __sync_icache_dcache(pte, addr); /* * If the existing pte is valid, check for potential race with * hardware updates of the pte (ptep_set_access_flags safely changes * valid ptes without going through an invalid entry). */ - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && - pte_valid(*ptep) && pte_valid(pte)) { + if (pte_valid(*ptep) && pte_valid(pte)) { VM_WARN_ONCE(!pte_young(pte), "%s: racy access flag clearing: 0x%016llx -> 0x%016llx", __func__, pte_val(*ptep), pte_val(pte)); @@ -412,7 +401,7 @@ /* Find an entry in the third-level page table.
*/ #define pte_index(addr) (((addr) >> PAGE_SHIFT) & (PTRS_PER_PTE - 1)) -#define pte_offset_phys(dir,addr) (pmd_page_paddr(*(dir)) + pte_index(addr) * sizeof(pte_t)) +#define pte_offset_phys(dir,addr) (pmd_page_paddr(READ_ONCE(*(dir))) + pte_index(addr) * sizeof(pte_t)) #define pte_offset_kernel(dir,addr) ((pte_t *)__va(pte_offset_phys((dir), (addr)))) #define pte_offset_map(dir,addr) pte_offset_kernel((dir), (addr)) @@ -571,7 +560,6 @@ return pte_pmd(pte_modify(pmd_pte(pmd), newprot)); } -#ifdef CONFIG_ARM64_HW_AFDBM #define __HAVE_ARCH_PTEP_SET_ACCESS_FLAGS extern int ptep_set_access_flags(struct vm_area_struct *vma, unsigned long address, pte_t *ptep, @@ -593,20 +581,17 @@ #define __HAVE_ARCH_PTEP_TEST_AND_CLEAR_YOUNG static inline int __ptep_test_and_clear_young(pte_t *ptep) { - pteval_t pteval; - unsigned int tmp, res; + pte_t old_pte, pte; - asm volatile("// __ptep_test_and_clear_young\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " ubfx %w3, %w0, %5, #1 // extract PTE_AF (young)\n" - " and %0, %0, %4 // clear PTE_AF\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)), "=&r" (res) - : "L" (~PTE_AF), "I" (ilog2(PTE_AF))); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_mkold(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); - return res; + return pte_young(pte); } static inline int ptep_test_and_clear_young(struct vm_area_struct *vma, @@ -630,17 +615,7 @@ static inline pte_t ptep_get_and_clear(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t old_pteval; - unsigned int tmp; - - asm volatile("// ptep_get_and_clear\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " stxr %w1, xzr, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep))); - - return __pte(old_pteval); + return __pte(xchg_relaxed(&pte_val(*ptep), 0)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -653,27 +628,32 @@ #endif /* CONFIG_TRANSPARENT_HUGEPAGE */ /* - * ptep_set_wrprotect - mark read-only while trasferring potential hardware - * dirty status (PTE_DBM && !PTE_RDONLY) to the software PTE_DIRTY bit. + * ptep_set_wrprotect - mark read-only while preserving the hardware update of + * the Access Flag. */ #define __HAVE_ARCH_PTEP_SET_WRPROTECT static inline void ptep_set_wrprotect(struct mm_struct *mm, unsigned long address, pte_t *ptep) { - pteval_t pteval; - unsigned long tmp; + pte_t old_pte, pte; - asm volatile("// ptep_set_wrprotect\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " tst %0, %4 // check for hw dirty (!PTE_RDONLY)\n" - " csel %1, %3, xzr, eq // set PTE_DIRTY|PTE_RDONLY if dirty\n" - " orr %0, %0, %1 // if !dirty, PTE_RDONLY is already set\n" - " and %0, %0, %5 // clear PTE_WRITE/PTE_DBM\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "r" (PTE_DIRTY|PTE_RDONLY), "L" (PTE_RDONLY), "L" (~PTE_WRITE) - : "cc"); + /* + * ptep_set_wrprotect() is only called on CoW mappings which are + * private (!VM_SHARED) with the pte either read-only (!PTE_WRITE && + * PTE_RDONLY) or writable and software-dirty (PTE_WRITE && + * !PTE_RDONLY && PTE_DIRTY); see is_cow_mapping() and + * protection_map[]. There is no race with the hardware update of the + * dirty state: clearing of PTE_RDONLY when PTE_WRITE (a.k.a. PTE_DBM) + * is set. 
+ */ + VM_WARN_ONCE(pte_write(*ptep) && !pte_dirty(*ptep), + "%s: potential race with hardware DBM", __func__); + pte = READ_ONCE(*ptep); + do { + old_pte = pte; + pte = pte_wrprotect(pte); + pte_val(pte) = cmpxchg_relaxed(&pte_val(*ptep), + pte_val(old_pte), pte_val(pte)); + } while (pte_val(pte) != pte_val(old_pte)); } #ifdef CONFIG_TRANSPARENT_HUGEPAGE @@ -684,10 +664,10 @@ ptep_set_wrprotect(mm, address, (pte_t *)pmdp); } #endif -#endif /* CONFIG_ARM64_HW_AFDBM */ extern pgd_t swapper_pg_dir[PTRS_PER_PGD]; extern pgd_t idmap_pg_dir[PTRS_PER_PGD]; +extern pgd_t tramp_pg_dir[PTRS_PER_PGD]; /* * Encode and decode a swap entry: --- linux-azure-4.13.0.orig/arch/arm64/include/asm/proc-fns.h +++ linux-azure-4.13.0/arch/arm64/include/asm/proc-fns.h @@ -35,12 +35,6 @@ #include -#define cpu_switch_mm(pgd,mm) \ -do { \ - BUG_ON(pgd == swapper_pg_dir); \ - cpu_do_switch_mm(virt_to_phys(pgd),mm); \ -} while (0) - #endif /* __ASSEMBLY__ */ #endif /* __KERNEL__ */ #endif /* __ASM_PROCFNS_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/processor.h +++ linux-azure-4.13.0/arch/arm64/include/asm/processor.h @@ -19,6 +19,13 @@ #ifndef __ASM_PROCESSOR_H #define __ASM_PROCESSOR_H +#define TASK_SIZE_64 (UL(1) << VA_BITS) + +#define KERNEL_DS UL(-1) +#define USER_DS (TASK_SIZE_64 - 1) + +#ifndef __ASSEMBLY__ + /* * Default implementation of macro that returns current * instruction pointer ("program counter"). @@ -37,6 +44,22 @@ #include #include +/* + * TASK_SIZE - the maximum size of a user space task. + * TASK_UNMAPPED_BASE - the lower boundary of the mmap VM area. + */ +#ifdef CONFIG_COMPAT +#define TASK_SIZE_32 UL(0x100000000) +#define TASK_SIZE (test_thread_flag(TIF_32BIT) ? \ + TASK_SIZE_32 : TASK_SIZE_64) +#define TASK_SIZE_OF(tsk) (test_tsk_thread_flag(tsk, TIF_32BIT) ? 
\ + TASK_SIZE_32 : TASK_SIZE_64) +#else +#define TASK_SIZE TASK_SIZE_64 +#endif /* CONFIG_COMPAT */ + +#define TASK_UNMAPPED_BASE (PAGE_ALIGN(TASK_SIZE / 4)) + #define STACK_TOP_MAX TASK_SIZE_64 #ifdef CONFIG_COMPAT #define AARCH32_VECTORS_BASE 0xffff0000 @@ -112,7 +135,7 @@ static inline void start_thread_common(struct pt_regs *regs, unsigned long pc) { memset(regs, 0, sizeof(*regs)); - regs->syscallno = ~0UL; + regs->syscallno = ~0; regs->pc = pc; } @@ -159,7 +182,7 @@ struct task_struct *next); #define task_pt_regs(p) \ - ((struct pt_regs *)(THREAD_START_SP + task_stack_page(p)) - 1) + ((struct pt_regs *)(THREAD_SIZE + task_stack_page(p)) - 1) #define KSTK_EIP(tsk) ((unsigned long)task_pt_regs(tsk)->pc) #define KSTK_ESP(tsk) user_stack_pointer(task_pt_regs(tsk)) @@ -194,4 +217,5 @@ int cpu_enable_pan(void *__unused); int cpu_enable_cache_maint_trap(void *__unused); +#endif /* __ASSEMBLY__ */ #endif /* __ASM_PROCESSOR_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/ptrace.h +++ linux-azure-4.13.0/arch/arm64/include/asm/ptrace.h @@ -116,9 +116,17 @@ }; }; u64 orig_x0; - u64 syscallno; +#ifdef __AARCH64EB__ + u32 unused2; + s32 syscallno; +#else + s32 syscallno; + u32 unused2; +#endif + u64 orig_addr_limit; u64 unused; // maintain 16 byte alignment + u64 stackframe[2]; }; #define MAX_REG_OFFSET offsetof(struct pt_regs, pstate) --- linux-azure-4.13.0.orig/arch/arm64/include/asm/signal32.h +++ linux-azure-4.13.0/arch/arm64/include/asm/signal32.h @@ -22,8 +22,6 @@ #define AARCH32_KERN_SIGRET_CODE_OFFSET 0x500 -extern const compat_ulong_t aarch32_sigret_code[6]; - int compat_setup_frame(int usig, struct ksignal *ksig, sigset_t *set, struct pt_regs *regs); int compat_setup_rt_frame(int usig, struct ksignal *ksig, sigset_t *set, --- linux-azure-4.13.0.orig/arch/arm64/include/asm/spinlock.h +++ linux-azure-4.13.0/arch/arm64/include/asm/spinlock.h @@ -187,169 +187,7 @@ } #define arch_spin_is_contended arch_spin_is_contended -/* - * Write lock implementation. - * - * Write locks set bit 31. Unlocking, is done by writing 0 since the lock is - * exclusively held. - * - * The memory barriers are implicit with the load-acquire and store-release - * instructions. - */ - -static inline void arch_write_lock(arch_rwlock_t *rw) -{ - unsigned int tmp; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - " sevl\n" - "1: wfe\n" - "2: ldaxr %w0, %1\n" - " cbnz %w0, 1b\n" - " stxr %w0, %w2, %1\n" - " cbnz %w0, 2b\n" - __nops(1), - /* LSE atomics */ - "1: mov %w0, wzr\n" - "2: casa %w0, %w2, %1\n" - " cbz %w0, 3f\n" - " ldxr %w0, %1\n" - " cbz %w0, 2b\n" - " wfe\n" - " b 1b\n" - "3:") - : "=&r" (tmp), "+Q" (rw->lock) - : "r" (0x80000000) - : "memory"); -} - -static inline int arch_write_trylock(arch_rwlock_t *rw) -{ - unsigned int tmp; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - "1: ldaxr %w0, %1\n" - " cbnz %w0, 2f\n" - " stxr %w0, %w2, %1\n" - " cbnz %w0, 1b\n" - "2:", - /* LSE atomics */ - " mov %w0, wzr\n" - " casa %w0, %w2, %1\n" - __nops(2)) - : "=&r" (tmp), "+Q" (rw->lock) - : "r" (0x80000000) - : "memory"); - - return !tmp; -} - -static inline void arch_write_unlock(arch_rwlock_t *rw) -{ - asm volatile(ARM64_LSE_ATOMIC_INSN( - " stlr wzr, %0", - " swpl wzr, wzr, %0") - : "=Q" (rw->lock) :: "memory"); -} - -/* write_can_lock - would write_trylock() succeed? */ -#define arch_write_can_lock(x) ((x)->lock == 0) - -/* - * Read lock implementation. 
- * - * It exclusively loads the lock value, increments it and stores the new value - * back if positive and the CPU still exclusively owns the location. If the - * value is negative, the lock is already held. - * - * During unlocking there may be multiple active read locks but no write lock. - * - * The memory barriers are implicit with the load-acquire and store-release - * instructions. - * - * Note that in UNDEFINED cases, such as unlocking a lock twice, the LL/SC - * and LSE implementations may exhibit different behaviour (although this - * will have no effect on lockdep). - */ -static inline void arch_read_lock(arch_rwlock_t *rw) -{ - unsigned int tmp, tmp2; - - asm volatile( - " sevl\n" - ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - "1: wfe\n" - "2: ldaxr %w0, %2\n" - " add %w0, %w0, #1\n" - " tbnz %w0, #31, 1b\n" - " stxr %w1, %w0, %2\n" - " cbnz %w1, 2b\n" - __nops(1), - /* LSE atomics */ - "1: wfe\n" - "2: ldxr %w0, %2\n" - " adds %w1, %w0, #1\n" - " tbnz %w1, #31, 1b\n" - " casa %w0, %w1, %2\n" - " sbc %w0, %w1, %w0\n" - " cbnz %w0, 2b") - : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) - : - : "cc", "memory"); -} - -static inline void arch_read_unlock(arch_rwlock_t *rw) -{ - unsigned int tmp, tmp2; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - "1: ldxr %w0, %2\n" - " sub %w0, %w0, #1\n" - " stlxr %w1, %w0, %2\n" - " cbnz %w1, 1b", - /* LSE atomics */ - " movn %w0, #0\n" - " staddl %w0, %2\n" - __nops(2)) - : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) - : - : "memory"); -} - -static inline int arch_read_trylock(arch_rwlock_t *rw) -{ - unsigned int tmp, tmp2; - - asm volatile(ARM64_LSE_ATOMIC_INSN( - /* LL/SC */ - " mov %w1, #1\n" - "1: ldaxr %w0, %2\n" - " add %w0, %w0, #1\n" - " tbnz %w0, #31, 2f\n" - " stxr %w1, %w0, %2\n" - " cbnz %w1, 1b\n" - "2:", - /* LSE atomics */ - " ldr %w0, %2\n" - " adds %w1, %w0, #1\n" - " tbnz %w1, #31, 1f\n" - " casa %w0, %w1, %2\n" - " sbc %w1, %w1, %w0\n" - __nops(1) - "1:") - : "=&r" (tmp), "=&r" (tmp2), "+Q" (rw->lock) - : - : "cc", "memory"); - - return !tmp2; -} - -/* read_can_lock - would read_trylock() succeed? 
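The rwlock assembly being removed in this hunk encodes a simple protocol: bit 31 is the writer bit, bits 0-30 count readers. A roughly equivalent C11 sketch of the reader fast path (the real LL/SC and LSE sequences additionally encode acquire/release ordering and WFE-based waiting):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

#define WRITER_BIT 0x80000000u

static bool read_trylock(_Atomic uint32_t *lock)
{
	uint32_t old = atomic_load_explicit(lock, memory_order_relaxed);

	do {
		if (old & WRITER_BIT)	/* "negative": a writer holds it */
			return false;
	} while (!atomic_compare_exchange_weak_explicit(lock, &old, old + 1,
			memory_order_acquire, memory_order_relaxed));
	return true;
}

All of this hand-rolled code is replaced by the generic queued rwlock implementation, pulled in by the include in the next hunk.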
*/ -#define arch_read_can_lock(x) ((x)->lock < 0x80000000) +#include #define arch_read_lock_flags(lock, flags) arch_read_lock(lock) #define arch_write_lock_flags(lock, flags) arch_write_lock(lock) --- linux-azure-4.13.0.orig/arch/arm64/include/asm/spinlock_types.h +++ linux-azure-4.13.0/arch/arm64/include/asm/spinlock_types.h @@ -36,10 +36,6 @@ #define __ARCH_SPIN_LOCK_UNLOCKED { 0 , 0 } -typedef struct { - volatile unsigned int lock; -} arch_rwlock_t; - -#define __ARCH_RW_LOCK_UNLOCKED { 0 } +#include #endif --- linux-azure-4.13.0.orig/arch/arm64/include/asm/stacktrace.h +++ linux-azure-4.13.0/arch/arm64/include/asm/stacktrace.h @@ -16,11 +16,15 @@ #ifndef __ASM_STACKTRACE_H #define __ASM_STACKTRACE_H -struct task_struct; +#include +#include +#include + +#include +#include struct stackframe { unsigned long fp; - unsigned long sp; unsigned long pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER unsigned int graph; @@ -32,4 +36,57 @@ int (*fn)(struct stackframe *, void *), void *data); extern void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk); +DECLARE_PER_CPU(unsigned long *, irq_stack_ptr); + +static inline bool on_irq_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_read(irq_stack_ptr); + unsigned long high = low + IRQ_STACK_SIZE; + + if (!low) + return false; + + return (low <= sp && sp < high); +} + +static inline bool on_task_stack(struct task_struct *tsk, unsigned long sp) +{ + unsigned long low = (unsigned long)task_stack_page(tsk); + unsigned long high = low + THREAD_SIZE; + + return (low <= sp && sp < high); +} + +#ifdef CONFIG_VMAP_STACK +DECLARE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack); + +static inline bool on_overflow_stack(unsigned long sp) +{ + unsigned long low = (unsigned long)raw_cpu_ptr(overflow_stack); + unsigned long high = low + OVERFLOW_STACK_SIZE; + + return (low <= sp && sp < high); +} +#else +static inline bool on_overflow_stack(unsigned long sp) { return false; } +#endif + +/* + * We can only safely access per-cpu stacks from current in a non-preemptible + * context. 
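The stack tests above all share one shape: a half-open [low, low + size) range check, plus a NULL guard where the stack may not exist yet. The pattern condensed (sketch only; the kernel open-codes it per stack type):

#include <stdbool.h>
#include <stddef.h>

static bool on_stack(unsigned long sp, unsigned long low, size_t size)
{
	return low && low <= sp && sp < low + size;
}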
+ */ +static inline bool on_accessible_stack(struct task_struct *tsk, unsigned long sp) +{ + if (on_task_stack(tsk, sp)) + return true; + if (tsk != current || preemptible()) + return false; + if (on_irq_stack(sp)) + return true; + if (on_overflow_stack(sp)) + return true; + + return false; +} + #endif /* __ASM_STACKTRACE_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/string.h +++ linux-azure-4.13.0/arch/arm64/include/asm/string.h @@ -52,6 +52,10 @@ #define __HAVE_ARCH_MEMCMP extern int memcmp(const void *, const void *, size_t); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +#define __HAVE_ARCH_MEMCPY_FLUSHCACHE +void memcpy_flushcache(void *dst, const void *src, size_t cnt); +#endif #if defined(CONFIG_KASAN) && !defined(__SANITIZE_ADDRESS__) --- linux-azure-4.13.0.orig/arch/arm64/include/asm/sysreg.h +++ linux-azure-4.13.0/arch/arm64/include/asm/sysreg.h @@ -329,8 +329,11 @@ #define ID_AA64ISAR1_LRCPC_SHIFT 20 #define ID_AA64ISAR1_FCMA_SHIFT 16 #define ID_AA64ISAR1_JSCVT_SHIFT 12 +#define ID_AA64ISAR1_DPB_SHIFT 0 /* id_aa64pfr0 */ +#define ID_AA64PFR0_CSV3_SHIFT 60 +#define ID_AA64PFR0_CSV2_SHIFT 56 #define ID_AA64PFR0_GIC_SHIFT 24 #define ID_AA64PFR0_ASIMD_SHIFT 20 #define ID_AA64PFR0_FP_SHIFT 16 --- linux-azure-4.13.0.orig/arch/arm64/include/asm/thread_info.h +++ linux-azure-4.13.0/arch/arm64/include/asm/thread_info.h @@ -23,19 +23,11 @@ #include -#ifdef CONFIG_ARM64_4K_PAGES -#define THREAD_SIZE_ORDER 2 -#elif defined(CONFIG_ARM64_16K_PAGES) -#define THREAD_SIZE_ORDER 0 -#endif - -#define THREAD_SIZE 16384 -#define THREAD_START_SP (THREAD_SIZE - 16) - #ifndef __ASSEMBLY__ struct task_struct; +#include #include #include @@ -68,6 +60,9 @@ #define thread_saved_fp(tsk) \ ((unsigned long)(tsk->thread.cpu_context.fp)) +void arch_setup_new_exec(void); +#define arch_setup_new_exec arch_setup_new_exec + #endif /* @@ -86,6 +81,7 @@ #define TIF_NOTIFY_RESUME 2 /* callback before returning to user */ #define TIF_FOREIGN_FPSTATE 3 /* CPU's FP state is not current's */ #define TIF_UPROBE 4 /* uprobe breakpoint or singlestep */ +#define TIF_FSCHECK 5 /* Check FS is USER_DS on return */ #define TIF_NOHZ 7 #define TIF_SYSCALL_TRACE 8 #define TIF_SYSCALL_AUDIT 9 @@ -107,11 +103,12 @@ #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_SECCOMP (1 << TIF_SECCOMP) #define _TIF_UPROBE (1 << TIF_UPROBE) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) #define _TIF_32BIT (1 << TIF_32BIT) #define _TIF_WORK_MASK (_TIF_NEED_RESCHED | _TIF_SIGPENDING | \ _TIF_NOTIFY_RESUME | _TIF_FOREIGN_FPSTATE | \ - _TIF_UPROBE) + _TIF_UPROBE | _TIF_FSCHECK) #define _TIF_SYSCALL_WORK (_TIF_SYSCALL_TRACE | _TIF_SYSCALL_AUDIT | \ _TIF_SYSCALL_TRACEPOINT | _TIF_SECCOMP | \ --- linux-azure-4.13.0.orig/arch/arm64/include/asm/tlbflush.h +++ linux-azure-4.13.0/arch/arm64/include/asm/tlbflush.h @@ -23,6 +23,7 @@ #include #include +#include /* * Raw TLBI operations. @@ -54,6 +55,11 @@ #define __tlbi(op, ...) 
__TLBI_N(op, ##__VA_ARGS__, 1, 0) +#define __tlbi_user(op, arg) do { \ + if (arm64_kernel_unmapped_at_el0()) \ + __tlbi(op, (arg) | USER_ASID_FLAG); \ +} while (0) + /* * TLB Management * ============== @@ -115,6 +121,7 @@ dsb(ishst); __tlbi(aside1is, asid); + __tlbi_user(aside1is, asid); dsb(ish); } @@ -125,6 +132,7 @@ dsb(ishst); __tlbi(vale1is, addr); + __tlbi_user(vale1is, addr); dsb(ish); } @@ -151,10 +159,13 @@ dsb(ishst); for (addr = start; addr < end; addr += 1 << (PAGE_SHIFT - 12)) { - if (last_level) + if (last_level) { __tlbi(vale1is, addr); - else + __tlbi_user(vale1is, addr); + } else { __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); + } } dsb(ish); } @@ -194,6 +205,7 @@ unsigned long addr = uaddr >> 12 | (ASID(mm) << 48); __tlbi(vae1is, addr); + __tlbi_user(vae1is, addr); dsb(ish); } --- linux-azure-4.13.0.orig/arch/arm64/include/asm/traps.h +++ linux-azure-4.13.0/arch/arm64/include/asm/traps.h @@ -37,18 +37,11 @@ void arm64_notify_segfault(struct pt_regs *regs, unsigned long addr); -#ifdef CONFIG_FUNCTION_GRAPH_TRACER static inline int __in_irqentry_text(unsigned long ptr) { return ptr >= (unsigned long)&__irqentry_text_start && ptr < (unsigned long)&__irqentry_text_end; } -#else -static inline int __in_irqentry_text(unsigned long ptr) -{ - return 0; -} -#endif static inline int in_exception_text(unsigned long ptr) { @@ -60,4 +53,9 @@ return in ? : __in_irqentry_text(ptr); } +static inline int in_entry_text(unsigned long ptr) +{ + return ptr >= (unsigned long)&__entry_text_start && + ptr < (unsigned long)&__entry_text_end; +} #endif --- linux-azure-4.13.0.orig/arch/arm64/include/asm/uaccess.h +++ linux-azure-4.13.0/arch/arm64/include/asm/uaccess.h @@ -35,10 +35,7 @@ #include #include -#define KERNEL_DS (-1UL) #define get_ds() (KERNEL_DS) - -#define USER_DS TASK_SIZE_64 #define get_fs() (current_thread_info()->addr_limit) static inline void set_fs(mm_segment_t fs) @@ -46,6 +43,16 @@ current_thread_info()->addr_limit = fs; /* + * Prevent a mispredicted conditional call to set_fs from forwarding + * the wrong address limit to access_ok under speculation. + */ + dsb(nsh); + isb(); + + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); + + /* * Enable/disable UAO so that copy_to_user() etc can access * kernel memory with the unprivileged instructions. */ @@ -63,22 +70,32 @@ * Returns 1 if the range is valid, 0 otherwise. * * This is equivalent to the following test: - * (u65)addr + (u65)size <= current->addr_limit - * - * This needs 65-bit arithmetic. + * (u65)addr + (u65)size <= (u65)current->addr_limit + 1 */ -#define __range_ok(addr, size) \ -({ \ - unsigned long __addr = (unsigned long)(addr); \ - unsigned long flag, roksum; \ - __chk_user_ptr(addr); \ - asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls" \ - : "=&r" (flag), "=&r" (roksum) \ - : "1" (__addr), "Ir" (size), \ - "r" (current_thread_info()->addr_limit) \ - : "cc"); \ - flag; \ -}) +static inline unsigned long __range_ok(unsigned long addr, unsigned long size) +{ + unsigned long limit = current_thread_info()->addr_limit; + + __chk_user_ptr(addr); + asm volatile( + // A + B <= C + 1 for all A,B,C, in four easy steps: + // 1: X = A + B; X' = X % 2^64 + " adds %0, %0, %2\n" + // 2: Set C = 0 if X > 2^64, to guarantee X' > C in step 4 + " csel %1, xzr, %1, hi\n" + // 3: Set X' = ~0 if X >= 2^64. For X == 2^64, this decrements X' + // to compensate for the carry flag being set in step 4. For + // X > 2^64, X' merely has to remain nonzero, which it does. 
+ " csinv %0, %0, xzr, cc\n" + // 4: For X < 2^64, this gives us X' - C - 1 <= 0, where the -1 + // comes from the carry in being clear. Otherwise, we are + // testing X' - C == 0, subject to the previous adjustments. + " sbcs xzr, %0, %1\n" + " cset %0, ls\n" + : "+r" (addr), "+r" (limit) : "Ir" (size) : "cc"); + + return addr; +} /* * When dealing with data aborts, watchpoints, or instruction traps we may end @@ -87,7 +104,7 @@ */ #define untagged_addr(addr) sign_extend64(addr, 55) -#define access_ok(type, addr, size) __range_ok(addr, size) +#define access_ok(type, addr, size) __range_ok((unsigned long)(addr), size) #define user_addr_max get_fs #define _ASM_EXTABLE(from, to) \ @@ -102,17 +119,23 @@ #ifdef CONFIG_ARM64_SW_TTBR0_PAN static inline void __uaccess_ttbr0_disable(void) { - unsigned long ttbr; + unsigned long flags, ttbr; + local_irq_save(flags); + ttbr = read_sysreg(ttbr1_el1); + ttbr &= ~TTBR_ASID_MASK; /* reserved_ttbr0 placed at the end of swapper_pg_dir */ - ttbr = read_sysreg(ttbr1_el1) + SWAPPER_DIR_SIZE; - write_sysreg(ttbr, ttbr0_el1); + write_sysreg(ttbr + SWAPPER_DIR_SIZE, ttbr0_el1); isb(); + /* Set reserved ASID */ + write_sysreg(ttbr, ttbr1_el1); + isb(); + local_irq_restore(flags); } static inline void __uaccess_ttbr0_enable(void) { - unsigned long flags; + unsigned long flags, ttbr0, ttbr1; /* * Disable interrupts to avoid preemption between reading the 'ttbr0' @@ -120,7 +143,17 @@ * roll-over and an update of 'ttbr0'. */ local_irq_save(flags); - write_sysreg(current_thread_info()->ttbr0, ttbr0_el1); + ttbr0 = READ_ONCE(current_thread_info()->ttbr0); + + /* Restore active ASID */ + ttbr1 = read_sysreg(ttbr1_el1); + ttbr1 &= ~TTBR_ASID_MASK; /* safety measure */ + ttbr1 |= ttbr0 & TTBR_ASID_MASK; + write_sysreg(ttbr1, ttbr1_el1); + isb(); + + /* Restore user page table */ + write_sysreg(ttbr0, ttbr0_el1); isb(); local_irq_restore(flags); } @@ -190,6 +223,26 @@ } /* + * Sanitise a uaccess pointer such that it becomes NULL if above the + * current addr_limit. + */ +#define uaccess_mask_ptr(ptr) (__typeof__(ptr))__uaccess_mask_ptr(ptr) +static inline void __user *__uaccess_mask_ptr(const void __user *ptr) +{ + void __user *safe_ptr; + + asm volatile( + " bics xzr, %1, %2\n" + " csel %0, %1, xzr, eq\n" + : "=&r" (safe_ptr) + : "r" (ptr), "r" (current_thread_info()->addr_limit) + : "cc"); + + csdb(); + return safe_ptr; +} + +/* * The "__xxx" versions of the user access functions do not verify the address * space - it must have been done previously with a separate "access_ok()" * call. @@ -241,28 +294,33 @@ (x) = (__force __typeof__(*(ptr)))__gu_val; \ } while (0) -#define __get_user(x, ptr) \ +#define __get_user_check(x, ptr, err) \ ({ \ - int __gu_err = 0; \ - __get_user_err((x), (ptr), __gu_err); \ - __gu_err; \ + __typeof__(*(ptr)) __user *__p = (ptr); \ + might_fault(); \ + if (access_ok(VERIFY_READ, __p, sizeof(*__p))) { \ + __p = uaccess_mask_ptr(__p); \ + __get_user_err((x), __p, (err)); \ + } else { \ + (x) = 0; (err) = -EFAULT; \ + } \ }) #define __get_user_error(x, ptr, err) \ ({ \ - __get_user_err((x), (ptr), (err)); \ + __get_user_check((x), (ptr), (err)); \ (void)0; \ }) -#define get_user(x, ptr) \ +#define __get_user(x, ptr) \ ({ \ - __typeof__(*(ptr)) __user *__p = (ptr); \ - might_fault(); \ - access_ok(VERIFY_READ, __p, sizeof(*__p)) ? 
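Two hunks above repay a closer look. __range_ok() evaluates (u65)addr + (u65)size <= (u65)limit + 1 without a branch, and __uaccess_mask_ptr() turns any pointer above addr_limit into NULL so a mispredicted access_ok() cannot feed a bad address to a speculated load. Portable sketches of both, assuming GCC/Clang 128-bit integers and a (2^n - 1)-style limit such as USER_DS = TASK_SIZE_64 - 1:

#include <stdbool.h>
#include <stdint.h>

static bool range_ok(uint64_t addr, uint64_t size, uint64_t limit)
{
	/* 65-bit arithmetic via a 128-bit intermediate */
	return (unsigned __int128)addr + size <= (unsigned __int128)limit + 1;
}

static const void *mask_ptr(const void *ptr, uint64_t limit)
{
	/* ptr & ~limit is non-zero iff ptr lies above the all-ones limit */
	return ((uintptr_t)ptr & ~limit) ? NULL : ptr;
}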
\ - __get_user((x), __p) : \ - ((x) = 0, -EFAULT); \ + int __gu_err = 0; \ + __get_user_check((x), (ptr), __gu_err); \ + __gu_err; \ }) +#define get_user __get_user + #define __put_user_asm(instr, alt_instr, reg, x, addr, err, feature) \ asm volatile( \ "1:"ALTERNATIVE(instr " " reg "1, [%2]\n", \ @@ -305,46 +363,78 @@ uaccess_disable_not_uao(); \ } while (0) -#define __put_user(x, ptr) \ +#define __put_user_check(x, ptr, err) \ ({ \ - int __pu_err = 0; \ - __put_user_err((x), (ptr), __pu_err); \ - __pu_err; \ + __typeof__(*(ptr)) __user *__p = (ptr); \ + might_fault(); \ + if (access_ok(VERIFY_WRITE, __p, sizeof(*__p))) { \ + __p = uaccess_mask_ptr(__p); \ + __put_user_err((x), __p, (err)); \ + } else { \ + (err) = -EFAULT; \ + } \ }) #define __put_user_error(x, ptr, err) \ ({ \ - __put_user_err((x), (ptr), (err)); \ + __put_user_check((x), (ptr), (err)); \ (void)0; \ }) -#define put_user(x, ptr) \ +#define __put_user(x, ptr) \ ({ \ - __typeof__(*(ptr)) __user *__p = (ptr); \ - might_fault(); \ - access_ok(VERIFY_WRITE, __p, sizeof(*__p)) ? \ - __put_user((x), __p) : \ - -EFAULT; \ + int __pu_err = 0; \ + __put_user_check((x), (ptr), __pu_err); \ + __pu_err; \ }) +#define put_user __put_user + extern unsigned long __must_check __arch_copy_from_user(void *to, const void __user *from, unsigned long n); -#define raw_copy_from_user __arch_copy_from_user +#define raw_copy_from_user(to, from, n) \ +({ \ + __arch_copy_from_user((to), __uaccess_mask_ptr(from), (n)); \ +}) + extern unsigned long __must_check __arch_copy_to_user(void __user *to, const void *from, unsigned long n); -#define raw_copy_to_user __arch_copy_to_user -extern unsigned long __must_check raw_copy_in_user(void __user *to, const void __user *from, unsigned long n); -extern unsigned long __must_check __clear_user(void __user *addr, unsigned long n); +#define raw_copy_to_user(to, from, n) \ +({ \ + __arch_copy_to_user(__uaccess_mask_ptr(to), (from), (n)); \ +}) + +extern unsigned long __must_check __arch_copy_in_user(void __user *to, const void __user *from, unsigned long n); +#define raw_copy_in_user(to, from, n) \ +({ \ + __arch_copy_in_user(__uaccess_mask_ptr(to), \ + __uaccess_mask_ptr(from), (n)); \ +}) + #define INLINE_COPY_TO_USER #define INLINE_COPY_FROM_USER -static inline unsigned long __must_check clear_user(void __user *to, unsigned long n) +extern unsigned long __must_check __arch_clear_user(void __user *to, unsigned long n); +static inline unsigned long __must_check __clear_user(void __user *to, unsigned long n) { if (access_ok(VERIFY_WRITE, to, n)) - n = __clear_user(to, n); + n = __arch_clear_user(__uaccess_mask_ptr(to), n); return n; } +#define clear_user __clear_user extern long strncpy_from_user(char *dest, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +#ifdef CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE +struct page; +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, size_t len); +extern unsigned long __must_check __copy_user_flushcache(void *to, const void __user *from, unsigned long n); + +static inline int __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size) +{ + kasan_check_write(dst, size); + return __copy_user_flushcache(dst, __uaccess_mask_ptr(src), size); +} +#endif + #endif /* __ASM_UACCESS_H */ --- linux-azure-4.13.0.orig/arch/arm64/include/uapi/asm/hwcap.h +++ linux-azure-4.13.0/arch/arm64/include/uapi/asm/hwcap.h @@ -35,5 +35,6 @@ #define HWCAP_JSCVT (1 << 13) #define HWCAP_FCMA (1 << 14) #define 
HWCAP_LRCPC (1 << 15) +#define HWCAP_DCPOP (1 << 16) #endif /* _UAPI__ASM_HWCAP_H */ --- linux-azure-4.13.0.orig/arch/arm64/kernel/Makefile +++ linux-azure-4.13.0/arch/arm64/kernel/Makefile @@ -54,6 +54,10 @@ arm64-reloc-test-y := reloc_test_core.o reloc_test_syms.o arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o +ifeq ($(CONFIG_KVM),y) +arm64-obj-$(CONFIG_HARDEN_BRANCH_PREDICTOR) += bpi.o +endif + obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) head-y := head.o --- linux-azure-4.13.0.orig/arch/arm64/kernel/arm64ksyms.c +++ linux-azure-4.13.0/arch/arm64/kernel/arm64ksyms.c @@ -37,8 +37,8 @@ /* user mem (segment) */ EXPORT_SYMBOL(__arch_copy_from_user); EXPORT_SYMBOL(__arch_copy_to_user); -EXPORT_SYMBOL(__clear_user); -EXPORT_SYMBOL(raw_copy_in_user); +EXPORT_SYMBOL(__arch_clear_user); +EXPORT_SYMBOL(__arch_copy_in_user); /* physical memory */ EXPORT_SYMBOL(memstart_addr); --- linux-azure-4.13.0.orig/arch/arm64/kernel/armv8_deprecated.c +++ linux-azure-4.13.0/arch/arm64/kernel/armv8_deprecated.c @@ -649,4 +649,4 @@ return 0; } -late_initcall(armv8_deprecated_init); +core_initcall(armv8_deprecated_init); --- linux-azure-4.13.0.orig/arch/arm64/kernel/asm-offsets.c +++ linux-azure-4.13.0/arch/arm64/kernel/asm-offsets.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -75,6 +76,7 @@ DEFINE(S_ORIG_X0, offsetof(struct pt_regs, orig_x0)); DEFINE(S_SYSCALLNO, offsetof(struct pt_regs, syscallno)); DEFINE(S_ORIG_ADDR_LIMIT, offsetof(struct pt_regs, orig_addr_limit)); + DEFINE(S_STACKFRAME, offsetof(struct pt_regs, stackframe)); DEFINE(S_FRAME_SIZE, sizeof(struct pt_regs)); BLANK(); DEFINE(MM_CONTEXT_ID, offsetof(struct mm_struct, context.id.counter)); @@ -147,11 +149,14 @@ DEFINE(ARM_SMCCC_RES_X2_OFFS, offsetof(struct arm_smccc_res, a2)); DEFINE(ARM_SMCCC_QUIRK_ID_OFFS, offsetof(struct arm_smccc_quirk, id)); DEFINE(ARM_SMCCC_QUIRK_STATE_OFFS, offsetof(struct arm_smccc_quirk, state)); - BLANK(); DEFINE(HIBERN_PBE_ORIG, offsetof(struct pbe, orig_address)); DEFINE(HIBERN_PBE_ADDR, offsetof(struct pbe, address)); DEFINE(HIBERN_PBE_NEXT, offsetof(struct pbe, next)); DEFINE(ARM64_FTR_SYSVAL, offsetof(struct arm64_ftr_reg, sys_val)); + BLANK(); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + DEFINE(TRAMP_VALIAS, TRAMP_VALIAS); +#endif return 0; } --- linux-azure-4.13.0.orig/arch/arm64/kernel/bpi.S +++ linux-azure-4.13.0/arch/arm64/kernel/bpi.S @@ -0,0 +1,83 @@ +/* + * Contains CPU specific branch predictor invalidation sequences + * + * Copyright (C) 2018 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . 
+ */ + +#include +#include + +.macro ventry target + .rept 31 + nop + .endr + b \target +.endm + +.macro vectors target + ventry \target + 0x000 + ventry \target + 0x080 + ventry \target + 0x100 + ventry \target + 0x180 + + ventry \target + 0x200 + ventry \target + 0x280 + ventry \target + 0x300 + ventry \target + 0x380 + + ventry \target + 0x400 + ventry \target + 0x480 + ventry \target + 0x500 + ventry \target + 0x580 + + ventry \target + 0x600 + ventry \target + 0x680 + ventry \target + 0x700 + ventry \target + 0x780 +.endm + + .align 11 +ENTRY(__bp_harden_hyp_vecs_start) + .rept 4 + vectors __kvm_hyp_vector + .endr +ENTRY(__bp_harden_hyp_vecs_end) + +ENTRY(__qcom_hyp_sanitize_link_stack_start) + stp x29, x30, [sp, #-16]! + .rept 16 + bl . + 4 + .endr + ldp x29, x30, [sp], #16 +ENTRY(__qcom_hyp_sanitize_link_stack_end) + +.macro smccc_workaround_1 inst + sub sp, sp, #(8 * 4) + stp x2, x3, [sp, #(8 * 0)] + stp x0, x1, [sp, #(8 * 2)] + mov w0, #ARM_SMCCC_ARCH_WORKAROUND_1 + \inst #0 + ldp x2, x3, [sp, #(8 * 0)] + ldp x0, x1, [sp, #(8 * 2)] + add sp, sp, #(8 * 4) +.endm + +ENTRY(__smccc_workaround_1_smc_start) + smccc_workaround_1 smc +ENTRY(__smccc_workaround_1_smc_end) + +ENTRY(__smccc_workaround_1_hvc_start) + smccc_workaround_1 hvc +ENTRY(__smccc_workaround_1_hvc_end) --- linux-azure-4.13.0.orig/arch/arm64/kernel/cpu-reset.S +++ linux-azure-4.13.0/arch/arm64/kernel/cpu-reset.S @@ -16,7 +16,7 @@ #include .text -.pushsection .idmap.text, "ax" +.pushsection .idmap.text, "awx" /* * __cpu_soft_restart(el2_switch, entry, arg0, arg1, arg2) - Helper for @@ -37,6 +37,7 @@ mrs x12, sctlr_el1 ldr x13, =SCTLR_ELx_FLAGS bic x12, x12, x13 + pre_disable_mmu_workaround msr sctlr_el1, x12 isb --- linux-azure-4.13.0.orig/arch/arm64/kernel/cpu_errata.c +++ linux-azure-4.13.0/arch/arm64/kernel/cpu_errata.c @@ -30,6 +30,20 @@ entry->midr_range_max); } +static bool __maybe_unused +is_kryo_midr(const struct arm64_cpu_capabilities *entry, int scope) +{ + u32 model; + + WARN_ON(scope != SCOPE_LOCAL_CPU || preemptible()); + + model = read_cpuid_id(); + model &= MIDR_IMPLEMENTOR_MASK | (0xf00 << MIDR_PARTNUM_SHIFT) | + MIDR_ARCHITECTURE_MASK; + + return model == entry->midr_model; +} + static bool has_mismatched_cache_line_size(const struct arm64_cpu_capabilities *entry, int scope) @@ -46,6 +60,174 @@ return 0; } +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR +#include +#include + +DEFINE_PER_CPU_READ_MOSTLY(struct bp_hardening_data, bp_hardening_data); + +#ifdef CONFIG_KVM +extern char __qcom_hyp_sanitize_link_stack_start[]; +extern char __qcom_hyp_sanitize_link_stack_end[]; +extern char __smccc_workaround_1_smc_start[]; +extern char __smccc_workaround_1_smc_end[]; +extern char __smccc_workaround_1_hvc_start[]; +extern char __smccc_workaround_1_hvc_end[]; + +static void __copy_hyp_vect_bpi(int slot, const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + void *dst = lm_alias(__bp_harden_hyp_vecs_start + slot * SZ_2K); + int i; + + for (i = 0; i < SZ_2K; i += 0x80) + memcpy(dst + i, hyp_vecs_start, hyp_vecs_end - hyp_vecs_start); + + flush_icache_range((uintptr_t)dst, (uintptr_t)dst + SZ_2K); +} + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + static int last_slot = -1; + static DEFINE_SPINLOCK(bp_lock); + int cpu, slot = -1; + + spin_lock(&bp_lock); + for_each_possible_cpu(cpu) { + if (per_cpu(bp_hardening_data.fn, cpu) == fn) { + slot = per_cpu(bp_hardening_data.hyp_vectors_slot, cpu); + break; + } + } + + if (slot == -1) { + 
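/*
 * (Sketch.) The bpi.S code above leans on two size invariants: each ventry
 * expands to exactly 32 instructions (31 NOPs plus a branch), i.e. the
 * architectural 0x80-byte vector stride, and one vectors block is 16 such
 * entries, i.e. the SZ_2K slot that __copy_hyp_vect_bpi() fills in
 * 0x80-byte steps. Illustrative compile-time checks:
 */
_Static_assert((31 + 1) * 4 == 0x80, "one ventry per 0x80-byte vector entry");
_Static_assert(16 * 0x80 == 0x800, "sixteen entries per 2K hardened slot");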
last_slot++; + BUG_ON(((__bp_harden_hyp_vecs_end - __bp_harden_hyp_vecs_start) + / SZ_2K) <= last_slot); + slot = last_slot; + __copy_hyp_vect_bpi(slot, hyp_vecs_start, hyp_vecs_end); + } + + __this_cpu_write(bp_hardening_data.hyp_vectors_slot, slot); + __this_cpu_write(bp_hardening_data.fn, fn); + spin_unlock(&bp_lock); +} +#else +#define __qcom_hyp_sanitize_link_stack_start NULL +#define __qcom_hyp_sanitize_link_stack_end NULL +#define __smccc_workaround_1_smc_start NULL +#define __smccc_workaround_1_smc_end NULL +#define __smccc_workaround_1_hvc_start NULL +#define __smccc_workaround_1_hvc_end NULL + +static void __install_bp_hardening_cb(bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + __this_cpu_write(bp_hardening_data.fn, fn); +} +#endif /* CONFIG_KVM */ + +static void install_bp_hardening_cb(const struct arm64_cpu_capabilities *entry, + bp_hardening_cb_t fn, + const char *hyp_vecs_start, + const char *hyp_vecs_end) +{ + u64 pfr0; + + if (!entry->matches(entry, SCOPE_LOCAL_CPU)) + return; + + pfr0 = read_cpuid(ID_AA64PFR0_EL1); + if (cpuid_feature_extract_unsigned_field(pfr0, ID_AA64PFR0_CSV2_SHIFT)) + return; + + __install_bp_hardening_cb(fn, hyp_vecs_start, hyp_vecs_end); +} + +#include +#include +#include + +static void call_smc_arch_workaround_1(void) +{ + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +static void call_hvc_arch_workaround_1(void) +{ + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_WORKAROUND_1, NULL); +} + +static int enable_smccc_arch_workaround_1(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + bp_hardening_cb_t cb; + void *smccc_start, *smccc_end; + struct arm_smccc_res res; + + if (!entry->matches(entry, SCOPE_LOCAL_CPU)) + return 0; + + if (psci_ops.smccc_version == SMCCC_VERSION_1_0) + return 0; + + switch (psci_ops.conduit) { + case PSCI_CONDUIT_HVC: + arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + if (res.a0) + return 0; + cb = call_hvc_arch_workaround_1; + smccc_start = __smccc_workaround_1_hvc_start; + smccc_end = __smccc_workaround_1_hvc_end; + break; + + case PSCI_CONDUIT_SMC: + arm_smccc_1_1_smc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID, + ARM_SMCCC_ARCH_WORKAROUND_1, &res); + if (res.a0) + return 0; + cb = call_smc_arch_workaround_1; + smccc_start = __smccc_workaround_1_smc_start; + smccc_end = __smccc_workaround_1_smc_end; + break; + + default: + return 0; + } + + install_bp_hardening_cb(entry, cb, smccc_start, smccc_end); + + return 0; +} + +static void qcom_link_stack_sanitization(void) +{ + u64 tmp; + + asm volatile("mov %0, x30 \n" + ".rept 16 \n" + "bl . 
+ 4 \n" + ".endr \n" + "mov x30, %0 \n" + : "=&r" (tmp)); +} + +static int qcom_enable_link_stack_sanitization(void *data) +{ + const struct arm64_cpu_capabilities *entry = data; + + install_bp_hardening_cb(entry, qcom_link_stack_sanitization, + __qcom_hyp_sanitize_link_stack_start, + __qcom_hyp_sanitize_link_stack_end); + + return 0; +} +#endif /* CONFIG_HARDEN_BRANCH_PREDICTOR */ + #define MIDR_RANGE(model, min, max) \ .def_scope = SCOPE_LOCAL_CPU, \ .matches = is_affected_midr_range, \ @@ -169,6 +351,13 @@ MIDR_CPU_VAR_REV(0, 0), MIDR_CPU_VAR_REV(0, 0)), }, + { + .desc = "Qualcomm Technologies Kryo erratum 1003", + .capability = ARM64_WORKAROUND_QCOM_FALKOR_E1003, + .def_scope = SCOPE_LOCAL_CPU, + .midr_model = MIDR_QCOM_KRYO, + .matches = is_kryo_midr, + }, #endif #ifdef CONFIG_QCOM_FALKOR_ERRATUM_1009 { @@ -187,6 +376,56 @@ MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), }, #endif +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A57), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A72), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A75), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), + .enable = qcom_enable_link_stack_sanitization, + }, + { + .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR_V1), + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), + .enable = qcom_enable_link_stack_sanitization, + }, + { + .capability = ARM64_HARDEN_BP_POST_GUEST_EXIT, + MIDR_ALL_VERSIONS(MIDR_QCOM_FALKOR), + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_BRCM_VULCAN), + .enable = enable_smccc_arch_workaround_1, + }, + { + .capability = ARM64_HARDEN_BRANCH_PREDICTOR, + MIDR_ALL_VERSIONS(MIDR_CAVIUM_THUNDERX2), + .enable = enable_smccc_arch_workaround_1, + }, +#endif { } }; @@ -200,15 +439,18 @@ { const struct arm64_cpu_capabilities *caps = arm64_errata; - for (; caps->matches; caps++) - if (!cpus_have_cap(caps->capability) && - caps->matches(caps, SCOPE_LOCAL_CPU)) { + for (; caps->matches; caps++) { + if (cpus_have_cap(caps->capability)) { + if (caps->enable) + caps->enable((void *)caps); + } else if (caps->matches(caps, SCOPE_LOCAL_CPU)) { pr_crit("CPU%d: Requires work around for %s, not detected" " at boot time\n", smp_processor_id(), caps->desc ? 
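The discovery flow in enable_smccc_arch_workaround_1() above is a three-step firmware handshake: refuse SMCCC v1.0 (it has no feature query, so the call could not be probed safely), ask via ARM_SMCCC_ARCH_FEATURES_FUNC_ID over whichever PSCI conduit is in use whether ARM_SMCCC_ARCH_WORKAROUND_1 is implemented, and only then install the HVC or SMC invocation as the per-CPU hardening callback. The probe step in isolation (sketch, HVC conduit assumed):

	struct arm_smccc_res res;

	arm_smccc_1_1_hvc(ARM_SMCCC_ARCH_FEATURES_FUNC_ID,
			  ARM_SMCCC_ARCH_WORKAROUND_1, &res);
	if (res.a0)
		return 0;	/* firmware lacks the workaround; do nothing */
	cb = call_hvc_arch_workaround_1;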
: "an erratum"); cpu_die_early(); } + } } void update_cpu_errata_workarounds(void) --- linux-azure-4.13.0.orig/arch/arm64/kernel/cpufeature.c +++ linux-azure-4.13.0/arch/arm64/kernel/cpufeature.c @@ -120,10 +120,13 @@ ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_LRCPC_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_FCMA_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_JSCVT_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_EXACT, ID_AA64ISAR1_DPB_SHIFT, 4, 0), ARM64_FTR_END, }; static const struct arm64_ftr_bits ftr_id_aa64pfr0[] = { + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV3_SHIFT, 4, 0), + ARM64_FTR_BITS(FTR_HIDDEN, FTR_NONSTRICT, FTR_LOWER_SAFE, ID_AA64PFR0_CSV2_SHIFT, 4, 0), ARM64_FTR_BITS(FTR_HIDDEN, FTR_STRICT, FTR_EXACT, ID_AA64PFR0_GIC_SHIFT, 4, 0), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_ASIMD_SHIFT, 4, ID_AA64PFR0_ASIMD_NI), S_ARM64_FTR_BITS(FTR_VISIBLE, FTR_STRICT, FTR_LOWER_SAFE, ID_AA64PFR0_FP_SHIFT, 4, ID_AA64PFR0_FP_NI), @@ -795,6 +798,86 @@ ID_AA64PFR0_FP_SHIFT) < 0; } +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __kpti_forced; /* 0: not forced, >0: forced on, <0: forced off */ + +static bool unmap_kernel_at_el0(const struct arm64_cpu_capabilities *entry, + int __unused) +{ + char const *str = "command line option"; + u64 pfr0 = read_sanitised_ftr_reg(SYS_ID_AA64PFR0_EL1); + + /* + * For reasons that aren't entirely clear, enabling KPTI on Cavium + * ThunderX leads to apparent I-cache corruption of kernel text, which + * ends as well as you might imagine. Don't even try. + */ + if (cpus_have_const_cap(ARM64_WORKAROUND_CAVIUM_27456)) { + str = "ARM64_WORKAROUND_CAVIUM_27456"; + __kpti_forced = -1; + } + + /* Forced? */ + if (__kpti_forced) { + pr_info_once("kernel page table isolation forced %s by %s\n", + __kpti_forced > 0 ? "ON" : "OFF", str); + return __kpti_forced > 0; + } + + /* Useful for KASLR robustness */ + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) + return true; + + /* Don't force KPTI for CPUs that are not vulnerable */ + switch (read_cpuid_id() & MIDR_CPU_MODEL_MASK) { + case MIDR_CAVIUM_THUNDERX2: + case MIDR_BRCM_VULCAN: + return false; + } + + /* Defer to CPU feature registers */ + return !cpuid_feature_extract_unsigned_field(pfr0, + ID_AA64PFR0_CSV3_SHIFT); +} + +static int kpti_install_ng_mappings(void *__unused) +{ + typedef void (kpti_remap_fn)(int, int, phys_addr_t); + extern kpti_remap_fn idmap_kpti_install_ng_mappings; + kpti_remap_fn *remap_fn; + + static bool kpti_applied = false; + int cpu = smp_processor_id(); + + if (kpti_applied) + return 0; + + remap_fn = (void *)__pa_symbol(idmap_kpti_install_ng_mappings); + + cpu_install_idmap(); + remap_fn(cpu, num_online_cpus(), __pa_symbol(swapper_pg_dir)); + cpu_uninstall_idmap(); + + if (!cpu) + kpti_applied = true; + + return 0; +} + +static int __init parse_kpti(char *str) +{ + bool enabled; + int ret = strtobool(str, &enabled); + + if (ret) + return ret; + + __kpti_forced = enabled ? 
1 : -1; + return 0; +} +__setup("kpti=", parse_kpti); +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + static const struct arm64_cpu_capabilities arm64_features[] = { { .desc = "GIC system register CPU interface", @@ -881,6 +964,15 @@ .def_scope = SCOPE_SYSTEM, .matches = hyp_offset_low, }, +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + { + .desc = "Kernel page table isolation (KPTI)", + .capability = ARM64_UNMAP_KERNEL_AT_EL0, + .def_scope = SCOPE_SYSTEM, + .matches = unmap_kernel_at_el0, + .enable = kpti_install_ng_mappings, + }, +#endif { /* FP/SIMD is not implemented */ .capability = ARM64_HAS_NO_FPSIMD, @@ -888,6 +980,17 @@ .min_field_value = 0, .matches = has_no_fpsimd, }, +#ifdef CONFIG_ARM64_PMEM + { + .desc = "Data cache clean to Point of Persistence", + .capability = ARM64_HAS_DCPOP, + .def_scope = SCOPE_SYSTEM, + .matches = has_cpuid_feature, + .sys_reg = SYS_ID_AA64ISAR1_EL1, + .field_pos = ID_AA64ISAR1_DPB_SHIFT, + .min_field_value = 1, + }, +#endif {}, }; @@ -916,6 +1019,7 @@ HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_FP_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_FPHP), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 0, CAP_HWCAP, HWCAP_ASIMD), HWCAP_CAP(SYS_ID_AA64PFR0_EL1, ID_AA64PFR0_ASIMD_SHIFT, FTR_SIGNED, 1, CAP_HWCAP, HWCAP_ASIMDHP), + HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_DPB_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_DCPOP), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_JSCVT_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_JSCVT), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_FCMA_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_FCMA), HWCAP_CAP(SYS_ID_AA64ISAR1_EL1, ID_AA64ISAR1_LRCPC_SHIFT, FTR_UNSIGNED, 1, CAP_HWCAP, HWCAP_LRCPC), @@ -987,6 +1091,25 @@ cap_set_elf_hwcap(hwcaps); } +/* + * Check if the current CPU has a given feature capability. + * Should be called from non-preemptible context. + */ +static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, + unsigned int cap) +{ + const struct arm64_cpu_capabilities *caps; + + if (WARN_ON(preemptible())) + return false; + + for (caps = cap_array; caps->matches; caps++) + if (caps->capability == cap && + caps->matches(caps, SCOPE_LOCAL_CPU)) + return true; + return false; +} + void update_cpu_capabilities(const struct arm64_cpu_capabilities *caps, const char *info) { @@ -1022,7 +1145,7 @@ * uses an IPI, giving us a PSTATE that disappears when * we return. */ - stop_machine(caps->enable, NULL, cpu_online_mask); + stop_machine(caps->enable, (void *)caps, cpu_online_mask); } } } @@ -1065,8 +1188,9 @@ } static void -verify_local_cpu_features(const struct arm64_cpu_capabilities *caps) +verify_local_cpu_features(const struct arm64_cpu_capabilities *caps_list) { + const struct arm64_cpu_capabilities *caps = caps_list; for (; caps->matches; caps++) { if (!cpus_have_cap(caps->capability)) continue; @@ -1074,13 +1198,13 @@ * If the new CPU misses an advertised feature, we cannot proceed * further, park the cpu. */ - if (!caps->matches(caps, SCOPE_LOCAL_CPU)) { + if (!__this_cpu_has_cap(caps_list, caps->capability)) { pr_crit("CPU%d: missing feature: %s\n", smp_processor_id(), caps->desc); cpu_die_early(); } if (caps->enable) - caps->enable(NULL); + caps->enable((void *)caps); } } @@ -1135,25 +1259,6 @@ static_branch_enable(&arm64_const_caps_ready); } -/* - * Check if the current CPU has a given feature capability. - * Should be called from non-preemptible context. 
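The parse_kpti() hook above accepts the usual boolean spellings understood by strtobool(). Illustrative command lines (assumed spellings; strtobool is authoritative):

	kpti=0		# force page-table isolation off, e.g. for benchmarking
	kpti=1		# force it on, even if CSV3 reports the CPU as safe

Without a kpti= option, unmap_kernel_at_el0() decides in the order shown: the Cavium ThunderX quirk, then CONFIG_RANDOMIZE_BASE (KASLR is considerably more robust with the kernel mappings hidden), then the CPU whitelist, then the sanitised CSV3 field.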
- */ -static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, - unsigned int cap) -{ - const struct arm64_cpu_capabilities *caps; - - if (WARN_ON(preemptible())) - return false; - - for (caps = cap_array; caps->desc; caps++) - if (caps->capability == cap && caps->matches) - return caps->matches(caps, SCOPE_LOCAL_CPU); - - return false; -} - extern const struct arm64_cpu_capabilities arm64_errata[]; bool this_cpu_has_cap(unsigned int cap) @@ -1294,4 +1399,4 @@ return 0; } -late_initcall(enable_mrs_emulation); +core_initcall(enable_mrs_emulation); --- linux-azure-4.13.0.orig/arch/arm64/kernel/cpuinfo.c +++ linux-azure-4.13.0/arch/arm64/kernel/cpuinfo.c @@ -68,6 +68,7 @@ "jscvt", "fcma", "lrcpc", + "dcpop", NULL }; --- linux-azure-4.13.0.orig/arch/arm64/kernel/efi-entry.S +++ linux-azure-4.13.0/arch/arm64/kernel/efi-entry.S @@ -96,6 +96,7 @@ mrs x0, sctlr_el2 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el2, x0 isb b 2f @@ -103,6 +104,7 @@ mrs x0, sctlr_el1 bic x0, x0, #1 << 0 // clear SCTLR.M bic x0, x0, #1 << 2 // clear SCTLR.C + pre_disable_mmu_workaround msr sctlr_el1, x0 isb 2: --- linux-azure-4.13.0.orig/arch/arm64/kernel/entry.S +++ linux-azure-4.13.0/arch/arm64/kernel/entry.S @@ -29,6 +29,8 @@ #include #include #include +#include +#include #include #include #include @@ -69,8 +71,73 @@ #define BAD_FIQ 2 #define BAD_ERROR 3 - .macro kernel_entry, el, regsize = 64 + .macro kernel_ventry, el, label, regsize = 64 + .align 7 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +alternative_if ARM64_UNMAP_KERNEL_AT_EL0 + .if \el == 0 + .if \regsize == 64 + mrs x30, tpidrro_el0 + msr tpidrro_el0, xzr + .else + mov x30, xzr + .endif + .endif +alternative_else_nop_endif +#endif + sub sp, sp, #S_FRAME_SIZE +#ifdef CONFIG_VMAP_STACK + /* + * Test whether the SP has overflowed, without corrupting a GPR. + * Task and IRQ stacks are aligned to (1 << THREAD_SHIFT). + */ + add sp, sp, x0 // sp' = sp + x0 + sub x0, sp, x0 // x0' = sp' - x0 = (sp + x0) - x0 = sp + tbnz x0, #THREAD_SHIFT, 0f + sub x0, sp, x0 // x0'' = sp' - x0' = (sp + x0) - sp = x0 + sub sp, sp, x0 // sp'' = sp' - x0 = (sp + x0) - x0 = sp + b el\()\el\()_\label + +0: + /* + * Either we've just detected an overflow, or we've taken an exception + * while on the overflow stack. Either way, we won't return to + * userspace, and can clobber EL0 registers to free up GPRs. + */ + + /* Stash the original SP (minus S_FRAME_SIZE) in tpidr_el0. */ + msr tpidr_el0, x0 + + /* Recover the original x0 value and stash it in tpidrro_el0 */ + sub x0, sp, x0 + msr tpidrro_el0, x0 + + /* Switch to the overflow stack */ + adr_this_cpu sp, overflow_stack + OVERFLOW_STACK_SIZE, x0 + + /* + * Check whether we were already on the overflow stack. This may happen + * after panic() re-enables interrupts. + */ + mrs x0, tpidr_el0 // sp of interrupted context + sub x0, sp, x0 // delta with top of overflow stack + tst x0, #~(OVERFLOW_STACK_SIZE - 1) // within range? + b.ne __bad_stack // no? -> bad stack pointer + + /* We were already on the overflow stack. Restore sp/x0 and carry on. 
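The register juggling at the top of kernel_ventry above deserves spelling out: with every GPR live, sp and x0 are swapped and recovered purely by modular arithmetic. The same algebra as a runnable C check (arbitrary example values):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t sp = 0xffff00001234fe70ULL, x0 = 0xdeadbeefULL;
	const uint64_t orig_sp = sp, orig_x0 = x0;

	sp = sp + x0;		/* add sp, sp, x0	sp' = sp + x0 */
	x0 = sp - x0;		/* sub x0, sp, x0	x0' = sp      */
	assert(x0 == orig_sp);	/* "tbnz x0, #THREAD_SHIFT" tests the
				   real stack pointer at this point   */
	x0 = sp - x0;		/* sub x0, sp, x0	x0'' = x0     */
	sp = sp - x0;		/* sub sp, sp, x0	sp'' = sp     */
	assert(sp == orig_sp && x0 == orig_x0);
	return 0;
}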
*/ + sub sp, sp, x0 + mrs x0, tpidrro_el0 +#endif + b el\()\el\()_\label + .endm + + .macro tramp_alias, dst, sym + mov_q \dst, TRAMP_VALIAS + add \dst, \dst, #(\sym - .entry.tramp.text) + .endm + + .macro kernel_entry, el, regsize = 64 .if \regsize == 32 mov w0, w0 // zero upper 32 bits of x0 .endif @@ -100,10 +167,10 @@ .else add x21, sp, #S_FRAME_SIZE get_thread_info tsk - /* Save the task's original addr_limit and set USER_DS (TASK_SIZE_64) */ + /* Save the task's original addr_limit and set USER_DS */ ldr x20, [tsk, #TSK_TI_ADDR_LIMIT] str x20, [sp, #S_ORIG_ADDR_LIMIT] - mov x20, #TASK_SIZE_64 + mov x20, #USER_DS str x20, [tsk, #TSK_TI_ADDR_LIMIT] /* No need to reset PSTATE.UAO, hardware's already set it to 0 for us */ .endif /* \el == 0 */ @@ -111,6 +178,18 @@ mrs x23, spsr_el1 stp lr, x21, [sp, #S_LR] + /* + * In order to be able to dump the contents of struct pt_regs at the + * time the exception was taken (in case we attempt to walk the call + * stack later), chain it together with the stack frames. + */ + .if \el == 0 + stp xzr, xzr, [sp, #S_STACKFRAME] + .else + stp x29, x22, [sp, #S_STACKFRAME] + .endif + add x29, sp, #S_STACKFRAME + #ifdef CONFIG_ARM64_SW_TTBR0_PAN /* * Set the TTBR0 PAN bit in SPSR. When the exception is taken from @@ -126,7 +205,7 @@ .if \el != 0 mrs x21, ttbr0_el1 - tst x21, #0xffff << 48 // Check for the reserved ASID + tst x21, #TTBR_ASID_MASK // Check for the reserved ASID orr x23, x23, #PSR_PAN_BIT // Set the emulated PAN in the saved SPSR b.eq 1f // TTBR0 access already disabled and x23, x23, #~PSR_PAN_BIT // Clear the emulated PAN in the saved SPSR @@ -142,8 +221,8 @@ * Set syscallno to -1 by default (overridden later if real syscall). */ .if \el == 0 - mvn x21, xzr - str x21, [sp, #S_SYSCALLNO] + mvn w21, wzr + str w21, [sp, #S_SYSCALLNO] .endif /* @@ -189,7 +268,7 @@ tbnz x22, #22, 1f // Skip re-enabling TTBR0 access if the PSR_PAN_BIT is set .endif - __uaccess_ttbr0_enable x0 + __uaccess_ttbr0_enable x0, x1 .if \el == 0 /* @@ -198,7 +277,7 @@ * Cavium erratum 27456 (broadcast TLBI instructions may cause I-cache * corruption). */ - post_ttbr0_update_workaround + bl post_ttbr_update_workaround .endif 1: .if \el != 0 @@ -210,18 +289,20 @@ .if \el == 0 ldr x23, [sp, #S_SP] // load return stack pointer msr sp_el0, x23 + tst x22, #PSR_MODE32_BIT // native task? + b.eq 3f + #ifdef CONFIG_ARM64_ERRATUM_845719 alternative_if ARM64_WORKAROUND_845719 - tbz x22, #4, 1f #ifdef CONFIG_PID_IN_CONTEXTIDR mrs x29, contextidr_el1 msr contextidr_el1, x29 #else msr contextidr_el1, xzr #endif -1: alternative_else_nop_endif #endif +3: .endif msr elr_el1, x21 // set up the return data @@ -243,7 +324,21 @@ ldp x28, x29, [sp, #16 * 14] ldr lr, [sp, #S_LR] add sp, sp, #S_FRAME_SIZE // restore sp - eret // return to kernel + + .if \el == 0 +alternative_insn eret, nop, ARM64_UNMAP_KERNEL_AT_EL0 +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + bne 4f + msr far_el1, x30 + tramp_alias x30, tramp_exit_native + br x30 +4: + tramp_alias x30, tramp_exit_compat + br x30 +#endif + .else + eret + .endif .endm .macro irq_stack_entry @@ -259,20 +354,12 @@ and x25, x25, #~(THREAD_SIZE - 1) cbnz x25, 9998f - adr_this_cpu x25, irq_stack, x26 - mov x26, #IRQ_STACK_START_SP + ldr_this_cpu x25, irq_stack_ptr, x26 + mov x26, #IRQ_STACK_SIZE add x26, x25, x26 /* switch to the irq stack */ mov sp, x26 - - /* - * Add a dummy stack frame, this non-standard format is fixed up - * by unwind_frame() - */ - stp x29, x19, [sp, #-16]! 
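The S_STACKFRAME hunk above is what lets the unwinder cross exception boundaries: kernel_entry stores an ordinary-looking frame record (x29, elr_el1) inside pt_regs and points x29 at it, while EL0 entries store a zeroed record as a terminator. A toy walker over such records (sketch; the real unwinder validates every fp against the accessible stacks first):

#include <stdint.h>
#include <stdio.h>

struct frame_record {
	uint64_t fp;	/* previous record; zero terminates the chain */
	uint64_t pc;
};

static void walk(const struct frame_record *rec)
{
	while (rec) {
		printf("pc=%#llx\n", (unsigned long long)rec->pc);
		rec = (const struct frame_record *)rec->fp;
	}
}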
- mov x29, sp - 9998: .endm @@ -290,8 +377,10 @@ * * x7 is reserved for the system call number in 32-bit mode. */ -sc_nr .req x25 // number of system calls -scno .req x26 // syscall number +wsc_nr .req w25 // number of system calls +xsc_nr .req x25 // number of system calls (zero-extended) +wscno .req w26 // syscall number +xscno .req x26 // syscall number (zero-extended) stbl .req x27 // syscall table pointer tsk .req x28 // current thread_info @@ -315,34 +404,62 @@ .align 11 ENTRY(vectors) - ventry el1_sync_invalid // Synchronous EL1t - ventry el1_irq_invalid // IRQ EL1t - ventry el1_fiq_invalid // FIQ EL1t - ventry el1_error_invalid // Error EL1t - - ventry el1_sync // Synchronous EL1h - ventry el1_irq // IRQ EL1h - ventry el1_fiq_invalid // FIQ EL1h - ventry el1_error_invalid // Error EL1h - - ventry el0_sync // Synchronous 64-bit EL0 - ventry el0_irq // IRQ 64-bit EL0 - ventry el0_fiq_invalid // FIQ 64-bit EL0 - ventry el0_error_invalid // Error 64-bit EL0 + kernel_ventry 1, sync_invalid // Synchronous EL1t + kernel_ventry 1, irq_invalid // IRQ EL1t + kernel_ventry 1, fiq_invalid // FIQ EL1t + kernel_ventry 1, error_invalid // Error EL1t + + kernel_ventry 1, sync // Synchronous EL1h + kernel_ventry 1, irq // IRQ EL1h + kernel_ventry 1, fiq_invalid // FIQ EL1h + kernel_ventry 1, error_invalid // Error EL1h + + kernel_ventry 0, sync // Synchronous 64-bit EL0 + kernel_ventry 0, irq // IRQ 64-bit EL0 + kernel_ventry 0, fiq_invalid // FIQ 64-bit EL0 + kernel_ventry 0, error_invalid // Error 64-bit EL0 #ifdef CONFIG_COMPAT - ventry el0_sync_compat // Synchronous 32-bit EL0 - ventry el0_irq_compat // IRQ 32-bit EL0 - ventry el0_fiq_invalid_compat // FIQ 32-bit EL0 - ventry el0_error_invalid_compat // Error 32-bit EL0 + kernel_ventry 0, sync_compat, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_compat, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid_compat, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid_compat, 32 // Error 32-bit EL0 #else - ventry el0_sync_invalid // Synchronous 32-bit EL0 - ventry el0_irq_invalid // IRQ 32-bit EL0 - ventry el0_fiq_invalid // FIQ 32-bit EL0 - ventry el0_error_invalid // Error 32-bit EL0 + kernel_ventry 0, sync_invalid, 32 // Synchronous 32-bit EL0 + kernel_ventry 0, irq_invalid, 32 // IRQ 32-bit EL0 + kernel_ventry 0, fiq_invalid, 32 // FIQ 32-bit EL0 + kernel_ventry 0, error_invalid, 32 // Error 32-bit EL0 #endif END(vectors) +#ifdef CONFIG_VMAP_STACK + /* + * We detected an overflow in kernel_ventry, which switched to the + * overflow stack. Stash the exception regs, and head to our overflow + * handler. + */ +__bad_stack: + /* Restore the original x0 value */ + mrs x0, tpidrro_el0 + + /* + * Store the original GPRs to the new stack. The original SP (minus + * S_FRAME_SIZE) was stashed in tpidr_el0 by kernel_ventry.
+ */ + sub sp, sp, #S_FRAME_SIZE + kernel_entry 1 + mrs x0, tpidr_el0 + add x0, x0, #S_FRAME_SIZE + str x0, [sp, #S_SP] + + /* Stash the regs for handle_bad_stack */ + mov x0, sp + + /* Time to die */ + bl handle_bad_stack + ASM_BUG() +#endif /* CONFIG_VMAP_STACK */ + /* * Invalid mode handlers */ @@ -351,7 +468,8 @@ mov x0, sp mov x1, #\reason mrs x2, esr_el1 - b bad_mode + bl bad_mode + ASM_BUG() .endm el0_sync_invalid: @@ -448,14 +566,16 @@ mrs x0, far_el1 enable_dbg mov x2, sp - b do_sp_pc_abort + bl do_sp_pc_abort + ASM_BUG() el1_undef: /* * Undefined instruction */ enable_dbg mov x0, sp - b do_undefinstr + bl do_undefinstr + ASM_BUG() el1_dbg: /* * Debug exception handling @@ -473,7 +593,8 @@ mov x0, sp mov x2, x1 mov x1, #BAD_SYNC - b bad_mode + bl bad_mode + ASM_BUG() ENDPROC(el1_sync) .align 6 @@ -577,8 +698,8 @@ * AArch32 syscall handling */ adrp stbl, compat_sys_call_table // load compat syscall table pointer - uxtw scno, w7 // syscall number in w7 (r7) - mov sc_nr, #__NR_compat_syscalls + mov wscno, w7 // syscall number in w7 (r7) + mov wsc_nr, #__NR_compat_syscalls b el0_svc_naked .align 6 @@ -605,13 +726,15 @@ * Instruction abort handling */ mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq + enable_dbg +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ct_user_exit mov x0, x26 mov x1, x25 mov x2, sp - bl do_mem_abort + bl do_el0_ia_bp_hardening b ret_to_user el0_fpsimd_acc: /* @@ -638,8 +761,10 @@ * Stack or PC alignment exception handling */ mrs x26, far_el1 - // enable interrupts before calling the main handler - enable_dbg_and_irq + enable_dbg +#ifdef CONFIG_TRACE_IRQFLAGS + bl trace_hardirqs_off +#endif ct_user_exit mov x0, x26 mov x1, x25 @@ -698,6 +823,11 @@ #endif ct_user_exit +#ifdef CONFIG_HARDEN_BRANCH_PREDICTOR + tbz x22, #55, 1f + bl do_el0_irq_bp_hardening +1: +#endif irq_handler #ifdef CONFIG_TRACE_IRQFLAGS @@ -707,38 +837,6 @@ ENDPROC(el0_irq) /* - * Register switch for AArch64. The callee-saved registers need to be saved - * and restored. On entry: - * x0 = previous task_struct (must be preserved across the switch) - * x1 = next task_struct - * Previous and next are guaranteed not to be the same. - * - */ -ENTRY(cpu_switch_to) - mov x10, #THREAD_CPU_CONTEXT - add x8, x0, x10 - mov x9, sp - stp x19, x20, [x8], #16 // store callee-saved registers - stp x21, x22, [x8], #16 - stp x23, x24, [x8], #16 - stp x25, x26, [x8], #16 - stp x27, x28, [x8], #16 - stp x29, x9, [x8], #16 - str lr, [x8] - add x8, x1, x10 - ldp x19, x20, [x8], #16 // restore callee-saved registers - ldp x21, x22, [x8], #16 - ldp x23, x24, [x8], #16 - ldp x25, x26, [x8], #16 - ldp x27, x28, [x8], #16 - ldp x29, x9, [x8], #16 - ldr lr, [x8] - mov sp, x9 - msr sp_el0, x1 - ret -ENDPROC(cpu_switch_to) - -/* * This is the fast syscall return path. We do as little as possible here, * and this includes saving x0 back into the kernel stack. */ @@ -781,36 +879,25 @@ ENDPROC(ret_to_user) /* - * This is how we return from a fork. - */ -ENTRY(ret_from_fork) - bl schedule_tail - cbz x19, 1f // not a kernel thread - mov x0, x20 - blr x19 -1: get_thread_info tsk - b ret_to_user -ENDPROC(ret_from_fork) - -/* * SVC handler. 
*/ .align 6 el0_svc: adrp stbl, sys_call_table // load syscall table pointer - uxtw scno, w8 // syscall number in w8 - mov sc_nr, #__NR_syscalls + mov wscno, w8 // syscall number in w8 + mov wsc_nr, #__NR_syscalls el0_svc_naked: // compat entry point - stp x0, scno, [sp, #S_ORIG_X0] // save the original x0 and syscall number + stp x0, xscno, [sp, #S_ORIG_X0] // save the original x0 and syscall number enable_dbg_and_irq ct_user_exit 1 ldr x16, [tsk, #TSK_TI_FLAGS] // check for syscall hooks tst x16, #_TIF_SYSCALL_WORK b.ne __sys_trace - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs ni_sys - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + mask_nospec64 xscno, xsc_nr, x19 // enforce bounds for syscall number + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine b ret_fast_syscall ni_sys: @@ -824,24 +911,23 @@ * switches, and waiting for our parent to respond. */ __sys_trace: - mov w0, #-1 // set default errno for - cmp scno, x0 // user-issued syscall(-1) + cmp wscno, #-1 // user-issued syscall(-1)? b.ne 1f - mov x0, #-ENOSYS + mov x0, #-ENOSYS // set default errno if so str x0, [sp, #S_X0] 1: mov x0, sp bl syscall_trace_enter cmp w0, #-1 // skip the syscall? b.eq __sys_trace_return_skipped - uxtw scno, w0 // syscall number (possibly new) + mov wscno, w0 // syscall number (possibly new) mov x1, sp // pointer to regs - cmp scno, sc_nr // check upper syscall limit + cmp wscno, wsc_nr // check upper syscall limit b.hs __ni_sys_trace ldp x0, x1, [sp] // restore the syscall args ldp x2, x3, [sp, #S_X2] ldp x4, x5, [sp, #S_X4] ldp x6, x7, [sp, #S_X6] - ldr x16, [stbl, scno, lsl #3] // address in the syscall table + ldr x16, [stbl, xscno, lsl #3] // address in the syscall table blr x16 // call sys_* routine __sys_trace_return: @@ -858,6 +944,117 @@ .popsection // .entry.text +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +/* + * Exception vectors trampoline. + */ + .pushsection ".entry.tramp.text", "ax" + + .macro tramp_map_kernel, tmp + mrs \tmp, ttbr1_el1 + sub \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bic \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp +#ifdef CONFIG_QCOM_FALKOR_ERRATUM_1003 +alternative_if ARM64_WORKAROUND_QCOM_FALKOR_E1003 + /* ASID already in \tmp[63:48] */ + movk \tmp, #:abs_g2_nc:(TRAMP_VALIAS >> 12) + movk \tmp, #:abs_g1_nc:(TRAMP_VALIAS >> 12) + /* 2MB boundary containing the vectors, so we nobble the walk cache */ + movk \tmp, #:abs_g0_nc:((TRAMP_VALIAS & ~(SZ_2M - 1)) >> 12) + isb + tlbi vae1, \tmp + dsb nsh +alternative_else_nop_endif +#endif /* CONFIG_QCOM_FALKOR_ERRATUM_1003 */ + .endm + + .macro tramp_unmap_kernel, tmp + mrs \tmp, ttbr1_el1 + add \tmp, \tmp, #(SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + orr \tmp, \tmp, #USER_ASID_FLAG + msr ttbr1_el1, \tmp + /* + * We avoid running the post_ttbr_update_workaround here because + * it's only needed by Cavium ThunderX, which requires KPTI to be + * disabled. + */ + .endm + + .macro tramp_ventry, regsize = 64 + .align 7 +1: + .if \regsize == 64 + msr tpidrro_el0, x30 // Restored in kernel_ventry + .endif + /* + * Defend against branch aliasing attacks by pushing a dummy + * entry onto the return stack and using a RET instruction to + * enter the full-fat kernel vectors. + */ + bl 2f + b . 
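The mask_nospec64 line added to el0_svc above enforces the syscall-number bound architecturally, not just via the preceding conditional branch, so a mispredicted b.hs cannot steer the table load with an attacker-chosen index. The branchless clamp it relies on, sketched in C in the spirit of the generic array_index_mask_nospec() (assuming arithmetic right shift of signed values, as GCC and Clang provide):

#include <stdint.h>

static uint64_t clamp_index_nospec(uint64_t idx, uint64_t size)
{
	/* all-ones when idx < size, zero otherwise -- no branch involved */
	uint64_t mask = (uint64_t)((int64_t)(idx - size) >> 63);

	return idx & mask;	/* out-of-range indices collapse to 0 */
}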
+2: + tramp_map_kernel x30 +#ifdef CONFIG_RANDOMIZE_BASE + adr x30, tramp_vectors + PAGE_SIZE +alternative_insn isb, nop, ARM64_WORKAROUND_QCOM_FALKOR_E1003 + ldr x30, [x30] +#else + ldr x30, =vectors +#endif + prfm plil1strm, [x30, #(1b - tramp_vectors)] + msr vbar_el1, x30 + add x30, x30, #(1b - tramp_vectors) + isb + ret + .endm + + .macro tramp_exit, regsize = 64 + adr x30, tramp_vectors + msr vbar_el1, x30 + tramp_unmap_kernel x30 + .if \regsize == 64 + mrs x30, far_el1 + .endif + eret + .endm + + .align 11 +ENTRY(tramp_vectors) + .space 0x400 + + tramp_ventry + tramp_ventry + tramp_ventry + tramp_ventry + + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 + tramp_ventry 32 +END(tramp_vectors) + +ENTRY(tramp_exit_native) + tramp_exit +END(tramp_exit_native) + +ENTRY(tramp_exit_compat) + tramp_exit 32 +END(tramp_exit_compat) + + .ltorg + .popsection // .entry.tramp.text +#ifdef CONFIG_RANDOMIZE_BASE + .pushsection ".rodata", "a" + .align PAGE_SHIFT + .globl __entry_tramp_data_start +__entry_tramp_data_start: + .quad vectors + .popsection // .rodata +#endif /* CONFIG_RANDOMIZE_BASE */ +#endif /* CONFIG_UNMAP_KERNEL_AT_EL0 */ + /* * Special system call wrappers. */ @@ -865,3 +1062,49 @@ mov x0, sp b sys_rt_sigreturn ENDPROC(sys_rt_sigreturn_wrapper) + +/* + * Register switch for AArch64. The callee-saved registers need to be saved + * and restored. On entry: + * x0 = previous task_struct (must be preserved across the switch) + * x1 = next task_struct + * Previous and next are guaranteed not to be the same. + * + */ +ENTRY(cpu_switch_to) + mov x10, #THREAD_CPU_CONTEXT + add x8, x0, x10 + mov x9, sp + stp x19, x20, [x8], #16 // store callee-saved registers + stp x21, x22, [x8], #16 + stp x23, x24, [x8], #16 + stp x25, x26, [x8], #16 + stp x27, x28, [x8], #16 + stp x29, x9, [x8], #16 + str lr, [x8] + add x8, x1, x10 + ldp x19, x20, [x8], #16 // restore callee-saved registers + ldp x21, x22, [x8], #16 + ldp x23, x24, [x8], #16 + ldp x25, x26, [x8], #16 + ldp x27, x28, [x8], #16 + ldp x29, x9, [x8], #16 + ldr lr, [x8] + mov sp, x9 + msr sp_el0, x1 + ret +ENDPROC(cpu_switch_to) +NOKPROBE(cpu_switch_to) + +/* + * This is how we return from a fork. + */ +ENTRY(ret_from_fork) + bl schedule_tail + cbz x19, 1f // not a kernel thread + mov x0, x20 + blr x19 +1: get_thread_info tsk + b ret_to_user +ENDPROC(ret_from_fork) +NOKPROBE(ret_from_fork) --- linux-azure-4.13.0.orig/arch/arm64/kernel/head.S +++ linux-azure-4.13.0/arch/arm64/kernel/head.S @@ -143,8 +143,8 @@ dmb sy // needed before dc ivac with // MMU off - add x1, x0, #0x20 // 4 x 8 bytes - b __inval_cache_range // tail call + mov x1, #0x20 // 4 x 8 bytes + b __inval_dcache_area // tail call ENDPROC(preserve_boot_args) /* @@ -221,20 +221,20 @@ * dirty cache lines being evicted. */ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE - bl __inval_cache_range + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) + bl __inval_dcache_area /* * Clear the idmap and swapper page tables. */ adrp x0, idmap_pg_dir - adrp x6, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) 1: stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 stp xzr, xzr, [x0], #16 - cmp x0, x6 - b.lo 1b + subs x1, x1, #64 + b.ne 1b mov x7, SWAPPER_MM_MMUFLAGS @@ -307,9 +307,9 @@ * tables again to remove any speculatively loaded cache lines. 
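On the trampoline above: tramp_map_kernel and tramp_unmap_kernel step ttbr1_el1 between the trampoline and swapper page tables by a fixed offset, and flip USER_ASID_FLAG so kernel and user run under an even/odd ASID pair (which is also why the tlbflush.h hunks invalidate both ASIDs). The convention in C, with the bit positions taken from the hunks as assumptions:

#include <stdint.h>

#define TTBR_ASID_SHIFT	48
#define USER_ASID_FLAG	(1ULL << TTBR_ASID_SHIFT)	/* ASID bit 0 */

/* tramp_unmap_kernel, roughly: user tables, odd ASID */
static uint64_t to_user_ttbr1(uint64_t ttbr1, uint64_t dir_offset)
{
	return (ttbr1 + dir_offset) | USER_ASID_FLAG;
}

/* tramp_map_kernel, roughly: kernel tables, even ASID */
static uint64_t to_kernel_ttbr1(uint64_t ttbr1, uint64_t dir_offset)
{
	return (ttbr1 - dir_offset) & ~USER_ASID_FLAG;
}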
*/ adrp x0, idmap_pg_dir - adrp x1, swapper_pg_dir + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE + ldr x1, =(IDMAP_DIR_SIZE + SWAPPER_DIR_SIZE + RESERVED_TTBR0_SIZE) dmb sy - bl __inval_cache_range + bl __inval_dcache_area ret x28 ENDPROC(__create_page_tables) @@ -361,6 +361,9 @@ ret // to __primary_switch() 0: #endif + add sp, sp, #16 + mov x29, #0 + mov x30, #0 b start_kernel ENDPROC(__primary_switched) @@ -368,7 +371,7 @@ * end early head section, begin head code that is also used for * hotplug and needs to have the same protections as the text region */ - .section ".idmap.text","ax" + .section ".idmap.text","awx" ENTRY(kimage_vaddr) .quad _text - TEXT_OFFSET @@ -381,6 +384,7 @@ * booted in EL1 or EL2 respectively. */ ENTRY(el2_setup) + msr SPsel, #1 // We want to use SP_EL{1,2} mrs x0, CurrentEL cmp x0, #CurrentEL_EL2 b.eq 1f @@ -616,6 +620,7 @@ ldr x2, [x0, #CPU_BOOT_TASK] msr sp_el0, x2 mov x29, #0 + mov x30, #0 b secondary_start_kernel ENDPROC(__secondary_switched) @@ -727,6 +732,7 @@ * to take into account by discarding the current kernel mapping and * creating a new one. */ + pre_disable_mmu_workaround msr sctlr_el1, x20 // disable the MMU isb bl __create_page_tables // recreate kernel mapping --- linux-azure-4.13.0.orig/arch/arm64/kernel/hibernate.c +++ linux-azure-4.13.0/arch/arm64/kernel/hibernate.c @@ -330,7 +330,7 @@ * read only (code, rodata). Clear the RDONLY bit from * the temporary mappings we use during restore. */ - set_pte(dst_pte, pte_clear_rdonly(pte)); + set_pte(dst_pte, pte_mkwrite(pte)); } else if (debug_pagealloc_enabled() && !pte_none(pte)) { /* * debug_pagealloc will removed the PTE_VALID bit if @@ -343,7 +343,7 @@ */ BUG_ON(!pfn_valid(pte_pfn(pte))); - set_pte(dst_pte, pte_mkpresent(pte_clear_rdonly(pte))); + set_pte(dst_pte, pte_mkpresent(pte_mkwrite(pte))); } } --- linux-azure-4.13.0.orig/arch/arm64/kernel/irq.c +++ linux-azure-4.13.0/arch/arm64/kernel/irq.c @@ -23,15 +23,16 @@ #include #include +#include #include #include #include #include +#include unsigned long irq_err_count; -/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. */ -DEFINE_PER_CPU(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack) __aligned(16); +DEFINE_PER_CPU(unsigned long *, irq_stack_ptr); int arch_show_interrupts(struct seq_file *p, int prec) { @@ -50,8 +51,43 @@ handle_arch_irq = handle_irq; } +#ifdef CONFIG_VMAP_STACK +static void init_irq_stacks(void) +{ + int cpu; + unsigned long *p; + + for_each_possible_cpu(cpu) { + /* + * To ensure that VMAP'd stack overflow detection works + * correctly, the IRQ stacks need to have the same + * alignment as other stacks. + */ + p = __vmalloc_node_range(IRQ_STACK_SIZE, THREAD_ALIGN, + VMALLOC_START, VMALLOC_END, + THREADINFO_GFP, PAGE_KERNEL, + 0, cpu_to_node(cpu), + __builtin_return_address(0)); + + per_cpu(irq_stack_ptr, cpu) = p; + } +} +#else +/* irq stack only needs to be 16 byte aligned - not IRQ_STACK_SIZE aligned. 
*/ +DEFINE_PER_CPU_ALIGNED(unsigned long [IRQ_STACK_SIZE/sizeof(long)], irq_stack); + +static void init_irq_stacks(void) +{ + int cpu; + + for_each_possible_cpu(cpu) + per_cpu(irq_stack_ptr, cpu) = per_cpu(irq_stack, cpu); +} +#endif + void __init init_IRQ(void) { + init_irq_stacks(); irqchip_init(); if (!handle_arch_irq) panic("No interrupt controller found."); --- linux-azure-4.13.0.orig/arch/arm64/kernel/perf_callchain.c +++ linux-azure-4.13.0/arch/arm64/kernel/perf_callchain.c @@ -162,7 +162,6 @@ } frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; --- linux-azure-4.13.0.orig/arch/arm64/kernel/probes/uprobes.c +++ linux-azure-4.13.0/arch/arm64/kernel/probes/uprobes.c @@ -40,7 +40,7 @@ probe_opcode_t insn; /* TODO: Currently we do not support AARCH32 instruction probing */ - if (test_bit(TIF_32BIT, &mm->context.flags)) + if (mm->context.flags & MMCF_AARCH32) return -ENOTSUPP; else if (!IS_ALIGNED(addr, AARCH64_INSN_SIZE)) return -EINVAL; --- linux-azure-4.13.0.orig/arch/arm64/kernel/process.c +++ linux-azure-4.13.0/arch/arm64/kernel/process.c @@ -305,16 +305,14 @@ static void tls_thread_switch(struct task_struct *next) { - unsigned long tpidr, tpidrro; - tls_preserve_current_state(); - tpidr = *task_user_tls(next); - tpidrro = is_compat_thread(task_thread_info(next)) ? - next->thread.tp_value : 0; + if (is_compat_thread(task_thread_info(next))) + write_sysreg(next->thread.tp_value, tpidrro_el0); + else if (!arm64_kernel_unmapped_at_el0()) + write_sysreg(0, tpidrro_el0); - write_sysreg(tpidr, tpidr_el0); - write_sysreg(tpidrro, tpidrro_el0); + write_sysreg(*task_user_tls(next), tpidr_el0); } /* Restore the UAO state depending on next's addr_limit */ @@ -382,15 +380,12 @@ return 0; frame.fp = thread_saved_fp(p); - frame.sp = thread_saved_sp(p); frame.pc = thread_saved_pc(p); #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = p->curr_ret_stack; #endif do { - if (frame.sp < stack_page || - frame.sp >= stack_page + THREAD_SIZE || - unwind_frame(p, &frame)) + if (unwind_frame(p, &frame)) goto out; if (!in_sched_functions(frame.pc)) { ret = frame.pc; @@ -417,3 +412,11 @@ else return randomize_page(mm->brk, SZ_1G); } + +/* + * Called from setup_new_exec() after (COMPAT_)SET_PERSONALITY. + */ +void arch_setup_new_exec(void) +{ + current->mm->context.flags = is_compat_task() ? 
MMCF_AARCH32 : 0; +} --- linux-azure-4.13.0.orig/arch/arm64/kernel/ptrace.c +++ linux-azure-4.13.0/arch/arm64/kernel/ptrace.c @@ -42,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -127,7 +128,7 @@ { return ((addr & ~(THREAD_SIZE - 1)) == (kernel_stack_pointer(regs) & ~(THREAD_SIZE - 1))) || - on_irq_stack(addr, raw_smp_processor_id()); + on_irq_stack(addr); } /** @@ -1363,7 +1364,7 @@ if (dir == PTRACE_SYSCALL_EXIT) tracehook_report_syscall_exit(regs, 0); else if (tracehook_report_syscall_entry(regs)) - regs->syscallno = ~0UL; + regs->syscallno = ~0; regs->regs[regno] = saved_reg; } --- linux-azure-4.13.0.orig/arch/arm64/kernel/relocate_kernel.S +++ linux-azure-4.13.0/arch/arm64/kernel/relocate_kernel.S @@ -45,6 +45,7 @@ mrs x0, sctlr_el2 ldr x1, =SCTLR_ELx_FLAGS bic x0, x0, x1 + pre_disable_mmu_workaround msr sctlr_el2, x0 isb 1: --- linux-azure-4.13.0.orig/arch/arm64/kernel/return_address.c +++ linux-azure-4.13.0/arch/arm64/kernel/return_address.c @@ -42,7 +42,6 @@ data.addr = NULL; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)return_address; /* dummy */ #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; --- linux-azure-4.13.0.orig/arch/arm64/kernel/signal.c +++ linux-azure-4.13.0/arch/arm64/kernel/signal.c @@ -29,6 +29,7 @@ #include #include #include +#include #include #include @@ -387,7 +388,7 @@ /* * Avoid sys_rt_sigreturn() restarting. */ - regs->syscallno = ~0UL; + regs->syscallno = ~0; err |= !valid_user_regs(®s->user_regs, current); if (err == 0) @@ -673,7 +674,7 @@ { unsigned long continue_addr = 0, restart_addr = 0; int retval = 0; - int syscall = (int)regs->syscallno; + int syscall = regs->syscallno; struct ksignal ksig; /* @@ -687,7 +688,7 @@ /* * Avoid additional syscall restarting via ret_to_user. */ - regs->syscallno = ~0UL; + regs->syscallno = ~0; /* * Prepare for system call restart. We do this here so that a @@ -749,7 +750,11 @@ * Update the trace code with the current status. */ trace_hardirqs_off(); + do { + /* Check valid user FS if needed */ + addr_limit_user_check(); + if (thread_flags & _TIF_NEED_RESCHED) { schedule(); } else { --- linux-azure-4.13.0.orig/arch/arm64/kernel/signal32.c +++ linux-azure-4.13.0/arch/arm64/kernel/signal32.c @@ -354,7 +354,7 @@ /* * Avoid compat_sys_sigreturn() restarting. */ - regs->syscallno = ~0UL; + regs->syscallno = ~0; err |= !valid_user_regs(®s->user_regs, current); --- linux-azure-4.13.0.orig/arch/arm64/kernel/sleep.S +++ linux-azure-4.13.0/arch/arm64/kernel/sleep.S @@ -95,7 +95,7 @@ ret ENDPROC(__cpu_suspend_enter) - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" ENTRY(cpu_resume) bl el2_setup // if in EL2 drop to EL1 cleanly bl __cpu_setup --- linux-azure-4.13.0.orig/arch/arm64/kernel/smp.c +++ linux-azure-4.13.0/arch/arm64/kernel/smp.c @@ -154,7 +154,7 @@ * page tables. 
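Several hunks above replace ~0UL with ~0 when flagging "no syscall": regs->syscallno is being narrowed to a 32-bit int elsewhere in this series, so the sentinel must be plain -1 rather than a 64-bit all-ones value that an int cannot hold. A trivial, self-contained sketch of the invariant:

	#include <assert.h>

	int main(void)
	{
		int syscallno = ~0;	/* the new sentinel: -1 in an int field */
		assert(syscallno == -1);
		return 0;
	}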
*/ secondary_data.task = idle; - secondary_data.stack = task_stack_page(idle) + THREAD_START_SP; + secondary_data.stack = task_stack_page(idle) + THREAD_SIZE; update_cpu_boot_status(CPU_MMU_OFF); __flush_dcache_area(&secondary_data, sizeof(secondary_data)); --- linux-azure-4.13.0.orig/arch/arm64/kernel/stacktrace.c +++ linux-azure-4.13.0/arch/arm64/kernel/stacktrace.c @@ -42,33 +42,17 @@ */ int notrace unwind_frame(struct task_struct *tsk, struct stackframe *frame) { - unsigned long high, low; unsigned long fp = frame->fp; - unsigned long irq_stack_ptr; + + if (fp & 0xf) + return -EINVAL; if (!tsk) tsk = current; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - - low = frame->sp; - /* irq stacks are not THREAD_SIZE aligned */ - if (on_irq_stack(frame->sp, raw_smp_processor_id())) - high = irq_stack_ptr; - else - high = ALIGN(low, THREAD_SIZE) - 0x20; - - if (fp < low || fp > high || fp & 0xf) + if (!on_accessible_stack(tsk, fp)) return -EINVAL; - frame->sp = fp + 0x10; frame->fp = READ_ONCE_NOCHECK(*(unsigned long *)(fp)); frame->pc = READ_ONCE_NOCHECK(*(unsigned long *)(fp + 8)); @@ -86,34 +70,13 @@ #endif /* CONFIG_FUNCTION_GRAPH_TRACER */ /* - * Check whether we are going to walk through from interrupt stack - * to task stack. - * If we reach the end of the stack - and its an interrupt stack, - * unpack the dummy frame to find the original elr. - * - * Check the frame->fp we read from the bottom of the irq_stack, - * and the original task stack pointer are both in current->stack. + * Frames created upon entry from EL0 have NULL FP and PC values, so + * don't bother reporting these. Frames created by __noreturn functions + * might have a valid FP even if PC is bogus, so only terminate where + * both are NULL. */ - if (frame->sp == irq_stack_ptr) { - struct pt_regs *irq_args; - unsigned long orig_sp = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); - - if (object_is_on_stack((void *)orig_sp) && - object_is_on_stack((void *)frame->fp)) { - frame->sp = orig_sp; - - /* orig_sp is the saved pt_regs, find the elr */ - irq_args = (struct pt_regs *)orig_sp; - frame->pc = irq_args->pc; - } else { - /* - * This frame has a non-standard format, and we - * didn't fix it, because the data looked wrong. - * Refuse to output this frame. 
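The rewritten unwind_frame() above reduces a frame step to: validate the frame pointer (16-byte aligned and on an accessible stack), then load the saved fp/pc pair it points at, terminating only when both are NULL. A self-contained sketch of that walk, with is_on_stack() as a hypothetical stand-in for on_accessible_stack():

	#include <stdbool.h>
	#include <stdint.h>

	struct stackframe { uint64_t fp, pc; };

	/* hypothetical stand-in for on_accessible_stack() */
	static bool is_on_stack(uint64_t fp) { return fp != 0; }

	static int unwind_one(struct stackframe *frame)
	{
		uint64_t fp = frame->fp;

		if ((fp & 0xf) || !is_on_stack(fp))
			return -1;
		frame->fp = ((uint64_t *)(uintptr_t)fp)[0];	/* saved fp */
		frame->pc = ((uint64_t *)(uintptr_t)fp)[1];	/* saved lr */
		/* frames built on entry from EL0 have NULL fp and pc */
		if (!frame->fp && !frame->pc)
			return -1;
		return 0;
	}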
- */ - return -EINVAL; - } - } + if (!frame->fp && !frame->pc) + return -EINVAL; return 0; } @@ -167,7 +130,6 @@ data.no_sched_functions = 0; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = current->curr_ret_stack; @@ -192,12 +154,10 @@ if (tsk != current) { data.no_sched_functions = 1; frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } else { data.no_sched_functions = 0; frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)save_stack_trace_tsk; } #ifdef CONFIG_FUNCTION_GRAPH_TRACER --- linux-azure-4.13.0.orig/arch/arm64/kernel/time.c +++ linux-azure-4.13.0/arch/arm64/kernel/time.c @@ -50,7 +50,6 @@ return regs->pc; frame.fp = regs->regs[29]; - frame.sp = regs->sp; frame.pc = regs->pc; #ifdef CONFIG_FUNCTION_GRAPH_TRACER frame.graph = -1; /* no task info */ --- linux-azure-4.13.0.orig/arch/arm64/kernel/traps.c +++ linux-azure-4.13.0/arch/arm64/kernel/traps.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include @@ -41,6 +42,7 @@ #include #include #include +#include #include #include #include @@ -116,7 +118,7 @@ for (i = -4; i < 1; i++) { unsigned int val, bad; - bad = __get_user(val, &((u32 *)addr)[i]); + bad = get_user(val, &((u32 *)addr)[i]); if (!bad) p += sprintf(p, i == 0 ? "(%08x) " : "%08x ", val); @@ -143,7 +145,6 @@ void dump_backtrace(struct pt_regs *regs, struct task_struct *tsk) { struct stackframe frame; - unsigned long irq_stack_ptr; int skip; pr_debug("%s(regs = %p tsk = %p)\n", __func__, regs, tsk); @@ -154,25 +155,14 @@ if (!try_get_task_stack(tsk)) return; - /* - * Switching between stacks is valid when tracing current and in - * non-preemptible context. - */ - if (tsk == current && !preemptible()) - irq_stack_ptr = IRQ_STACK_PTR(smp_processor_id()); - else - irq_stack_ptr = 0; - if (tsk == current) { frame.fp = (unsigned long)__builtin_frame_address(0); - frame.sp = current_stack_pointer; frame.pc = (unsigned long)dump_backtrace; } else { /* * task blocked in __switch_to */ frame.fp = thread_saved_fp(tsk); - frame.sp = thread_saved_sp(tsk); frame.pc = thread_saved_pc(tsk); } #ifdef CONFIG_FUNCTION_GRAPH_TRACER @@ -182,13 +172,12 @@ skip = !!regs; printk("Call trace:\n"); while (1) { - unsigned long where = frame.pc; unsigned long stack; int ret; /* skip until specified stack frame */ if (!skip) { - dump_backtrace_entry(where); + dump_backtrace_entry(frame.pc); } else if (frame.fp == regs->regs[29]) { skip = 0; /* @@ -203,20 +192,12 @@ ret = unwind_frame(tsk, &frame); if (ret < 0) break; - stack = frame.sp; - if (in_exception_text(where)) { - /* - * If we switched to the irq_stack before calling this - * exception handler, then the pt_regs will be on the - * task stack. The easiest way to tell is if the large - * pt_regs would overlap with the end of the irq_stack. 
- */ - if (stack < irq_stack_ptr && - (stack + sizeof(struct pt_regs)) > irq_stack_ptr) - stack = IRQ_STACK_TO_TASK_STACK(irq_stack_ptr); + if (in_entry_text(frame.pc)) { + stack = frame.fp - offsetof(struct pt_regs, stackframe); - dump_mem("", "Exception stack", stack, - stack + sizeof(struct pt_regs)); + if (on_accessible_stack(tsk, stack)) + dump_mem("", "Exception stack", stack, + stack + sizeof(struct pt_regs)); } } @@ -484,6 +465,9 @@ case ESR_ELx_SYS64_ISS_CRM_DC_CVAC: /* DC CVAC, gets promoted */ __user_cache_maint("dc civac", address, ret); break; + case ESR_ELx_SYS64_ISS_CRM_DC_CVAP: /* DC CVAP */ + __user_cache_maint("sys 3, c7, c12, 1", address, ret); + break; case ESR_ELx_SYS64_ISS_CRM_DC_CIVAC: /* DC CIVAC */ __user_cache_maint("dc civac", address, ret); break; @@ -593,7 +577,7 @@ if (show_unhandled_signals_ratelimited()) { pr_info("%s[%d]: syscall %d\n", current->comm, - task_pid_nr(current), (int)regs->syscallno); + task_pid_nr(current), regs->syscallno); dump_instr("", regs); if (user_mode(regs)) __show_regs(regs); @@ -689,6 +673,43 @@ force_sig_info(info.si_signo, &info, current); } +#ifdef CONFIG_VMAP_STACK + +DEFINE_PER_CPU(unsigned long [OVERFLOW_STACK_SIZE/sizeof(long)], overflow_stack) + __aligned(16); + +asmlinkage void handle_bad_stack(struct pt_regs *regs) +{ + unsigned long tsk_stk = (unsigned long)current->stack; + unsigned long irq_stk = (unsigned long)this_cpu_read(irq_stack_ptr); + unsigned long ovf_stk = (unsigned long)this_cpu_ptr(overflow_stack); + unsigned int esr = read_sysreg(esr_el1); + unsigned long far = read_sysreg(far_el1); + + console_verbose(); + pr_emerg("Insufficient stack space to handle exception!"); + + pr_emerg("ESR: 0x%08x -- %s\n", esr, esr_get_class_string(esr)); + pr_emerg("FAR: 0x%016lx\n", far); + + pr_emerg("Task stack: [0x%016lx..0x%016lx]\n", + tsk_stk, tsk_stk + THREAD_SIZE); + pr_emerg("IRQ stack: [0x%016lx..0x%016lx]\n", + irq_stk, irq_stk + THREAD_SIZE); + pr_emerg("Overflow stack: [0x%016lx..0x%016lx]\n", + ovf_stk, ovf_stk + OVERFLOW_STACK_SIZE); + + __show_regs(regs); + + /* + * We use nmi_panic to limit the potential for recursive overflows, and + * to get a better stack trace. + */ + nmi_panic(NULL, "kernel stack overflow"); + cpu_park_loop(); +} +#endif + void __pte_error(const char *file, int line, unsigned long val) { pr_err("%s:%d: bad pte %016lx.\n", file, line, val); --- linux-azure-4.13.0.orig/arch/arm64/kernel/vdso.c +++ linux-azure-4.13.0/arch/arm64/kernel/vdso.c @@ -110,12 +110,27 @@ } #endif /* CONFIG_COMPAT */ +static int vdso_mremap(const struct vm_special_mapping *sm, + struct vm_area_struct *new_vma) +{ + unsigned long new_size = new_vma->vm_end - new_vma->vm_start; + unsigned long vdso_size = vdso_end - vdso_start; + + if (vdso_size != new_size) + return -EINVAL; + + current->mm->context.vdso = (void *)new_vma->vm_start; + + return 0; +} + static struct vm_special_mapping vdso_spec[2] __ro_after_init = { { .name = "[vvar]", }, { .name = "[vdso]", + .mremap = vdso_mremap, }, }; --- linux-azure-4.13.0.orig/arch/arm64/kernel/vmlinux.lds.S +++ linux-azure-4.13.0/arch/arm64/kernel/vmlinux.lds.S @@ -56,6 +56,17 @@ #define HIBERNATE_TEXT #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define TRAMP_TEXT \ + . = ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_start) = .; \ + *(.entry.tramp.text) \ + .
= ALIGN(PAGE_SIZE); \ + VMLINUX_SYMBOL(__entry_tramp_text_end) = .; +#else +#define TRAMP_TEXT +#endif + /* * The size of the PE/COFF section that covers the kernel image, which * runs from stext to _edata, must be a round multiple of the PE/COFF @@ -72,22 +83,6 @@ #define PECOFF_EDATA_PADDING #endif -#if defined(CONFIG_DEBUG_ALIGN_RODATA) -/* - * 4 KB granule: 1 level 2 entry - * 16 KB granule: 128 level 3 entries, with contiguous bit - * 64 KB granule: 32 level 3 entries, with contiguous bit - */ -#define SEGMENT_ALIGN SZ_2M -#else -/* - * 4 KB granule: 16 level 3 entries, with contiguous bit - * 16 KB granule: 4 level 3 entries, without contiguous bit - * 64 KB granule: 1 level 3 entry - */ -#define SEGMENT_ALIGN SZ_64K -#endif - SECTIONS { /* @@ -128,6 +123,7 @@ HYPERVISOR_TEXT IDMAP_TEXT HIBERNATE_TEXT + TRAMP_TEXT *(.fixup) *(.gnu.warning) . = ALIGN(16); @@ -192,7 +188,7 @@ _data = .; _sdata = .; - RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_SIZE) + RW_DATA_SECTION(L1_CACHE_BYTES, PAGE_SIZE, THREAD_ALIGN) /* * Data written with the MMU off but read with the MMU on requires @@ -229,6 +225,11 @@ . += RESERVED_TTBR0_SIZE; #endif +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + tramp_pg_dir = .; + . += PAGE_SIZE; +#endif + __pecoff_data_size = ABSOLUTE(. - __initdata_begin); _end = .; @@ -249,7 +250,10 @@ ASSERT(__hibernate_exit_text_end - (__hibernate_exit_text_start & ~(SZ_4K - 1)) <= SZ_4K, "Hibernate exit text too big or misaligned") #endif - +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +ASSERT((__entry_tramp_text_end - __entry_tramp_text_start) == PAGE_SIZE, + "Entry trampoline text too big") +#endif /* * If padding is applied before .head.text, virt<->phys conversions will fail. */ --- linux-azure-4.13.0.orig/arch/arm64/kvm/handle_exit.c +++ linux-azure-4.13.0/arch/arm64/kvm/handle_exit.c @@ -22,12 +22,13 @@ #include #include +#include + #include #include #include #include #include -#include #define CREATE_TRACE_POINTS #include "trace.h" @@ -42,9 +43,9 @@ kvm_vcpu_hvc_get_imm(vcpu)); vcpu->stat.hvc_exit_stat++; - ret = kvm_psci_call(vcpu); + ret = kvm_hvc_call_handler(vcpu); if (ret < 0) { - kvm_inject_undefined(vcpu); + vcpu_set_reg(vcpu, 0, ~0UL); return 1; } @@ -53,7 +54,16 @@ static int handle_smc(struct kvm_vcpu *vcpu, struct kvm_run *run) { - kvm_inject_undefined(vcpu); + /* + * "If an SMC instruction executed at Non-secure EL1 is + * trapped to EL2 because HCR_EL2.TSC is 1, the exception is a + * Trap exception, not a Secure Monitor Call exception [...]" + * + * We need to advance the PC after the trap, as it would + * otherwise return to the same address... 
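In the handle_hvc()/handle_smc() hunks above, an unhandled hypercall no longer injects an UNDEF into the guest; instead x0 is loaded with ~0UL, the SMCCC convention for "not supported", and, for SMC, the PC is advanced past the trapped instruction. For illustration, the value the guest observes, under a name that is this editor's assumption (the patch itself uses the literal ~0UL):

	#include <stdio.h>

	#define SMCCC_RET_NOT_SUPPORTED	(~0UL)	/* assumed label for ~0UL */

	int main(void)
	{
		printf("x0 after an unhandled HVC/SMC: %#lx\n",
		       SMCCC_RET_NOT_SUPPORTED);
		return 0;
	}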
+ */ + vcpu_set_reg(vcpu, 0, ~0UL); + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); return 1; } --- linux-azure-4.13.0.orig/arch/arm64/kvm/hyp-init.S +++ linux-azure-4.13.0/arch/arm64/kvm/hyp-init.S @@ -151,6 +151,7 @@ mrs x5, sctlr_el2 ldr x6, =SCTLR_ELx_FLAGS bic x5, x5, x6 // Clear SCTL_M and etc + pre_disable_mmu_workaround msr sctlr_el2, x5 isb --- linux-azure-4.13.0.orig/arch/arm64/kvm/hyp/Makefile +++ linux-azure-4.13.0/arch/arm64/kvm/hyp/Makefile @@ -2,7 +2,7 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # -ccflags-y += -fno-stack-protector +ccflags-y += -fno-stack-protector -DDISABLE_BRANCH_PROFILING KVM=../../../../virt/kvm --- linux-azure-4.13.0.orig/arch/arm64/kvm/hyp/entry.S +++ linux-azure-4.13.0/arch/arm64/kvm/hyp/entry.S @@ -196,3 +196,15 @@ eret ENDPROC(__fpsimd_guest_restore) + +ENTRY(__qcom_hyp_sanitize_btac_predictors) + /** + * Call SMC64 with Silicon provider serviceID 23<<8 (0xc2001700) + * 0xC2000000-0xC200FFFF: assigned to SiP Service Calls + * b15-b0: contains SiP functionID + */ + movz x0, #0x1700 + movk x0, #0xc200, lsl #16 + smc #0 + ret +ENDPROC(__qcom_hyp_sanitize_btac_predictors) --- linux-azure-4.13.0.orig/arch/arm64/kvm/hyp/hyp-entry.S +++ linux-azure-4.13.0/arch/arm64/kvm/hyp/hyp-entry.S @@ -15,6 +15,7 @@ * along with this program. If not, see . */ +#include #include #include @@ -64,10 +65,11 @@ lsr x0, x1, #ESR_ELx_EC_SHIFT cmp x0, #ESR_ELx_EC_HVC64 + ccmp x0, #ESR_ELx_EC_HVC32, #4, ne b.ne el1_trap - mrs x1, vttbr_el2 // If vttbr is valid, the 64bit guest - cbnz x1, el1_trap // called HVC + mrs x1, vttbr_el2 // If vttbr is valid, the guest + cbnz x1, el1_hvc_guest // called HVC /* Here, we're pretty sure the host called HVC. */ ldp x0, x1, [sp], #16 @@ -100,6 +102,20 @@ eret +el1_hvc_guest: + /* + * Fastest possible path for ARM_SMCCC_ARCH_WORKAROUND_1. + * The workaround has already been applied on the host, + * so let's quickly get back to the guest. We don't bother + * restoring x1, as it can be clobbered anyway. + */ + ldr x1, [sp] // Guest's x0 + eor w1, w1, #ARM_SMCCC_ARCH_WORKAROUND_1 + cbnz w1, el1_trap + mov x0, x1 + add sp, sp, #16 + eret + el1_trap: /* * x0: ESR_EC --- linux-azure-4.13.0.orig/arch/arm64/kvm/hyp/s2-setup.c +++ linux-azure-4.13.0/arch/arm64/kvm/hyp/s2-setup.c @@ -70,7 +70,7 @@ * Management in ID_AA64MMFR1_EL1 and enable the feature in VTCR_EL2. 
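The el1_hvc_guest fast path above recognises ARM_SMCCC_ARCH_WORKAROUND_1 with a single eor/cbnz on the guest's x0, so the branch-predictor-hardening call returns to the guest without a full trap round trip. A sketch of the equivalent predicate in C (the function ID is the value assigned by the SMCCC specification):

	#include <stdbool.h>
	#include <stdint.h>

	#define ARM_SMCCC_ARCH_WORKAROUND_1	0x80008000u

	static bool is_bp_hardening_fastpath(uint32_t guest_x0)
	{
		/* eor + cbnz: take the fast path only on an exact match */
		return (guest_x0 ^ ARM_SMCCC_ARCH_WORKAROUND_1) == 0;
	}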
*/ tmp = (read_sysreg(id_aa64mmfr1_el1) >> ID_AA64MMFR1_HADBS_SHIFT) & 0xf; - if (IS_ENABLED(CONFIG_ARM64_HW_AFDBM) && tmp) + if (tmp) val |= VTCR_EL2_HA; /* --- linux-azure-4.13.0.orig/arch/arm64/kvm/hyp/switch.c +++ linux-azure-4.13.0/arch/arm64/kvm/hyp/switch.c @@ -17,6 +17,9 @@ #include #include +#include + +#include #include #include @@ -51,7 +54,7 @@ val &= ~CPACR_EL1_FPEN; write_sysreg(val, cpacr_el1); - write_sysreg(__kvm_hyp_vector, vbar_el1); + write_sysreg(kvm_get_hyp_vector(), vbar_el1); } static void __hyp_text __activate_traps_nvhe(void) @@ -364,6 +367,16 @@ /* 0 falls through to be handled out of EL2 */ } + if (cpus_have_const_cap(ARM64_HARDEN_BP_POST_GUEST_EXIT)) { + u32 midr = read_cpuid_id(); + + /* Apply BTAC predictors mitigation to all Falkor chips */ + if (((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR) || + ((midr & MIDR_CPU_MODEL_MASK) == MIDR_QCOM_FALKOR_V1)) { + __qcom_hyp_sanitize_btac_predictors(); + } + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); --- linux-azure-4.13.0.orig/arch/arm64/kvm/inject_fault.c +++ linux-azure-4.13.0/arch/arm64/kvm/inject_fault.c @@ -33,12 +33,26 @@ #define LOWER_EL_AArch64_VECTOR 0x400 #define LOWER_EL_AArch32_VECTOR 0x600 +/* + * Table taken from ARMv8 ARM DDI0487B-B, table G1-10. + */ +static const u8 return_offsets[8][2] = { + [0] = { 0, 0 }, /* Reset, unused */ + [1] = { 4, 2 }, /* Undefined */ + [2] = { 0, 0 }, /* SVC, unused */ + [3] = { 4, 4 }, /* Prefetch abort */ + [4] = { 8, 8 }, /* Data abort */ + [5] = { 0, 0 }, /* HVC, unused */ + [6] = { 4, 4 }, /* IRQ, unused */ + [7] = { 4, 4 }, /* FIQ, unused */ +}; + static void prepare_fault32(struct kvm_vcpu *vcpu, u32 mode, u32 vect_offset) { unsigned long cpsr; unsigned long new_spsr_value = *vcpu_cpsr(vcpu); bool is_thumb = (new_spsr_value & COMPAT_PSR_T_BIT); - u32 return_offset = (is_thumb) ? 4 : 0; + u32 return_offset = return_offsets[vect_offset >> 2][is_thumb]; u32 sctlr = vcpu_cp15(vcpu, c1_SCTLR); cpsr = mode | COMPAT_PSR_I_BIT; --- linux-azure-4.13.0.orig/arch/arm64/kvm/vgic-sys-reg-v3.c +++ linux-azure-4.13.0/arch/arm64/kvm/vgic-sys-reg-v3.c @@ -208,29 +208,12 @@ static bool access_gic_aprn(struct kvm_vcpu *vcpu, struct sys_reg_params *p, const struct sys_reg_desc *r, u8 apr) { - struct vgic_cpu *vgic_v3_cpu = &vcpu->arch.vgic_cpu; u8 idx = r->Op2 & 3; - /* - * num_pri_bits are initialized with HW supported values. - * We can rely safely on num_pri_bits even if VM has not - * restored ICC_CTLR_EL1 before restoring APnR registers. 
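The return_offsets[][] table above (from ARMv8 ARM DDI0487B-B, table G1-10) gives, per AArch32 exception vector, how far the banked LR must point past the faulting instruction, for ARM and Thumb encodings respectively. A hedged sketch of how prepare_fault32() derives the saved LR from it:

	#include <stdint.h>

	static const uint8_t return_offsets[8][2] = {
		[1] = { 4, 2 },		/* Undefined: ARM, Thumb */
		[3] = { 4, 4 },		/* Prefetch abort */
		[4] = { 8, 8 },		/* Data abort */
	};

	/* vect_offset is the byte offset into the AArch32 vector table */
	static uint32_t banked_lr(uint32_t pc, uint32_t vect_offset, int thumb)
	{
		return pc + return_offsets[vect_offset >> 2][thumb];
	}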
- */ - switch (vgic_v3_cpu->num_pri_bits) { - case 7: - vgic_v3_access_apr_reg(vcpu, p, apr, idx); - break; - case 6: - if (idx > 1) - goto err; - vgic_v3_access_apr_reg(vcpu, p, apr, idx); - break; - default: - if (idx > 0) - goto err; - vgic_v3_access_apr_reg(vcpu, p, apr, idx); - } + if (idx > vgic_v3_max_apr_idx(vcpu)) + goto err; + vgic_v3_access_apr_reg(vcpu, p, apr, idx); return true; err: if (!p->is_write) --- linux-azure-4.13.0.orig/arch/arm64/lib/Makefile +++ linux-azure-4.13.0/arch/arm64/lib/Makefile @@ -17,3 +17,5 @@ -fcall-saved-x10 -fcall-saved-x11 -fcall-saved-x12 \ -fcall-saved-x13 -fcall-saved-x14 -fcall-saved-x15 \ -fcall-saved-x18 + +lib-$(CONFIG_ARCH_HAS_UACCESS_FLUSHCACHE) += uaccess_flushcache.o --- linux-azure-4.13.0.orig/arch/arm64/lib/clear_user.S +++ linux-azure-4.13.0/arch/arm64/lib/clear_user.S @@ -21,7 +21,7 @@ .text -/* Prototype: int __clear_user(void *addr, size_t sz) +/* Prototype: int __arch_clear_user(void *addr, size_t sz) * Purpose : clear some user memory * Params : addr - user memory address to clear * : sz - number of bytes to clear @@ -29,8 +29,8 @@ * * Alignment fixed up by hardware. */ -ENTRY(__clear_user) - uaccess_enable_not_uao x2, x3 +ENTRY(__arch_clear_user) + uaccess_enable_not_uao x2, x3, x4 mov x2, x1 // save the size for fixup return subs x1, x1, #8 b.mi 2f @@ -50,9 +50,9 @@ b.mi 5f uao_user_alternative 9f, strb, sttrb, wzr, x0, 0 5: mov x0, #0 - uaccess_disable_not_uao x2 + uaccess_disable_not_uao x2, x3 ret -ENDPROC(__clear_user) +ENDPROC(__arch_clear_user) .section .fixup,"ax" .align 2 --- linux-azure-4.13.0.orig/arch/arm64/lib/copy_from_user.S +++ linux-azure-4.13.0/arch/arm64/lib/copy_from_user.S @@ -64,10 +64,10 @@ end .req x5 ENTRY(__arch_copy_from_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 // Nothing to copy ret ENDPROC(__arch_copy_from_user) --- linux-azure-4.13.0.orig/arch/arm64/lib/copy_in_user.S +++ linux-azure-4.13.0/arch/arm64/lib/copy_in_user.S @@ -64,14 +64,15 @@ .endm end .req x5 -ENTRY(raw_copy_in_user) - uaccess_enable_not_uao x3, x4 + +ENTRY(__arch_copy_in_user) + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 ret -ENDPROC(raw_copy_in_user) +ENDPROC(__arch_copy_in_user) .section .fixup,"ax" .align 2 --- linux-azure-4.13.0.orig/arch/arm64/lib/copy_to_user.S +++ linux-azure-4.13.0/arch/arm64/lib/copy_to_user.S @@ -63,10 +63,10 @@ end .req x5 ENTRY(__arch_copy_to_user) - uaccess_enable_not_uao x3, x4 + uaccess_enable_not_uao x3, x4, x5 add end, x0, x2 #include "copy_template.S" - uaccess_disable_not_uao x3 + uaccess_disable_not_uao x3, x4 mov x0, #0 ret ENDPROC(__arch_copy_to_user) --- linux-azure-4.13.0.orig/arch/arm64/lib/uaccess_flushcache.c +++ linux-azure-4.13.0/arch/arm64/lib/uaccess_flushcache.c @@ -0,0 +1,47 @@ +/* + * Copyright (C) 2017 ARM Ltd. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. 
+ * + * You should have received a copy of the GNU General Public License + * along with this program. If not, see . + */ + +#include +#include +#include + +void memcpy_flushcache(void *dst, const void *src, size_t cnt) +{ + /* + * We assume this should not be called with @dst pointing to + * non-cacheable memory, such that we don't need an explicit + * barrier to order the cache maintenance against the memcpy. + */ + memcpy(dst, src, cnt); + __clean_dcache_area_pop(dst, cnt); +} +EXPORT_SYMBOL_GPL(memcpy_flushcache); + +void memcpy_page_flushcache(char *to, struct page *page, size_t offset, + size_t len) +{ + memcpy_flushcache(to, page_address(page) + offset, len); +} + +unsigned long __copy_user_flushcache(void *to, const void __user *from, + unsigned long n) +{ + unsigned long rc = __arch_copy_from_user(to, from, n); + + /* See above */ + __clean_dcache_area_pop(to, n - rc); + return rc; +} --- linux-azure-4.13.0.orig/arch/arm64/mm/cache.S +++ linux-azure-4.13.0/arch/arm64/mm/cache.S @@ -49,7 +49,7 @@ * - end - virtual end address of region */ ENTRY(__flush_cache_user_range) - uaccess_ttbr0_enable x2, x3 + uaccess_ttbr0_enable x2, x3, x4 dcache_line_size x2, x3 sub x3, x2, #1 bic x4, x0, x3 @@ -72,7 +72,7 @@ isb mov x0, #0 1: - uaccess_ttbr0_disable x1 + uaccess_ttbr0_disable x1, x2 ret 9: mov x0, #-EFAULT @@ -109,20 +109,25 @@ ENDPROC(__clean_dcache_area_pou) /* - * __dma_inv_area(start, size) - * - start - virtual start address of region + * __inval_dcache_area(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are invalidated. Any partial lines at the ends of the interval are + * also cleaned to PoC to prevent data loss. + * + * - kaddr - kernel address * - size - size in question */ -__dma_inv_area: - add x1, x1, x0 +ENTRY(__inval_dcache_area) /* FALLTHROUGH */ /* - * __inval_cache_range(start, end) - * - start - start address of region - * - end - end address of region + * __dma_inv_area(start, size) + * - start - virtual start address of region + * - size - size in question */ -ENTRY(__inval_cache_range) +__dma_inv_area: + add x1, x1, x0 dcache_line_size x2, x3 sub x3, x2, #1 tst x1, x3 // end cache line aligned? @@ -140,7 +145,7 @@ b.lo 2b dsb sy ret -ENDPIPROC(__inval_cache_range) +ENDPIPROC(__inval_dcache_area) ENDPROC(__dma_inv_area) /* @@ -167,6 +172,20 @@ ENDPROC(__dma_clean_area) /* + * __clean_dcache_area_pop(kaddr, size) + * + * Ensure that any D-cache lines for the interval [kaddr, kaddr+size) + * are cleaned to the PoP. 
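memcpy_flushcache() above pairs an ordinary copy with a clean to the point of persistence, which is what lets pmem users make stores durable without flushing whole regions. A hedged usage sketch; clean_to_pop() here stands in for __clean_dcache_area_pop() (a DC CVAP loop on real hardware):

	#include <string.h>

	static void clean_to_pop(void *addr, size_t size)
	{
		(void)addr;	/* placeholder for the cache-maintenance loop */
		(void)size;
	}

	static void pmem_write(void *dst, const void *src, size_t n)
	{
		memcpy(dst, src, n);	/* copy through the cache... */
		clean_to_pop(dst, n);	/* ...then push it to persistence */
	}

The same ordering underlies __copy_user_flushcache(), which only cleans the n - rc bytes that were actually copied.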
+ * + * - kaddr - kernel address + * - size - size in question + */ +ENTRY(__clean_dcache_area_pop) + dcache_by_line_op cvap, sy, x0, x1, x2, x3 + ret +ENDPIPROC(__clean_dcache_area_pop) + +/* * __dma_flush_area(start, size) * * clean & invalidate D / U line --- linux-azure-4.13.0.orig/arch/arm64/mm/context.c +++ linux-azure-4.13.0/arch/arm64/mm/context.c @@ -39,7 +39,16 @@ #define ASID_MASK (~GENMASK(asid_bits - 1, 0)) #define ASID_FIRST_VERSION (1UL << asid_bits) -#define NUM_USER_ASIDS ASID_FIRST_VERSION + +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +#define NUM_USER_ASIDS (ASID_FIRST_VERSION >> 1) +#define asid2idx(asid) (((asid) & ~ASID_MASK) >> 1) +#define idx2asid(idx) (((idx) << 1) & ~ASID_MASK) +#else +#define NUM_USER_ASIDS (ASID_FIRST_VERSION) +#define asid2idx(asid) ((asid) & ~ASID_MASK) +#define idx2asid(idx) asid2idx(idx) +#endif /* Get the ASIDBits supported by the current CPU */ static u32 get_cpu_asid_bits(void) @@ -79,13 +88,6 @@ } } -static void set_reserved_asid_bits(void) -{ - if (IS_ENABLED(CONFIG_QCOM_FALKOR_ERRATUM_1003) && - cpus_have_const_cap(ARM64_WORKAROUND_QCOM_FALKOR_E1003)) - __set_bit(FALKOR_RESERVED_ASID, asid_map); -} - static void flush_context(unsigned int cpu) { int i; @@ -94,8 +96,6 @@ /* Update the list of reserved ASIDs and the ASID bitmap. */ bitmap_clear(asid_map, 0, NUM_USER_ASIDS); - set_reserved_asid_bits(); - /* * Ensure the generation bump is observed before we xchg the * active_asids. @@ -113,7 +113,7 @@ */ if (asid == 0) asid = per_cpu(reserved_asids, i); - __set_bit(asid & ~ASID_MASK, asid_map); + __set_bit(asid2idx(asid), asid_map); per_cpu(reserved_asids, i) = asid; } @@ -165,16 +165,16 @@ * We had a valid ASID in a previous life, so try to re-use * it if possible. */ - asid &= ~ASID_MASK; - if (!__test_and_set_bit(asid, asid_map)) + if (!__test_and_set_bit(asid2idx(asid), asid_map)) return newasid; } /* * Allocate a free ASID. If we can't find one, take a note of the - * currently active ASIDs and mark the TLBs as requiring flushes. - * We always count from ASID #1, as we use ASID #0 when setting a - * reserved TTBR0 for the init_mm. + * currently active ASIDs and mark the TLBs as requiring flushes. We + * always count from ASID #2 (index 1), as we use ASID #0 when setting + * a reserved TTBR0 for the init_mm and we allocate ASIDs in even/odd + * pairs. */ asid = find_next_zero_bit(asid_map, NUM_USER_ASIDS, cur_idx); if (asid != NUM_USER_ASIDS) @@ -191,7 +191,7 @@ set_asid: __set_bit(asid, asid_map); cur_idx = asid; - return asid | generation; + return idx2asid(asid) | generation; } void check_and_switch_context(struct mm_struct *mm, unsigned int cpu) @@ -227,6 +227,9 @@ raw_spin_unlock_irqrestore(&cpu_asid_lock, flags); switch_mm_fastpath: + + arm64_apply_bp_hardening(); + /* * Defer TTBR0_EL1 setting for user threads to uaccess_enable() when * emulating PAN. @@ -235,6 +238,15 @@ cpu_switch_mm(mm->pgd, mm); } +/* Errata workaround post TTBRx_EL1 update. 
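With CONFIG_UNMAP_KERNEL_AT_EL0, the allocator above halves the ASID space and hands out even values only: asid2idx() drops bit 0 and idx2asid() restores it as zero, leaving each odd sibling free for the entry trampoline to derive the userspace half of the pair by flipping bit 0. A self-contained check of the pairing arithmetic:

	#include <assert.h>

	#define ASID_BITS	16
	#define ASID_MASK	(~0UL << ASID_BITS)
	#define asid2idx(asid)	(((asid) & ~ASID_MASK) >> 1)
	#define idx2asid(idx)	(((idx) << 1) & ~ASID_MASK)

	int main(void)
	{
		for (unsigned long idx = 1; idx < 8; idx++) {
			assert((idx2asid(idx) & 1) == 0);	/* always even */
			assert(asid2idx(idx2asid(idx)) == idx);	/* round trip */
		}
		return 0;
	}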
*/ +asmlinkage void post_ttbr_update_workaround(void) +{ + asm(ALTERNATIVE("nop; nop; nop", + "ic iallu; dsb nsh; isb", + ARM64_WORKAROUND_CAVIUM_27456, + CONFIG_CAVIUM_ERRATUM_27456)); +} + static int asids_init(void) { asid_bits = get_cpu_asid_bits(); @@ -250,8 +262,6 @@ panic("Failed to allocate bitmap for %lu ASIDs\n", NUM_USER_ASIDS); - set_reserved_asid_bits(); - pr_info("ASID allocator initialised with %lu entries\n", NUM_USER_ASIDS); return 0; } --- linux-azure-4.13.0.orig/arch/arm64/mm/fault.c +++ linux-azure-4.13.0/arch/arm64/mm/fault.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -139,7 +140,6 @@ pr_cont("\n"); } -#ifdef CONFIG_ARM64_HW_AFDBM /* * This function sets the access flags (dirty, accessed), as well as write * permission, and only to a more permissive setting. @@ -154,18 +154,13 @@ unsigned long address, pte_t *ptep, pte_t entry, int dirty) { - pteval_t old_pteval; - unsigned int tmp; + pteval_t old_pteval, pteval; if (pte_same(*ptep, entry)) return 0; /* only preserve the access flags and write permission */ - pte_val(entry) &= PTE_AF | PTE_WRITE | PTE_DIRTY; - - /* set PTE_RDONLY if actual read-only or clean PTE */ - if (!pte_write(entry) || !pte_sw_dirty(entry)) - pte_val(entry) |= PTE_RDONLY; + pte_val(entry) &= PTE_RDONLY | PTE_AF | PTE_WRITE | PTE_DIRTY; /* * Setting the flags must be done atomically to avoid racing with the @@ -174,21 +169,18 @@ * (calculated as: a & b == ~(~a | ~b)). */ pte_val(entry) ^= PTE_RDONLY; - asm volatile("// ptep_set_access_flags\n" - " prfm pstl1strm, %2\n" - "1: ldxr %0, %2\n" - " eor %0, %0, %3 // negate PTE_RDONLY in *ptep\n" - " orr %0, %0, %4 // set flags\n" - " eor %0, %0, %3 // negate final PTE_RDONLY\n" - " stxr %w1, %0, %2\n" - " cbnz %w1, 1b\n" - : "=&r" (old_pteval), "=&r" (tmp), "+Q" (pte_val(*ptep)) - : "L" (PTE_RDONLY), "r" (pte_val(entry))); + pteval = READ_ONCE(pte_val(*ptep)); + do { + old_pteval = pteval; + pteval ^= PTE_RDONLY; + pteval |= pte_val(entry); + pteval ^= PTE_RDONLY; + pteval = cmpxchg_relaxed(&pte_val(*ptep), old_pteval, pteval); + } while (pteval != old_pteval); flush_tlb_fix_spurious_fault(vma, address); return 1; } -#endif static bool is_el1_instruction_abort(unsigned int esr) { @@ -207,7 +199,7 @@ if (fsc_type == ESR_ELx_FSC_PERM) return true; - if (addr < USER_DS && system_uses_ttbr0_pan()) + if (addr < TASK_SIZE && system_uses_ttbr0_pan()) return fsc_type == ESR_ELx_FSC_FAULT && (regs->pstate & PSR_PAN_BIT); @@ -389,7 +381,7 @@ mm_flags |= FAULT_FLAG_WRITE; } - if (addr < USER_DS && is_permission_fault(esr, regs, addr)) { + if (addr < TASK_SIZE && is_permission_fault(esr, regs, addr)) { /* regs->orig_addr_limit may be 0 if we entered from EL0 */ if (regs->orig_addr_limit == KERNEL_DS) die("Accessing user space memory with fs=KERNEL_DS", regs, esr); @@ -614,7 +606,7 @@ { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 0 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 1 translation fault" }, { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 2 translation fault" }, - { do_page_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, + { do_translation_fault, SIGSEGV, SEGV_MAPERR, "level 3 translation fault" }, { do_bad, SIGBUS, 0, "unknown 8" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 1 access flag fault" }, { do_page_fault, SIGSEGV, SEGV_ACCERR, "level 2 access flag fault" }, @@ -712,6 +704,29 @@ arm64_notify_die("", regs, &info, esr); } +asmlinkage void __exception do_el0_irq_bp_hardening(void) +{ + /* PC has already been 
checked in entry.S */ + arm64_apply_bp_hardening(); +} + +asmlinkage void __exception do_el0_ia_bp_hardening(unsigned long addr, + unsigned int esr, + struct pt_regs *regs) +{ + /* + * We've taken an instruction abort from userspace and not yet + * re-enabled IRQs. If the address is a kernel address, apply + * BP hardening prior to enabling IRQs and pre-emption. + */ + if (addr > TASK_SIZE) + arm64_apply_bp_hardening(); + + local_irq_enable(); + do_mem_abort(addr, esr, regs); +} + + /* * Handle stack alignment exceptions. */ @@ -722,6 +737,12 @@ struct siginfo info; struct task_struct *tsk = current; + if (user_mode(regs)) { + if (instruction_pointer(regs) > TASK_SIZE) + arm64_apply_bp_hardening(); + local_irq_enable(); + } + if (show_unhandled_signals && unhandled_signal(tsk, SIGBUS)) pr_info_ratelimited("%s[%d]: %s exception: pc=%p sp=%p\n", tsk->comm, task_pid_nr(tsk), @@ -781,6 +802,9 @@ if (interrupts_enabled(regs)) trace_hardirqs_off(); + if (user_mode(regs) && instruction_pointer(regs) > TASK_SIZE) + arm64_apply_bp_hardening(); + if (!inf->fn(addr, esr, regs)) { rv = 1; } else { --- linux-azure-4.13.0.orig/arch/arm64/mm/flush.c +++ linux-azure-4.13.0/arch/arm64/mm/flush.c @@ -83,3 +83,19 @@ * Additional functions defined in assembly. */ EXPORT_SYMBOL(flush_icache_range); + +#ifdef CONFIG_ARCH_HAS_PMEM_API +void arch_wb_cache_pmem(void *addr, size_t size) +{ + /* Ensure order against any prior non-cacheable writes */ + dmb(osh); + __clean_dcache_area_pop(addr, size); +} +EXPORT_SYMBOL_GPL(arch_wb_cache_pmem); + +void arch_invalidate_pmem(void *addr, size_t size) +{ + __inval_dcache_area(addr, size); +} +EXPORT_SYMBOL_GPL(arch_invalidate_pmem); +#endif --- linux-azure-4.13.0.orig/arch/arm64/mm/mmu.c +++ linux-azure-4.13.0/arch/arm64/mm/mmu.c @@ -107,7 +107,7 @@ * The following mapping attributes may be updated in live * kernel mappings without the need for break-before-make. */ - static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE; + static const pteval_t mask = PTE_PXN | PTE_RDONLY | PTE_WRITE | PTE_NG; /* creating or taking down mappings is always safe */ if (old == 0 || new == 0) @@ -117,6 +117,10 @@ if ((old | new) & PTE_CONT) return false; + /* Transitioning from Non-Global to Global is unsafe */ + if (old & ~new & PTE_NG) + return false; + return ((old ^ new) & ~mask) == 0; } @@ -525,6 +529,37 @@ } early_param("rodata", parse_rodata); +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 +static int __init map_entry_trampoline(void) +{ + extern char __entry_tramp_text_start[]; + + pgprot_t prot = rodata_enabled ? PAGE_KERNEL_ROX : PAGE_KERNEL_EXEC; + phys_addr_t pa_start = __pa_symbol(__entry_tramp_text_start); + + /* The trampoline is always mapped and can therefore be global */ + pgprot_val(prot) &= ~PTE_NG; + + /* Map only the text into the trampoline page table */ + memset(tramp_pg_dir, 0, PGD_SIZE); + __create_pgd_mapping(tramp_pg_dir, pa_start, TRAMP_VALIAS, PAGE_SIZE, + prot, pgd_pgtable_alloc, 0); + + /* Map both the text and data into the kernel page table */ + __set_fixmap(FIX_ENTRY_TRAMP_TEXT, pa_start, prot); + if (IS_ENABLED(CONFIG_RANDOMIZE_BASE)) { + extern char __entry_tramp_data_start[]; + + __set_fixmap(FIX_ENTRY_TRAMP_DATA, + __pa_symbol(__entry_tramp_data_start), + PAGE_KERNEL_RO); + } + + return 0; +} +core_initcall(map_entry_trampoline); +#endif + /* * Create fine-grained mappings for the kernel. 
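pgattr_change_is_safe() above gains the inverse rule to the trampoline mapping: the trampoline page may be created global (PTE_NG cleared) from the start, but flipping a live mapping from Non-Global to Global without break-before-make is not safe. The new check, isolated as a sketch:

	#include <stdbool.h>
	#include <stdint.h>

	#define PTE_NG	(1ULL << 11)	/* the nG bit of a stage-1 descriptor */

	static bool ng_change_is_safe(uint64_t old, uint64_t new)
	{
		/* old has nG set, new clears it: unsafe in live tables */
		return !(old & ~new & PTE_NG);
	}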
*/ --- linux-azure-4.13.0.orig/arch/arm64/mm/proc.S +++ linux-azure-4.13.0/arch/arm64/mm/proc.S @@ -86,7 +86,7 @@ * * x0: Address of context pointer */ - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" ENTRY(cpu_do_resume) ldp x2, x3, [x0] ldp x4, x5, [x0, #16] @@ -138,16 +138,30 @@ * - pgd_phys - physical address of new TTB */ ENTRY(cpu_do_switch_mm) - pre_ttbr0_update_workaround x0, x2, x3 + mrs x2, ttbr1_el1 mmid x1, x1 // get mm->context.id - bfi x0, x1, #48, #16 // set the ASID - msr ttbr0_el1, x0 // set TTBR0 +#ifdef CONFIG_ARM64_SW_TTBR0_PAN + bfi x0, x1, #48, #16 // set the ASID field in TTBR0 +#endif + bfi x2, x1, #48, #16 // set the ASID + msr ttbr1_el1, x2 // in TTBR1 (since TCR.A1 is set) isb - post_ttbr0_update_workaround - ret + msr ttbr0_el1, x0 // now update TTBR0 + isb + b post_ttbr_update_workaround // Back to C code... ENDPROC(cpu_do_switch_mm) - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" + +.macro __idmap_cpu_set_reserved_ttbr1, tmp1 + adrp \tmp1, empty_zero_page + msr ttbr1_el1, \tmp1 + isb + tlbi vmalle1 + dsb nsh + isb +.endm + /* * void idmap_cpu_replace_ttbr1(phys_addr_t new_pgd) * @@ -158,13 +172,7 @@ mrs x2, daif msr daifset, #0xf - adrp x1, empty_zero_page - msr ttbr1_el1, x1 - isb - - tlbi vmalle1 - dsb nsh - isb + __idmap_cpu_set_reserved_ttbr1 x1 msr ttbr1_el1, x0 isb @@ -175,13 +183,196 @@ ENDPROC(idmap_cpu_replace_ttbr1) .popsection +#ifdef CONFIG_UNMAP_KERNEL_AT_EL0 + .pushsection ".idmap.text", "awx" + + .macro __idmap_kpti_get_pgtable_ent, type + dc cvac, cur_\()\type\()p // Ensure any existing dirty + dmb sy // lines are written back before + ldr \type, [cur_\()\type\()p] // loading the entry + tbz \type, #0, next_\()\type // Skip invalid entries + .endm + + .macro __idmap_kpti_put_pgtable_ent_ng, type + orr \type, \type, #PTE_NG // Same bit for blocks and pages + str \type, [cur_\()\type\()p] // Update the entry and ensure it + dc civac, cur_\()\type\()p // is visible to all CPUs. + .endm + +/* + * void __kpti_install_ng_mappings(int cpu, int num_cpus, phys_addr_t swapper) + * + * Called exactly once from stop_machine context by each CPU found during boot. + */ +__idmap_kpti_flag: + .long 1 +ENTRY(idmap_kpti_install_ng_mappings) + cpu .req w0 + num_cpus .req w1 + swapper_pa .req x2 + swapper_ttb .req x3 + flag_ptr .req x4 + cur_pgdp .req x5 + end_pgdp .req x6 + pgd .req x7 + cur_pudp .req x8 + end_pudp .req x9 + pud .req x10 + cur_pmdp .req x11 + end_pmdp .req x12 + pmd .req x13 + cur_ptep .req x14 + end_ptep .req x15 + pte .req x16 + + mrs swapper_ttb, ttbr1_el1 + adr flag_ptr, __idmap_kpti_flag + + cbnz cpu, __idmap_kpti_secondary + + /* We're the boot CPU. Wait for the others to catch up */ + sevl +1: wfe + ldaxr w18, [flag_ptr] + eor w18, w18, num_cpus + cbnz w18, 1b + + /* We need to walk swapper, so turn off the MMU. */ + mrs x18, sctlr_el1 + bic x18, x18, #SCTLR_ELx_M + msr sctlr_el1, x18 + isb + + /* Everybody is enjoying the idmap, so we can rewrite swapper. 
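cpu_do_switch_mm above now installs the ASID in TTBR1_EL1 (TCR_EL1.A1, set by __cpu_setup later in this patch, makes TTBR1 supply the current ASID) and only then updates TTBR0, so the incoming tables are never walked under the outgoing ASID. The bfi field insertion, modelled in C:

	#include <stdint.h>

	/* the ASID occupies bits [63:48] of TTBRx_EL1 */
	static uint64_t ttbr_set_asid(uint64_t ttbr, uint16_t asid)
	{
		ttbr &= ~(0xffffULL << 48);
		return ttbr | ((uint64_t)asid << 48);
	}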
*/ + /* PGD */ + mov cur_pgdp, swapper_pa + add end_pgdp, cur_pgdp, #(PTRS_PER_PGD * 8) +do_pgd: __idmap_kpti_get_pgtable_ent pgd + tbnz pgd, #1, walk_puds + __idmap_kpti_put_pgtable_ent_ng pgd +next_pgd: + add cur_pgdp, cur_pgdp, #8 + cmp cur_pgdp, end_pgdp + b.ne do_pgd + + /* Publish the updated tables and nuke all the TLBs */ + dsb sy + tlbi vmalle1is + dsb ish + isb + + /* We're done: fire up the MMU again */ + mrs x18, sctlr_el1 + orr x18, x18, #SCTLR_ELx_M + msr sctlr_el1, x18 + isb + + /* Set the flag to zero to indicate that we're all done */ + str wzr, [flag_ptr] + ret + + /* PUD */ +walk_puds: + .if CONFIG_PGTABLE_LEVELS > 3 + pte_to_phys cur_pudp, pgd + add end_pudp, cur_pudp, #(PTRS_PER_PUD * 8) +do_pud: __idmap_kpti_get_pgtable_ent pud + tbnz pud, #1, walk_pmds + __idmap_kpti_put_pgtable_ent_ng pud +next_pud: + add cur_pudp, cur_pudp, 8 + cmp cur_pudp, end_pudp + b.ne do_pud + b next_pgd + .else /* CONFIG_PGTABLE_LEVELS <= 3 */ + mov pud, pgd + b walk_pmds +next_pud: + b next_pgd + .endif + + /* PMD */ +walk_pmds: + .if CONFIG_PGTABLE_LEVELS > 2 + pte_to_phys cur_pmdp, pud + add end_pmdp, cur_pmdp, #(PTRS_PER_PMD * 8) +do_pmd: __idmap_kpti_get_pgtable_ent pmd + tbnz pmd, #1, walk_ptes + __idmap_kpti_put_pgtable_ent_ng pmd +next_pmd: + add cur_pmdp, cur_pmdp, #8 + cmp cur_pmdp, end_pmdp + b.ne do_pmd + b next_pud + .else /* CONFIG_PGTABLE_LEVELS <= 2 */ + mov pmd, pud + b walk_ptes +next_pmd: + b next_pud + .endif + + /* PTE */ +walk_ptes: + pte_to_phys cur_ptep, pmd + add end_ptep, cur_ptep, #(PTRS_PER_PTE * 8) +do_pte: __idmap_kpti_get_pgtable_ent pte + __idmap_kpti_put_pgtable_ent_ng pte +next_pte: + add cur_ptep, cur_ptep, #8 + cmp cur_ptep, end_ptep + b.ne do_pte + b next_pmd + + /* Secondary CPUs end up here */ +__idmap_kpti_secondary: + /* Uninstall swapper before surgery begins */ + __idmap_cpu_set_reserved_ttbr1 x18 + + /* Increment the flag to let the boot CPU know we're ready */ +1: ldxr w18, [flag_ptr] + add w18, w18, #1 + stxr w17, w18, [flag_ptr] + cbnz w17, 1b + + /* Wait for the boot CPU to finish messing around with swapper */ + sevl +1: wfe + ldxr w18, [flag_ptr] + cbnz w18, 1b + + /* All done, act like nothing happened */ + msr ttbr1_el1, swapper_ttb + isb + ret + + .unreq cpu + .unreq num_cpus + .unreq swapper_pa + .unreq swapper_ttb + .unreq flag_ptr + .unreq cur_pgdp + .unreq end_pgdp + .unreq pgd + .unreq cur_pudp + .unreq end_pudp + .unreq pud + .unreq cur_pmdp + .unreq end_pmdp + .unreq pmd + .unreq cur_ptep + .unreq end_ptep + .unreq pte +ENDPROC(idmap_kpti_install_ng_mappings) + .popsection +#endif + /* * __cpu_setup * * Initialise the processor for turning the MMU on. Return in x0 the * value of the SCTLR_EL1 register. */ - .pushsection ".idmap.text", "ax" + .pushsection ".idmap.text", "awx" ENTRY(__cpu_setup) tlbi vmalle1 // Invalidate local TLB dsb nsh @@ -225,7 +416,7 @@ * both user and kernel. */ ldr x10, =TCR_TxSZ(VA_BITS) | TCR_CACHE_FLAGS | TCR_SMP_FLAGS | \ - TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 + TCR_TG_FLAGS | TCR_ASID16 | TCR_TBI0 | TCR_A1 tcr_set_idmap_t0sz x10, x9 /* --- linux-azure-4.13.0.orig/arch/arm64/xen/hypercall.S +++ linux-azure-4.13.0/arch/arm64/xen/hypercall.S @@ -101,12 +101,12 @@ * need the explicit uaccess_enable/disable if the TTBR0 PAN emulation * is enabled (it implies that hardware UAO and PAN disabled). */ - uaccess_ttbr0_enable x6, x7 + uaccess_ttbr0_enable x6, x7, x8 hvc XEN_IMM /* * Disable userspace access from kernel once the hyp call completed.
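The secondary-CPU path above is a small rendezvous protocol around __idmap_kpti_flag: each secondary uninstalls swapper, atomically increments the flag, and spins in wfe until the boot CPU, which waited for the count to reach num_cpus, has rewritten swapper with nG set and stored zero. A C11 model of the handshake, with busy-waits standing in for the sevl/wfe sequences:

	#include <stdatomic.h>

	static _Atomic int kpti_flag = 1;	/* the boot CPU counts as 1 */

	static void secondary_cpu(void)
	{
		atomic_fetch_add(&kpti_flag, 1);	/* "I'm ready" */
		while (atomic_load(&kpti_flag) != 0)
			;	/* wait for the boot CPU to store zero */
	}

	static void boot_cpu(int num_cpus)
	{
		while (atomic_load(&kpti_flag) != num_cpus)
			;	/* wait for every secondary to check in */
		/* ...walk swapper with the MMU off, setting nG... */
		atomic_store(&kpti_flag, 0);	/* release the secondaries */
	}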
*/ - uaccess_ttbr0_disable x6 + uaccess_ttbr0_disable x6, x7 ret ENDPROC(privcmd_call); --- linux-azure-4.13.0.orig/arch/frv/include/asm/futex.h +++ linux-azure-4.13.0/arch/frv/include/asm/futex.h @@ -7,7 +7,8 @@ #include #include -extern int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr); +extern int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr); static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, --- linux-azure-4.13.0.orig/arch/frv/kernel/futex.c +++ linux-azure-4.13.0/arch/frv/kernel/futex.c @@ -186,20 +186,10 @@ /* * do the futex operations */ -int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -225,18 +215,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; break; - } - } + if (!ret) + *oval = oldval; return ret; -} /* end futex_atomic_op_inuser() */ +} /* end arch_futex_atomic_op_inuser() */ --- linux-azure-4.13.0.orig/arch/hexagon/include/asm/futex.h +++ linux-azure-4.13.0/arch/hexagon/include/asm/futex.h @@ -31,18 +31,9 @@ static inline int -futex_atomic_op_inuser(int encoded_op, int __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(int))) - return -EFAULT; pagefault_disable(); @@ -72,30 +63,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/ia64/include/asm/futex.h +++ linux-azure-4.13.0/arch/ia64/include/asm/futex.h @@ -45,18 +45,9 @@ } while (0) static inline int -futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -84,17 +75,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/ia64/mm/init.c +++ linux-azure-4.13.0/arch/ia64/mm/init.c @@ -646,13 +646,20 @@ } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + pg_data_t *pgdat; + struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int ret; - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + pgdat = NODE_DATA(nid); + + zone = pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL, for_device); + ret = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + if (ret) printk("%s: Problem encountered in __add_pages() as ret=%d\n", __func__, ret); --- linux-azure-4.13.0.orig/arch/microblaze/include/asm/futex.h +++ linux-azure-4.13.0/arch/microblaze/include/asm/futex.h @@ -29,18 +29,9 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,30 +57,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/mips/ar7/platform.c +++ linux-azure-4.13.0/arch/mips/ar7/platform.c @@ -575,6 +575,7 @@ uart_port.type = PORT_AR7; uart_port.uartclk = clk_get_rate(bus_clk) / 2; uart_port.iotype = UPIO_MEM32; + uart_port.flags = UPF_FIXED_TYPE; uart_port.regshift = 2; uart_port.line = 0; @@ -653,6 +654,10 @@ u32 val; int res; + res = ar7_gpio_init(); + if (res) + pr_warn("unable to register gpios: %d\n", res); + res = ar7_register_uarts(); if (res) pr_err("unable to setup uart(s): %d\n", res); --- linux-azure-4.13.0.orig/arch/mips/ar7/prom.c +++ linux-azure-4.13.0/arch/mips/ar7/prom.c @@ -246,8 +246,6 @@ ar7_init_cmdline(fw_arg0, (char **)fw_arg1); ar7_init_env((struct env_var *)fw_arg2); console_config(); - - ar7_gpio_init(); } #define PORT(offset) (KSEG1ADDR(AR7_REGS_UART0 + (offset * 4))) --- linux-azure-4.13.0.orig/arch/mips/include/asm/futex.h +++ linux-azure-4.13.0/arch/mips/include/asm/futex.h @@ -83,18 +83,9 @@ } static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) 
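The futex hunks in this section (frv, hexagon, ia64, microblaze, and the mips one split across the surrounding hunk) are a single mechanical refactor: decoding encoded_op, the access_ok() check, and the cmp/cmparg comparison all move into common code, and each architecture now only performs the atomic operation and reports the old value through arch_futex_atomic_op_inuser(op, oparg, &oval, uaddr). Roughly, the common caller looks like this sketch, which assumes kernel context (u32, access_ok and FUTEX_OP_* come from kernel headers); it is simplified, and the mainline helper lives in kernel/futex.c and uses sign_extend32():

	static int futex_op_inuser(unsigned int encoded_op, u32 __user *uaddr)
	{
		unsigned int op  = (encoded_op >> 28) & 7;
		unsigned int cmp = (encoded_op >> 24) & 15;
		int oparg  = (int)(encoded_op << 8) >> 20;	/* sign-extend */
		int cmparg = (int)(encoded_op << 20) >> 20;
		int oldval, ret;

		if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
			oparg = 1 << oparg;
		if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))
			return -EFAULT;

		ret = arch_futex_atomic_op_inuser(op, oparg, &oldval, uaddr);
		if (ret)
			return ret;

		switch (cmp) {
		case FUTEX_OP_CMP_EQ: return oldval == cmparg;
		case FUTEX_OP_CMP_NE: return oldval != cmparg;
		case FUTEX_OP_CMP_LT: return oldval <  cmparg;
		case FUTEX_OP_CMP_GE: return oldval >= cmparg;
		case FUTEX_OP_CMP_LE: return oldval <= cmparg;
		case FUTEX_OP_CMP_GT: return oldval >  cmparg;
		default:              return -ENOSYS;
		}
	}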
+arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -125,17 +116,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/mips/include/asm/mips-cm.h +++ linux-azure-4.13.0/arch/mips/include/asm/mips-cm.h @@ -240,8 +240,8 @@ #define CM_GCR_BASE_GCRBASE_MSK (_ULCAST_(0x1ffff) << 15) #define CM_GCR_BASE_CMDEFTGT_SHF 0 #define CM_GCR_BASE_CMDEFTGT_MSK (_ULCAST_(0x3) << 0) -#define CM_GCR_BASE_CMDEFTGT_DISABLED 0 -#define CM_GCR_BASE_CMDEFTGT_MEM 1 +#define CM_GCR_BASE_CMDEFTGT_MEM 0 +#define CM_GCR_BASE_CMDEFTGT_RESERVED 1 #define CM_GCR_BASE_CMDEFTGT_IOCU0 2 #define CM_GCR_BASE_CMDEFTGT_IOCU1 3 --- linux-azure-4.13.0.orig/arch/mips/kernel/perf_event_mipsxx.c +++ linux-azure-4.13.0/arch/mips/kernel/perf_event_mipsxx.c @@ -618,8 +618,7 @@ return -ENOENT; } - if ((unsigned int)event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) + if (event->cpu >= 0 && !cpu_online(event->cpu)) return -ENODEV; if (!atomic_inc_not_zero(&active_events)) { --- linux-azure-4.13.0.orig/arch/mips/kernel/smp-bmips.c +++ linux-azure-4.13.0/arch/mips/kernel/smp-bmips.c @@ -589,11 +589,11 @@ /* Flush and enable RAC */ cfg = __raw_readl(cbr + BMIPS_RAC_CONFIG); - __raw_writel(cfg | 0x100, BMIPS_RAC_CONFIG); + __raw_writel(cfg | 0x100, cbr + BMIPS_RAC_CONFIG); __raw_readl(cbr + BMIPS_RAC_CONFIG); cfg = __raw_readl(cbr + BMIPS_RAC_CONFIG); - __raw_writel(cfg | 0xf, BMIPS_RAC_CONFIG); + __raw_writel(cfg | 0xf, cbr + BMIPS_RAC_CONFIG); __raw_readl(cbr + BMIPS_RAC_CONFIG); cfg = __raw_readl(cbr + BMIPS_RAC_ADDRESS_RANGE); --- linux-azure-4.13.0.orig/arch/mips/kernel/smp-cmp.c +++ linux-azure-4.13.0/arch/mips/kernel/smp-cmp.c @@ -19,7 +19,7 @@ #undef DEBUG #include -#include +#include #include #include #include --- linux-azure-4.13.0.orig/arch/mips/kernel/smp.c +++ linux-azure-4.13.0/arch/mips/kernel/smp.c @@ -66,6 +66,7 @@ cpumask_t cpu_core_map[NR_CPUS] __read_mostly; EXPORT_SYMBOL(cpu_core_map); +static DECLARE_COMPLETION(cpu_starting); static DECLARE_COMPLETION(cpu_running); /* @@ -376,6 +377,12 @@ cpumask_set_cpu(cpu, &cpu_coherent_mask); notify_cpu_starting(cpu); + /* Notify boot CPU that we're starting & ready to sync counters */ + complete(&cpu_starting); + + synchronise_count_slave(cpu); + + /* The CPU is running and counters synchronised, now mark it online */ set_cpu_online(cpu, true); set_cpu_sibling_map(cpu); @@ -383,8 +390,11 @@ calculate_cpu_foreign_map(); + /* + * Notify boot CPU that we're up & online and it can safely return + * from __cpu_up + */ complete(&cpu_running); - synchronise_count_slave(cpu); /* * irq will be enabled in ->smp_finish(), enabling it too early @@ -443,17 +453,17 @@ { mp_ops->boot_secondary(cpu, tidle); - /* - * We must check for timeout here, as the CPU 
will not be marked - * online until the counters are synchronised. - */ - if (!wait_for_completion_timeout(&cpu_running, + /* Wait for CPU to start and be ready to sync counters */ + if (!wait_for_completion_timeout(&cpu_starting, msecs_to_jiffies(1000))) { pr_crit("CPU%u: failed to start\n", cpu); return -EIO; } synchronise_count_master(cpu); + + /* Wait for CPU to finish startup & mark itself online before return */ + wait_for_completion(&cpu_running); return 0; } --- linux-azure-4.13.0.orig/arch/mips/math-emu/cp1emu.c +++ linux-azure-4.13.0/arch/mips/math-emu/cp1emu.c @@ -2387,7 +2387,6 @@ break; default: /* Reserved R6 ops */ - pr_err("Reserved MIPS R6 CMP.condn.S operation\n"); return SIGILL; } } @@ -2461,7 +2460,6 @@ break; default: /* Reserved R6 ops */ - pr_err("Reserved MIPS R6 CMP.condn.D operation\n"); return SIGILL; } } --- linux-azure-4.13.0.orig/arch/mips/math-emu/dp_fmax.c +++ linux-azure-4.13.0/arch/mips/math-emu/dp_fmax.c @@ -47,14 +47,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -80,9 +92,7 @@ return ys ? 
x : y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -106,16 +116,32 @@ else if (xs < ys) return x; - /* Compare exponent */ - if (xe > ye) - return x; - else if (xe < ye) - return y; + /* Signs of inputs are equal, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } else { + /* Inputs are both negative */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return y; + return x; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return y; - return x; + return x; + return y; } union ieee754dp ieee754dp_fmaxa(union ieee754dp x, union ieee754dp y) @@ -147,14 +173,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -164,6 +202,9 @@ /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754dp_inf(xs & ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -171,7 +212,6 @@ case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): @@ -180,9 +220,7 @@ return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -207,7 +245,11 @@ return y; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) return y; - return x; + else if (xm > ym) + return x; + else if (xs == 0) + return x; + return y; } --- linux-azure-4.13.0.orig/arch/mips/math-emu/dp_fmin.c +++ linux-azure-4.13.0/arch/mips/math-emu/dp_fmin.c @@ -47,14 +47,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case 
CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -80,9 +92,7 @@ return ys ? y : x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -106,16 +116,32 @@ else if (xs < ys) return y; - /* Compare exponent */ - if (xe > ye) - return y; - else if (xe < ye) - return x; + /* Signs of inputs are the same, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } else { + /* Inputs are both negative */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return x; + return y; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return x; - return y; + return y; + return x; } union ieee754dp ieee754dp_fmina(union ieee754dp x, union ieee754dp y) @@ -147,14 +173,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754dp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -164,25 +202,25 @@ /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754dp_inf(xs | ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): - return x; + return y; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM): - return y; + return x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754dp_zero(1); + return ieee754dp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; @@ -207,7 +245,11 @@ return x; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) + return x; + else if (xm > ym) + return y; + else if (xs == 1) return x; return y; } --- linux-azure-4.13.0.orig/arch/mips/math-emu/dp_maddf.c +++ linux-azure-4.13.0/arch/mips/math-emu/dp_maddf.c @@ -14,22 +14,45 @@ #include 
"ieee754dp.h" -enum maddf_flags { - maddf_negate_product = 1 << 0, -}; + +/* 128 bits shift right logical with rounding. */ +void srl128(u64 *hptr, u64 *lptr, int count) +{ + u64 low; + + if (count >= 128) { + *lptr = *hptr != 0 || *lptr != 0; + *hptr = 0; + } else if (count >= 64) { + if (count == 64) { + *lptr = *hptr | (*lptr != 0); + } else { + low = *lptr; + *lptr = *hptr >> (count - 64); + *lptr |= (*hptr << (128 - count)) != 0 || low != 0; + } + *hptr = 0; + } else { + low = *lptr; + *lptr = low >> count | *hptr << (64 - count); + *lptr |= (low << (64 - count)) != 0; + *hptr = *hptr >> count; + } +} static union ieee754dp _dp_maddf(union ieee754dp z, union ieee754dp x, union ieee754dp y, enum maddf_flags flags) { int re; int rs; - u64 rm; unsigned lxm; unsigned hxm; unsigned lym; unsigned hym; u64 lrm; u64 hrm; + u64 lzm; + u64 hzm; u64 t; u64 at; int s; @@ -48,52 +71,34 @@ ieee754_clearcx(); - switch (zc) { - case IEEE754_CLASS_SNAN: - ieee754_setcx(IEEE754_INVALID_OPERATION); + /* + * Handle the cases when at least one of x, y or z is a NaN. + * Order of precedence is sNaN, qNaN and z, x, y. + */ + if (zc == IEEE754_CLASS_SNAN) return ieee754dp_nanxcpt(z); - case IEEE754_CLASS_DNORM: - DPDNORMZ; - /* QNAN and ZERO cases are handled separately below */ - } - - switch (CLPAIR(xc, yc)) { - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN): - return ieee754dp_nanxcpt(y); - - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): + if (xc == IEEE754_CLASS_SNAN) return ieee754dp_nanxcpt(x); - - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): + if (yc == IEEE754_CLASS_SNAN) + return ieee754dp_nanxcpt(y); + if (zc == IEEE754_CLASS_QNAN) + return z; + if (xc == IEEE754_CLASS_QNAN) + return x; + if (yc == IEEE754_CLASS_QNAN) return y; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF): - return x; + if (zc == IEEE754_CLASS_DNORM) + DPDNORMZ; + /* ZERO z cases are handled separately below */ + switch (CLPAIR(xc, yc)) { /* * Infinity handling */ case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; ieee754_setcx(IEEE754_INVALID_OPERATION); return ieee754dp_indef(); @@ -102,9 +107,27 @@ case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; - return ieee754dp_inf(xs ^ ys); + if ((zc == IEEE754_CLASS_INF) && + ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) || + ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) { + /* + * Cases of addition of infinities with opposite signs + 
* or subtraction of infinities with same signs. + */ + ieee754_setcx(IEEE754_INVALID_OPERATION); + return ieee754dp_indef(); + } + /* + * z is here either not an infinity, or an infinity having the + * same sign as product (x*y) (in case of MADDF.D instruction) + * or product -(x*y) (in MSUBF.D case). The result must be an + * infinity, and its sign is determined only by the value of + * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y. + */ + if (flags & MADDF_NEGATE_PRODUCT) + return ieee754dp_inf(1 ^ (xs ^ ys)); + else + return ieee754dp_inf(xs ^ ys); case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): @@ -113,32 +136,42 @@ case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); - /* Multiplication is 0 so just return z */ + if (zc == IEEE754_CLASS_ZERO) { + /* Handle cases +0 + (-0) and similar ones. */ + if ((!(flags & MADDF_NEGATE_PRODUCT) + && (zs == (xs ^ ys))) || + ((flags & MADDF_NEGATE_PRODUCT) + && (zs != (xs ^ ys)))) + /* + * Cases of addition of zeros of equal signs + * or subtraction of zeroes of opposite signs. + * The sign of the resulting zero is in any + * such case determined only by the sign of z. + */ + return z; + + return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD); + } + /* x*y is here 0, and z is not 0, so just return z */ return z; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): DPDNORMX; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); DPDNORMY; break; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); DPDNORMX; break; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754dp_inf(zs); /* fall through to real computations */ } @@ -157,7 +190,7 @@ re = xe + ye; rs = xs ^ ys; - if (flags & maddf_negate_product) + if (flags & MADDF_NEGATE_PRODUCT) rs ^= 1; /* shunt to top of word */ @@ -165,7 +198,7 @@ ym <<= 64 - (DP_FBITS + 1); /* - * Multiply 64 bits xm, ym to give high 64 bits rm with stickness. + * Multiply 64 bits xm and ym to give 128 bits result in hrm:lrm. */ /* 32 * 32 => 64 */ @@ -195,81 +228,110 @@ hrm = hrm + (t >> 32); - rm = hrm | (lrm != 0); - - /* - * Sticky shift down to normal rounding precision. - */ - if ((s64) rm < 0) { - rm = (rm >> (64 - (DP_FBITS + 1 + 3))) | - ((rm << (DP_FBITS + 1 + 3)) != 0); + /* Put explicit bit at bit 126 if necessary */ + if ((int64_t)hrm < 0) { + lrm = (hrm << 63) | (lrm >> 1); + hrm = hrm >> 1; re++; - } else { - rm = (rm >> (64 - (DP_FBITS + 1 + 3 + 1))) | - ((rm << (DP_FBITS + 1 + 3 + 1)) != 0); } - assert(rm & (DP_HIDDEN_BIT << 3)); - if (zc == IEEE754_CLASS_ZERO) - return ieee754dp_format(rs, re, rm); + assert(hrm & (1 << 62)); - /* And now the addition */ - assert(zm & DP_HIDDEN_BIT); + if (zc == IEEE754_CLASS_ZERO) { + /* + * Move explicit bit from bit 126 to bit 55 since the + * ieee754dp_format code expects the mantissa to be + * 56 bits wide (53 + 3 rounding bits). + */ + srl128(&hrm, &lrm, (126 - 55)); + return ieee754dp_format(rs, re, lrm); + } - /* - * Provide guard,round and stick bit space. 
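
dp_maddf now carries the full 106-bit product in an hrm:lrm pair and narrows it with srl128(), whose job is to OR every shifted-out bit into bit 0 so that ieee754dp_format() still sees whether nonzero bits were discarded (the usual sticky-bit rounding discipline). A standalone restatement with a quick sanity check, assuming the same semantics as the srl128() added above (the kernel callers never pass a count of 0, which the sub-64 branch relies on):

    #include <assert.h>
    #include <stdint.h>
    typedef uint64_t u64;

    /* 128-bit logical shift right that ORs shifted-out bits into bit 0. */
    static void srl128(u64 *hptr, u64 *lptr, int count)
    {
        u64 low;

        if (count >= 128) {
            *lptr = *hptr != 0 || *lptr != 0;
            *hptr = 0;
        } else if (count >= 64) {
            if (count == 64) {
                *lptr = *hptr | (*lptr != 0);
            } else {
                low = *lptr;
                *lptr = *hptr >> (count - 64);
                *lptr |= (*hptr << (128 - count)) != 0 || low != 0;
            }
            *hptr = 0;
        } else {            /* count must be 1..63 here */
            low = *lptr;
            *lptr = low >> count | *hptr << (64 - count);
            *lptr |= (low << (64 - count)) != 0;
            *hptr = *hptr >> count;
        }
    }

    int main(void)
    {
        u64 h = 0, l = 0x21;  /* the low set bit would be lost by a plain shift */
        srl128(&h, &l, 4);
        assert(l == 0x3);     /* 0x21 >> 4 = 0x2, plus the sticky LSB */
        return 0;
    }
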
- */ - zm <<= 3; + /* Move explicit bit from bit 52 to bit 126 */ + lzm = 0; + hzm = zm << 10; + assert(hzm & (1 << 62)); + /* Make the exponents the same */ if (ze > re) { /* * Have to shift y fraction right to align. */ s = ze - re; - rm = XDPSRS(rm, s); + srl128(&hrm, &lrm, s); re += s; } else if (re > ze) { /* * Have to shift x fraction right to align. */ s = re - ze; - zm = XDPSRS(zm, s); + srl128(&hzm, &lzm, s); ze += s; } assert(ze == re); assert(ze <= DP_EMAX); + /* Do the addition */ if (zs == rs) { /* - * Generate 28 bit result of adding two 27 bit numbers - * leaving result in xm, xs and xe. + * Generate 128 bit result by adding two 127 bit numbers + * leaving result in hzm:lzm, zs and ze. */ - zm = zm + rm; - - if (zm >> (DP_FBITS + 1 + 3)) { /* carry out */ - zm = XDPSRS1(zm); + hzm = hzm + hrm + (lzm > (lzm + lrm)); + lzm = lzm + lrm; + if ((int64_t)hzm < 0) { /* carry out */ + srl128(&hzm, &lzm, 1); ze++; } } else { - if (zm >= rm) { - zm = zm - rm; + if (hzm > hrm || (hzm == hrm && lzm >= lrm)) { + hzm = hzm - hrm - (lzm < lrm); + lzm = lzm - lrm; } else { - zm = rm - zm; + hzm = hrm - hzm - (lrm < lzm); + lzm = lrm - lzm; zs = rs; } - if (zm == 0) + if (lzm == 0 && hzm == 0) return ieee754dp_zero(ieee754_csr.rm == FPU_CSR_RD); /* - * Normalize to rounding precision. + * Put explicit bit at bit 126 if necessary. */ - while ((zm >> (DP_FBITS + 3)) == 0) { - zm <<= 1; - ze--; + if (hzm == 0) { + /* left shift by 63 or 64 bits */ + if ((int64_t)lzm < 0) { + /* MSB of lzm is the explicit bit */ + hzm = lzm >> 1; + lzm = lzm << 63; + ze -= 63; + } else { + hzm = lzm; + lzm = 0; + ze -= 64; + } + } + + t = 0; + while ((hzm >> (62 - t)) == 0) + t++; + + assert(t <= 62); + if (t) { + hzm = hzm << t | lzm >> (64 - t); + lzm = lzm << t; + ze -= t; } } - return ieee754dp_format(zs, ze, zm); + /* + * Move explicit bit from bit 126 to bit 55 since the + * ieee754dp_format code expects the mantissa to be + * 56 bits wide (53 + 3 rounding bits). + */ + srl128(&hzm, &lzm, (126 - 55)); + + return ieee754dp_format(zs, ze, lzm); } union ieee754dp ieee754dp_maddf(union ieee754dp z, union ieee754dp x, @@ -281,5 +343,5 @@ union ieee754dp ieee754dp_msubf(union ieee754dp z, union ieee754dp x, union ieee754dp y) { - return _dp_maddf(z, x, y, maddf_negate_product); + return _dp_maddf(z, x, y, MADDF_NEGATE_PRODUCT); } --- linux-azure-4.13.0.orig/arch/mips/math-emu/ieee754int.h +++ linux-azure-4.13.0/arch/mips/math-emu/ieee754int.h @@ -26,6 +26,10 @@ #define CLPAIR(x, y) ((x)*6+(y)) +enum maddf_flags { + MADDF_NEGATE_PRODUCT = 1 << 0, +}; + static inline void ieee754_clearcx(void) { ieee754_csr.cx = 0; --- linux-azure-4.13.0.orig/arch/mips/math-emu/ieee754sp.h +++ linux-azure-4.13.0/arch/mips/math-emu/ieee754sp.h @@ -45,6 +45,10 @@ return SPBEXP(x) != SP_EMAX + 1 + SP_EBIAS; } +/* 64 bit right shift with rounding */ +#define XSPSRS64(v, rs) \ + (((rs) >= 64) ? 
((v) != 0) : ((v) >> (rs)) | ((v) << (64-(rs)) != 0)) + /* 3bit extended single precision sticky right shift */ #define XSPSRS(v, rs) \ ((rs > (SP_FBITS+3))?1:((v) >> (rs)) | ((v) << (32-(rs)) != 0)) --- linux-azure-4.13.0.orig/arch/mips/math-emu/sp_fmax.c +++ linux-azure-4.13.0/arch/mips/math-emu/sp_fmax.c @@ -47,14 +47,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -80,9 +92,7 @@ return ys ? x : y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -106,16 +116,32 @@ else if (xs < ys) return x; - /* Compare exponent */ - if (xe > ye) - return x; - else if (xe < ye) - return y; + /* Signs of inputs are equal, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } else { + /* Inputs are both negative */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return y; + return x; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return y; - return x; + return x; + return y; } union ieee754sp ieee754sp_fmaxa(union ieee754sp x, union ieee754sp y) @@ -147,14 +173,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -164,6 +202,9 @@ /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return ieee754sp_inf(xs & ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): @@ -171,7 +212,6 @@ case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): return x; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, 
IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): @@ -180,9 +220,7 @@ return y; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs & ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -207,7 +245,11 @@ return y; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) return y; - return x; + else if (xm > ym) + return x; + else if (xs == 0) + return x; + return y; } --- linux-azure-4.13.0.orig/arch/mips/math-emu/sp_fmin.c +++ linux-azure-4.13.0/arch/mips/math-emu/sp_fmin.c @@ -47,14 +47,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -80,9 +92,7 @@ return ys ? y : x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -106,16 +116,32 @@ else if (xs < ys) return y; - /* Compare exponent */ - if (xe > ye) - return y; - else if (xe < ye) - return x; + /* Signs of inputs are the same, let's compare exponents */ + if (xs == 0) { + /* Inputs are both positive */ + if (xe > ye) + return y; + else if (xe < ye) + return x; + } else { + /* Inputs are both negative */ + if (xe > ye) + return x; + else if (xe < ye) + return y; + } - /* Compare mantissa */ + /* Signs and exponents of inputs are equal, let's compare mantissas */ + if (xs == 0) { + /* Inputs are both positive, with equal signs and exponents */ + if (xm <= ym) + return x; + return y; + } + /* Inputs are both negative, with equal signs and exponents */ if (xm <= ym) - return x; - return y; + return y; + return x; } union ieee754sp ieee754sp_fmina(union ieee754sp x, union ieee754sp y) @@ -147,14 +173,26 @@ case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): return ieee754sp_nanxcpt(x); - /* numbers are preferred to NaNs */ + /* + * Quiet NaN handling + */ + + /* + * The case of both inputs quiet NaNs + */ + case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): + return x; + + /* + * The cases of exactly one input quiet NaN (numbers + * are here preferred as returned values to NaNs) + */ case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): return x; - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): @@ -164,25 +202,25 @@ /* * Infinity and zero handling */ + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): + return 
ieee754sp_inf(xs | ys); + case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): - return x; + return y; - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_DNORM): - return y; + return x; case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): - if (xs == ys) - return x; - return ieee754sp_zero(1); + return ieee754sp_zero(xs | ys); case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; @@ -207,7 +245,11 @@ return x; /* Compare mantissa */ - if (xm <= ym) + if (xm < ym) + return x; + else if (xm > ym) + return y; + else if (xs == 1) return x; return y; } --- linux-azure-4.13.0.orig/arch/mips/math-emu/sp_maddf.c +++ linux-azure-4.13.0/arch/mips/math-emu/sp_maddf.c @@ -14,9 +14,6 @@ #include "ieee754sp.h" -enum maddf_flags { - maddf_negate_product = 1 << 0, -}; static union ieee754sp _sp_maddf(union ieee754sp z, union ieee754sp x, union ieee754sp y, enum maddf_flags flags) @@ -24,14 +21,8 @@ int re; int rs; unsigned rm; - unsigned short lxm; - unsigned short hxm; - unsigned short lym; - unsigned short hym; - unsigned lrm; - unsigned hrm; - unsigned t; - unsigned at; + uint64_t rm64; + uint64_t zm64; int s; COMPXSP; @@ -48,51 +39,35 @@ ieee754_clearcx(); - switch (zc) { - case IEEE754_CLASS_SNAN: - ieee754_setcx(IEEE754_INVALID_OPERATION); + /* + * Handle the cases when at least one of x, y or z is a NaN. + * Order of precedence is sNaN, qNaN and z, x, y. 
+ */ + if (zc == IEEE754_CLASS_SNAN) return ieee754sp_nanxcpt(z); - case IEEE754_CLASS_DNORM: - SPDNORMZ; - /* QNAN and ZERO cases are handled separately below */ - } - - switch (CLPAIR(xc, yc)) { - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_SNAN): + if (xc == IEEE754_CLASS_SNAN) + return ieee754sp_nanxcpt(x); + if (yc == IEEE754_CLASS_SNAN) return ieee754sp_nanxcpt(y); + if (zc == IEEE754_CLASS_QNAN) + return z; + if (xc == IEEE754_CLASS_QNAN) + return x; + if (yc == IEEE754_CLASS_QNAN) + return y; - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_SNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_SNAN, IEEE754_CLASS_INF): - return ieee754sp_nanxcpt(x); + if (zc == IEEE754_CLASS_DNORM) + SPDNORMZ; + /* ZERO z cases are handled separately below */ - case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_QNAN): - return y; + switch (CLPAIR(xc, yc)) { - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_QNAN): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_ZERO): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_NORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_DNORM): - case CLPAIR(IEEE754_CLASS_QNAN, IEEE754_CLASS_INF): - return x; /* * Infinity handling */ case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; ieee754_setcx(IEEE754_INVALID_OPERATION); return ieee754sp_indef(); @@ -101,9 +76,27 @@ case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_NORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_DNORM): case CLPAIR(IEEE754_CLASS_INF, IEEE754_CLASS_INF): - if (zc == IEEE754_CLASS_QNAN) - return z; - return ieee754sp_inf(xs ^ ys); + if ((zc == IEEE754_CLASS_INF) && + ((!(flags & MADDF_NEGATE_PRODUCT) && (zs != (xs ^ ys))) || + ((flags & MADDF_NEGATE_PRODUCT) && (zs == (xs ^ ys))))) { + /* + * Cases of addition of infinities with opposite signs + * or subtraction of infinities with same signs. + */ + ieee754_setcx(IEEE754_INVALID_OPERATION); + return ieee754sp_indef(); + } + /* + * z is here either not an infinity, or an infinity having the + * same sign as product (x*y) (in case of MADDF.D instruction) + * or product -(x*y) (in MSUBF.D case). The result must be an + * infinity, and its sign is determined only by the value of + * (flags & MADDF_NEGATE_PRODUCT) and the signs of x and y. + */ + if (flags & MADDF_NEGATE_PRODUCT) + return ieee754sp_inf(1 ^ (xs ^ ys)); + else + return ieee754sp_inf(xs ^ ys); case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_ZERO): case CLPAIR(IEEE754_CLASS_ZERO, IEEE754_CLASS_NORM): @@ -112,32 +105,42 @@ case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_ZERO): if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); - /* Multiplication is 0 so just return z */ + if (zc == IEEE754_CLASS_ZERO) { + /* Handle cases +0 + (-0) and similar ones. 
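
The zero handling that follows implements the IEEE 754-2008 sign rule for exact cancellation: when x*y and z sum to exactly zero with differing signs, the result is +0 in every rounding mode except round-toward-minus-infinity, which yields -0; that is precisely the ieee754_csr.rm == FPU_CSR_RD test. Any conforming hardware fma shows the same behaviour (C99 sketch; compile with -lm, and some compilers additionally want -frounding-math for fesetround() to be honoured):

    #include <assert.h>
    #include <fenv.h>
    #include <math.h>

    int main(void)
    {
        /* 1.0 * 1.0 + (-1.0) cancels exactly to zero */
        assert(signbit(fma(1.0, 1.0, -1.0)) == 0);  /* +0 when rounding to nearest */

        fesetround(FE_DOWNWARD);                    /* round toward -inf, FPU_CSR_RD */
        assert(signbit(fma(1.0, 1.0, -1.0)) != 0);  /* -0 in this mode only */
        return 0;
    }
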
*/ + if ((!(flags & MADDF_NEGATE_PRODUCT) + && (zs == (xs ^ ys))) || + ((flags & MADDF_NEGATE_PRODUCT) + && (zs != (xs ^ ys)))) + /* + * Cases of addition of zeros of equal signs + * or subtraction of zeroes of opposite signs. + * The sign of the resulting zero is in any + * such case determined only by the sign of z. + */ + return z; + + return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD); + } + /* x*y is here 0, and z is not 0, so just return z */ return z; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_DNORM): SPDNORMX; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_DNORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); SPDNORMY; break; case CLPAIR(IEEE754_CLASS_DNORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); SPDNORMX; break; case CLPAIR(IEEE754_CLASS_NORM, IEEE754_CLASS_NORM): - if (zc == IEEE754_CLASS_QNAN) - return z; - else if (zc == IEEE754_CLASS_INF) + if (zc == IEEE754_CLASS_INF) return ieee754sp_inf(zs); /* fall through to real computations */ } @@ -158,111 +161,93 @@ re = xe + ye; rs = xs ^ ys; - if (flags & maddf_negate_product) + if (flags & MADDF_NEGATE_PRODUCT) rs ^= 1; - /* shunt to top of word */ - xm <<= 32 - (SP_FBITS + 1); - ym <<= 32 - (SP_FBITS + 1); - - /* - * Multiply 32 bits xm, ym to give high 32 bits rm with stickness. - */ - lxm = xm & 0xffff; - hxm = xm >> 16; - lym = ym & 0xffff; - hym = ym >> 16; - - lrm = lxm * lym; /* 16 * 16 => 32 */ - hrm = hxm * hym; /* 16 * 16 => 32 */ - - t = lxm * hym; /* 16 * 16 => 32 */ - at = lrm + (t << 16); - hrm += at < lrm; - lrm = at; - hrm = hrm + (t >> 16); - - t = hxm * lym; /* 16 * 16 => 32 */ - at = lrm + (t << 16); - hrm += at < lrm; - lrm = at; - hrm = hrm + (t >> 16); + /* Multiple 24 bit xm and ym to give 48 bit results */ + rm64 = (uint64_t)xm * ym; - rm = hrm | (lrm != 0); + /* Shunt to top of word */ + rm64 = rm64 << 16; - /* - * Sticky shift down to normal rounding precision. - */ - if ((int) rm < 0) { - rm = (rm >> (32 - (SP_FBITS + 1 + 3))) | - ((rm << (SP_FBITS + 1 + 3)) != 0); + /* Put explicit bit at bit 62 if necessary */ + if ((int64_t) rm64 < 0) { + rm64 = rm64 >> 1; re++; - } else { - rm = (rm >> (32 - (SP_FBITS + 1 + 3 + 1))) | - ((rm << (SP_FBITS + 1 + 3 + 1)) != 0); } - assert(rm & (SP_HIDDEN_BIT << 3)); - if (zc == IEEE754_CLASS_ZERO) - return ieee754sp_format(rs, re, rm); + assert(rm64 & (1 << 62)); - /* And now the addition */ - - assert(zm & SP_HIDDEN_BIT); + if (zc == IEEE754_CLASS_ZERO) { + /* + * Move explicit bit from bit 62 to bit 26 since the + * ieee754sp_format code expects the mantissa to be + * 27 bits wide (24 + 3 rounding bits). + */ + rm = XSPSRS64(rm64, (62 - 26)); + return ieee754sp_format(rs, re, rm); + } - /* - * Provide guard,round and stick bit space. - */ - zm <<= 3; + /* Move explicit bit from bit 23 to bit 62 */ + zm64 = (uint64_t)zm << (62 - 23); + assert(zm64 & (1 << 62)); + /* Make the exponents the same */ if (ze > re) { /* * Have to shift r fraction right to align. */ s = ze - re; - rm = XSPSRS(rm, s); + rm64 = XSPSRS64(rm64, s); re += s; } else if (re > ze) { /* * Have to shift z fraction right to align. */ s = re - ze; - zm = XSPSRS(zm, s); + zm64 = XSPSRS64(zm64, s); ze += s; } assert(ze == re); assert(ze <= SP_EMAX); + /* Do the addition */ if (zs == rs) { /* - * Generate 28 bit result of adding two 27 bit numbers - * leaving result in zm, zs and ze. 
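
The rewritten single-precision path drops the 16x16 partial-product arithmetic in favour of one native 64-bit multiply: two 24-bit mantissas (hidden bit included) produce at most 48 significant bits, which are then positioned so the explicit bit lands at bit 62 before the add, exactly the invariant the assert checks. A user-space sketch of that normalisation, assuming the mantissas already carry the hidden bit at position 23:

    #include <assert.h>
    #include <stdint.h>

    int main(void)
    {
        uint32_t xm = 0x800000 | 0x123456;  /* 24-bit mantissa, hidden bit set */
        uint32_t ym = 0x800000 | 0x654321;
        int re = 0;                         /* tracking exponent adjustment only */

        uint64_t rm64 = (uint64_t)xm * ym;  /* at most 48 significant bits */
        rm64 <<= 16;                        /* shunt to the top of the word */

        if ((int64_t)rm64 < 0) {            /* product landed in [2^63, 2^64) */
            rm64 >>= 1;                     /* low 16 bits are zero, nothing lost */
            re++;
        }
        assert(rm64 & (1ULL << 62));        /* explicit bit now at bit 62 */
        return 0;
    }
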
+ * Generate 64 bit result by adding two 63 bit numbers + * leaving result in zm64, zs and ze. */ - zm = zm + rm; - - if (zm >> (SP_FBITS + 1 + 3)) { /* carry out */ - zm = XSPSRS1(zm); + zm64 = zm64 + rm64; + if ((int64_t)zm64 < 0) { /* carry out */ + zm64 = XSPSRS1(zm64); ze++; } } else { - if (zm >= rm) { - zm = zm - rm; + if (zm64 >= rm64) { + zm64 = zm64 - rm64; } else { - zm = rm - zm; + zm64 = rm64 - zm64; zs = rs; } - if (zm == 0) + if (zm64 == 0) return ieee754sp_zero(ieee754_csr.rm == FPU_CSR_RD); /* - * Normalize in extended single precision + * Put explicit bit at bit 62 if necessary. */ - while ((zm >> (SP_MBITS + 3)) == 0) { - zm <<= 1; + while ((zm64 >> 62) == 0) { + zm64 <<= 1; ze--; } - } + + /* + * Move explicit bit from bit 62 to bit 26 since the + * ieee754sp_format code expects the mantissa to be + * 27 bits wide (24 + 3 rounding bits). + */ + zm = XSPSRS64(zm64, (62 - 26)); + return ieee754sp_format(zs, ze, zm); } @@ -275,5 +260,5 @@ union ieee754sp ieee754sp_msubf(union ieee754sp z, union ieee754sp x, union ieee754sp y) { - return _sp_maddf(z, x, y, maddf_negate_product); + return _sp_maddf(z, x, y, MADDF_NEGATE_PRODUCT); } --- linux-azure-4.13.0.orig/arch/mips/mm/uasm-micromips.c +++ linux-azure-4.13.0/arch/mips/mm/uasm-micromips.c @@ -80,7 +80,7 @@ [insn_jr] = {M(mm_pool32a_op, 0, 0, 0, mm_jalr_op, mm_pool32axf_op), RS}, [insn_lb] = {M(mm_lb32_op, 0, 0, 0, 0, 0), RT | RS | SIMM}, [insn_ld] = {0, 0}, - [insn_lh] = {M(mm_lh32_op, 0, 0, 0, 0, 0), RS | RS | SIMM}, + [insn_lh] = {M(mm_lh32_op, 0, 0, 0, 0, 0), RT | RS | SIMM}, [insn_ll] = {M(mm_pool32c_op, 0, 0, (mm_ll_func << 1), 0, 0), RS | RT | SIMM}, [insn_lld] = {0, 0}, [insn_lui] = {M(mm_pool32i_op, mm_lui_op, 0, 0, 0, 0), RS | SIMM}, --- linux-azure-4.13.0.orig/arch/mips/net/ebpf_jit.c +++ linux-azure-4.13.0/arch/mips/net/ebpf_jit.c @@ -679,7 +679,7 @@ { int src, dst, r, td, ts, mem_off, b_off; bool need_swap, did_move, cmp_eq; - unsigned int target; + unsigned int target = 0; u64 t64; s64 t64s; @@ -1485,7 +1485,7 @@ } src = ebpf_to_mips_reg(ctx, insn, src_reg_no_fp); if (src < 0) - return dst; + return src; if (BPF_MODE(insn->code) == BPF_XADD) { switch (BPF_SIZE(insn->code)) { case BPF_W: --- linux-azure-4.13.0.orig/arch/openrisc/include/asm/futex.h +++ linux-azure-4.13.0/arch/openrisc/include/asm/futex.h @@ -30,20 +30,10 @@ }) static inline int -futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -68,30 +58,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/parisc/include/asm/futex.h +++ linux-azure-4.13.0/arch/parisc/include/asm/futex.h @@ -32,22 +32,12 @@ } static inline int -futex_atomic_op_inuser 
(int encoded_op, u32 __user *uaddr) +arch_futex_atomic_op_inuser(int op, int oparg, int *oval, u32 __user *uaddr) { unsigned long int flags; - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval, ret; u32 tmp; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(*uaddr))) - return -EFAULT; - _futex_spin_lock_irqsave(uaddr, &flags); pagefault_disable(); @@ -85,17 +75,9 @@ pagefault_enable(); _futex_spin_unlock_irqrestore(uaddr, &flags); - if (ret == 0) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/parisc/kernel/syscall.S +++ linux-azure-4.13.0/arch/parisc/kernel/syscall.S @@ -742,7 +742,7 @@ 10: ldd 0(%r25), %r25 11: ldd 0(%r24), %r24 #else - /* Load new value into r22/r23 - high/low */ + /* Load old value into r22/r23 - high/low */ 10: ldw 0(%r25), %r22 11: ldw 4(%r25), %r23 /* Load new value into fr4 for atomic store later */ @@ -834,11 +834,11 @@ copy %r0, %r28 #else /* Compare first word */ -19: ldw,ma 0(%r26), %r29 +19: ldw 0(%r26), %r29 sub,= %r29, %r22, %r0 b,n cas2_end /* Compare second word */ -20: ldw,ma 4(%r26), %r29 +20: ldw 4(%r26), %r29 sub,= %r29, %r23, %r0 b,n cas2_end /* Perform the store */ --- linux-azure-4.13.0.orig/arch/parisc/kernel/time.c +++ linux-azure-4.13.0/arch/parisc/kernel/time.c @@ -253,7 +253,10 @@ cpu0_loc = per_cpu(cpu_data, 0).cpu_loc; for_each_online_cpu(cpu) { - if (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc) + if (cpu == 0) + continue; + if ((cpu0_loc != 0) && + (cpu0_loc == per_cpu(cpu_data, cpu).cpu_loc)) continue; clocksource_cr16.name = "cr16_unstable"; --- linux-azure-4.13.0.orig/arch/powerpc/Kconfig +++ linux-azure-4.13.0/arch/powerpc/Kconfig @@ -1193,6 +1193,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "fs/Kconfig" source "lib/Kconfig" --- linux-azure-4.13.0.orig/arch/powerpc/Kconfig.debug +++ linux-azure-4.13.0/arch/powerpc/Kconfig.debug @@ -369,4 +369,14 @@ def_bool y depends on PPC_PTDUMP && PPC_BOOK3S +config PPC_DEBUG_RFI + bool "Debug RFIs (Return From Interrupt)" + depends on PPC_BOOK3S_64 + help + The enables extra debug code in some RFI (Return From Interrupt) + sequences, to detect that we are returning to the correct context + (user, kernel or guest). This adds some performance overhead. + + If unsure, say N. 
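
The futex conversions in this patch (openrisc and parisc here, powerpc further down) all delete the same boilerplate because unpacking the packed FUTEX_OP word and performing the cmparg comparison move into common code, shrinking the arch hook to arch_futex_atomic_op_inuser(). For reference, the decoding the common caller now performs, reconstructed from the removed lines (the signed shifts rely on arithmetic right shift, exactly as the kernel's version does):

    #define FUTEX_OP_OPARG_SHIFT 8  /* from <linux/futex.h> */

    /* Packed FUTEX_OP word: bits 28-31 op, 24-27 cmp, 12-23 oparg, 0-11 cmparg. */
    static void futex_decode(int encoded_op,
                             int *op, int *cmp, int *oparg, int *cmparg)
    {
        *op     = (encoded_op >> 28) & 7;
        *cmp    = (encoded_op >> 24) & 15;
        *oparg  = (encoded_op << 8) >> 20;   /* sign-extends the 12-bit field */
        *cmparg = (encoded_op << 20) >> 20;

        if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28))
            *oparg = 1 << *oparg;
    }
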
+ endmenu --- linux-azure-4.13.0.orig/arch/powerpc/Makefile +++ linux-azure-4.13.0/arch/powerpc/Makefile @@ -279,7 +279,7 @@ all: zImage # With make 3.82 we cannot mix normal and wildcard targets -BOOT_TARGETS1 := zImage zImage.initrd uImage +BOOT_TARGETS1 := zImage zImage.initrd uImage vmlinux.strip BOOT_TARGETS2 := zImage% dtbImage% treeImage.% cuImage.% simpleImage.% uImage.% PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2) --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/barrier.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/barrier.h @@ -34,6 +34,8 @@ #define rmb() __asm__ __volatile__ ("sync" : : : "memory") #define wmb() __asm__ __volatile__ ("sync" : : : "memory") +#define osb() asm volatile("ori 31,31,0") + #ifdef __SUBARCH_HAS_LWSYNC # define SMPWMB LWSYNC #else --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/book3s/64/hash.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/book3s/64/hash.h @@ -40,7 +40,7 @@ * Define the address range of the kernel non-linear virtual area */ #define H_KERN_VIRT_START ASM_CONST(0xD000000000000000) -#define H_KERN_VIRT_SIZE ASM_CONST(0x0000100000000000) +#define H_KERN_VIRT_SIZE ASM_CONST(0x0000400000000000) /* 64T */ /* * The vmalloc space starts at the beginning of that region, and @@ -48,9 +48,11 @@ * (we keep a quarter for the virtual memmap) */ #define H_VMALLOC_START H_KERN_VIRT_START -#define H_VMALLOC_SIZE (H_KERN_VIRT_SIZE >> 1) +#define H_VMALLOC_SIZE ASM_CONST(0x380000000000) /* 56T */ #define H_VMALLOC_END (H_VMALLOC_START + H_VMALLOC_SIZE) +#define H_KERN_IO_START H_VMALLOC_END + /* * Region IDs */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/book3s/64/hugetlb.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/book3s/64/hugetlb.h @@ -54,9 +54,7 @@ #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE static inline bool gigantic_page_supported(void) { - if (radix_enabled()) - return true; - return false; + return true; } #endif --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/book3s/64/mmu-hash.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/book3s/64/mmu-hash.h @@ -468,7 +468,7 @@ int psize, int ssize); int htab_remove_mapping(unsigned long vstart, unsigned long vend, int psize, int ssize); -extern void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); +extern void pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages); extern void demote_segment_4k(struct mm_struct *mm, unsigned long addr); #ifdef CONFIG_PPC_PSERIES --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/book3s/64/pgtable.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -272,8 +272,10 @@ extern unsigned long __kernel_virt_start; extern unsigned long __kernel_virt_size; +extern unsigned long __kernel_io_start; #define KERN_VIRT_START __kernel_virt_start #define KERN_VIRT_SIZE __kernel_virt_size +#define KERN_IO_START __kernel_io_start extern struct page *vmemmap; extern unsigned long ioremap_bot; extern unsigned long pci_io_base; @@ -298,7 +300,6 @@ * PHB_IO_BASE = ISA_IO_BASE + 64K to ISA_IO_BASE + 2G, PHB IO spaces * IOREMAP_BASE = ISA_IO_BASE + 2G to VMALLOC_START + PGTABLE_RANGE */ -#define KERN_IO_START (KERN_VIRT_START + (KERN_VIRT_SIZE >> 1)) #define FULL_IO_SIZE 0x80000000ul #define ISA_IO_BASE (KERN_IO_START) #define ISA_IO_END (KERN_IO_START + 0x10000ul) --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/book3s/64/radix.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/book3s/64/radix.h @@ -110,6 +110,8 @@ */ #define RADIX_VMEMMAP_BASE (RADIX_VMALLOC_END) +#define RADIX_KERN_IO_START 
(RADIX_KERN_VIRT_START + (RADIX_KERN_VIRT_SIZE >> 1)) + #ifndef __ASSEMBLY__ #define RADIX_PTE_TABLE_SIZE (sizeof(pte_t) << RADIX_PTE_INDEX_SIZE) #define RADIX_PMD_TABLE_SIZE (sizeof(pmd_t) << RADIX_PMD_INDEX_SIZE) --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/book3s/64/tlbflush-radix.h @@ -22,22 +22,21 @@ extern void radix__local_flush_tlb_mm(struct mm_struct *mm); extern void radix__local_flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); extern void radix__tlb_flush(struct mmu_gather *tlb); #ifdef CONFIG_SMP extern void radix__flush_tlb_mm(struct mm_struct *mm); extern void radix__flush_tlb_page(struct vm_area_struct *vma, unsigned long vmaddr); -extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); extern void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize); #else #define radix__flush_tlb_mm(mm) radix__local_flush_tlb_mm(mm) #define radix__flush_tlb_page(vma,addr) radix__local_flush_tlb_page(vma,addr) #define radix__flush_tlb_page_psize(mm,addr,p) radix__local_flush_tlb_page_psize(mm,addr,p) -#define radix__flush_tlb_pwc(tlb, addr) radix__local_flush_tlb_pwc(tlb, addr) #endif +extern void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr); +extern void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr); extern void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, unsigned long page_size); extern void radix__flush_tlb_lpid(unsigned long lpid); --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/code-patching.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/code-patching.h @@ -83,16 +83,8 @@ * On PPC64 ABIv1 the function pointer actually points to the * function's descriptor. The first entry in the descriptor is the * address of the function text. - * - * However, we may also receive pointer to an assembly symbol. To - * detect that, we first check if the function pointer we receive - * already points to kernel/module text and we only dereference it - * if it doesn't. 
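
The hash.h change quadruples the kernel non-linear region from 16T to 64T and gives vmalloc a fixed 56T of it, so the IO region becomes the remaining 8T starting at H_VMALLOC_END; that is why KERN_IO_START stops being derived as "start plus half the size" and becomes the __kernel_io_start variable. The arithmetic, restated as compile-time checks (C11, constants copied from the patch):

    #define TB (1ULL << 40)

    #define H_KERN_VIRT_START 0xD000000000000000ULL
    #define H_KERN_VIRT_SIZE  0x0000400000000000ULL   /* 64T */
    #define H_VMALLOC_START   H_KERN_VIRT_START
    #define H_VMALLOC_SIZE    0x0000380000000000ULL   /* 56T */
    #define H_VMALLOC_END     (H_VMALLOC_START + H_VMALLOC_SIZE)
    #define H_KERN_IO_START   H_VMALLOC_END

    _Static_assert(H_KERN_VIRT_SIZE == 64 * TB, "kernel virtual region is 64T");
    _Static_assert(H_VMALLOC_SIZE == 56 * TB, "vmalloc gets 56T of it");
    _Static_assert(H_KERN_VIRT_SIZE - H_VMALLOC_SIZE == 8 * TB,
                   "remaining 8T is the IO region at H_VMALLOC_END");
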
*/ - if (kernel_text_address((unsigned long)func)) - return (unsigned long)func; - else - return ((func_descr_t *)func)->entry; + return ((func_descr_t *)func)->entry; #else return (unsigned long)func; #endif --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/cpuidle.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/cpuidle.h @@ -67,6 +67,17 @@ #define ERR_DEEP_STATE_ESL_MISMATCH -2 #ifndef __ASSEMBLY__ +/* Additional SPRs that need to be saved/restored during stop */ +struct stop_sprs { + u64 pid; + u64 ldbar; + u64 fscr; + u64 hfscr; + u64 mmcr1; + u64 mmcr2; + u64 mmcra; +}; + extern u32 pnv_fastsleep_workaround_at_entry[]; extern u32 pnv_fastsleep_workaround_at_exit[]; --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/cputhreads.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/cputhreads.h @@ -2,6 +2,7 @@ #define _ASM_POWERPC_CPUTHREADS_H #ifndef __ASSEMBLY__ +#include #include #include --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/emulated_ops.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/emulated_ops.h @@ -55,6 +55,10 @@ struct ppc_emulated_entry mfdscr; struct ppc_emulated_entry mtdscr; struct ppc_emulated_entry lq_stq; + struct ppc_emulated_entry lxvw4x; + struct ppc_emulated_entry lxvh8x; + struct ppc_emulated_entry lxvd2x; + struct ppc_emulated_entry lxvb16x; #endif } ppc_emulated; --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/epapr_hcalls.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/epapr_hcalls.h @@ -508,7 +508,7 @@ static inline long epapr_hypercall0_1(unsigned int nr, unsigned long *r2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; unsigned long r; @@ -520,7 +520,7 @@ static inline long epapr_hypercall0(unsigned int nr) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; return epapr_hypercall(in, out, nr); @@ -528,7 +528,7 @@ static inline long epapr_hypercall1(unsigned int nr, unsigned long p1) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -538,7 +538,7 @@ static inline long epapr_hypercall2(unsigned int nr, unsigned long p1, unsigned long p2) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -549,7 +549,7 @@ static inline long epapr_hypercall3(unsigned int nr, unsigned long p1, unsigned long p2, unsigned long p3) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; @@ -562,7 +562,7 @@ unsigned long p2, unsigned long p3, unsigned long p4) { - unsigned long in[8]; + unsigned long in[8] = {0}; unsigned long out[8]; in[0] = p1; --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/exception-64e.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/exception-64e.h @@ -209,5 +209,11 @@ ori r3,r3,vector_offset@l; \ mtspr SPRN_IVOR##vector_number,r3; +#define RFI_TO_KERNEL \ + rfi + +#define RFI_TO_USER \ + rfi + #endif /* _ASM_POWERPC_EXCEPTION_64E_H */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/exception-64s.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/exception-64s.h @@ -35,6 +35,7 @@ * implementations as possible. */ #include +#include /* PACA save area offsets (exgen, exmc, etc) */ #define EX_R9 0 @@ -69,6 +70,73 @@ */ #define EX_R3 EX_DAR +/* + * The nop instructions allow us to insert one or more instructions to flush the + * L1-D cache when return to userspace or a guest. 
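
The macros that follow reserve a slot of three nops at every return-from-interrupt site and record the slot's address in the __rfi_flush_fixup section; at boot, do_rfi_flush_fixups() (declared in setup.h later in this patch) walks that table and overwrites the nops with whichever L1-D flush sequence the hardware needs. The general shape of that boot-time patching, sketched in C with illustrative names (the real kernel keeps the table between __start/__stop linker symbols, relocates pointers, and flushes the icache after patching):

    #include <stdint.h>

    /* Each entry is the offset from the entry itself to the three-nop
     * slot it describes; a static table stands in for the ELF section. */
    static long fixup_table[16];
    static long *fixup_end = fixup_table;  /* advanced as entries are added */

    static void patch_instruction(uint32_t *insn, uint32_t val)
    {
        *insn = val;  /* the kernel also flushes the icache for insn */
    }

    static void apply_fixups(const uint32_t instrs[3])
    {
        long *curr;

        for (curr = fixup_table; curr < fixup_end; curr++) {
            uint32_t *dest = (uint32_t *)((uintptr_t)curr + *curr);

            patch_instruction(dest + 0, instrs[0]);
            patch_instruction(dest + 1, instrs[1]);
            patch_instruction(dest + 2, instrs[2]);
        }
    }
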
+ */ +#define RFI_FLUSH_SLOT \ + RFI_FLUSH_FIXUP_SECTION; \ + nop; \ + nop; \ + nop + +#ifdef CONFIG_PPC_DEBUG_RFI +#define CHECK_TARGET_MSR_PR(srr_reg, expected_pr) \ + SET_SCRATCH0(r3); \ + mfspr r3,srr_reg; \ + extrdi r3,r3,1,63-MSR_PR_LG; \ +666: tdnei r3,expected_pr; \ + EMIT_BUG_ENTRY 666b,__FILE__,__LINE__,0; \ + GET_SCRATCH0(r3); +#else +#define CHECK_TARGET_MSR_PR(srr_reg, expected_pr) +#endif + +#define RFI_TO_KERNEL \ + CHECK_TARGET_MSR_PR(SPRN_SRR1, 0); \ + rfid + +#define RFI_TO_USER \ + CHECK_TARGET_MSR_PR(SPRN_SRR1, 1); \ + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback + +#define RFI_TO_USER_OR_KERNEL \ + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback + +#define RFI_TO_GUEST \ + RFI_FLUSH_SLOT; \ + rfid; \ + b rfi_flush_fallback + +#define HRFI_TO_KERNEL \ + CHECK_TARGET_MSR_PR(SPRN_HSRR1, 0); \ + hrfid + +#define HRFI_TO_USER \ + CHECK_TARGET_MSR_PR(SPRN_HSRR1, 1); \ + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback + +#define HRFI_TO_USER_OR_KERNEL \ + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback + +#define HRFI_TO_GUEST \ + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback + +#define HRFI_TO_UNKNOWN \ + RFI_FLUSH_SLOT; \ + hrfid; \ + b hrfi_flush_fallback + #ifdef CONFIG_RELOCATABLE #define __EXCEPTION_RELON_PROLOG_PSERIES_1(label, h) \ mfspr r11,SPRN_##h##SRR0; /* save SRR0 */ \ @@ -213,7 +281,7 @@ mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1(label, h) \ __EXCEPTION_PROLOG_PSERIES_1(label, h) @@ -227,7 +295,7 @@ mtspr SPRN_##h##SRR0,r12; \ mfspr r12,SPRN_##h##SRR1; /* and SRR1 */ \ mtspr SPRN_##h##SRR1,r10; \ - h##rfid; \ + h##RFI_TO_KERNEL; \ b . /* prevent speculative execution */ #define EXCEPTION_PROLOG_PSERIES_1_NORI(label, h) \ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/feature-fixups.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/feature-fixups.h @@ -187,7 +187,20 @@ FTR_ENTRY_OFFSET label##1b-label##3b; \ .popsection; +#define RFI_FLUSH_FIXUP_SECTION \ +951: \ + .pushsection __rfi_flush_fixup,"a"; \ + .align 2; \ +952: \ + FTR_ENTRY_OFFSET 951b-952b; \ + .popsection; + + #ifndef __ASSEMBLY__ +#include + +extern long __start___rfi_flush_fixup, __stop___rfi_flush_fixup; + void apply_feature_fixups(void); void setup_feature_keys(void); #endif --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/futex.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/futex.h @@ -29,18 +29,10 @@ : "b" (uaddr), "i" (-EFAULT), "r" (oparg) \ : "cr0", "memory") -static inline int futex_atomic_op_inuser (int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (! 
access_ok (VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; pagefault_disable(); @@ -66,17 +58,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/hugetlb.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/hugetlb.h @@ -218,18 +218,4 @@ } #endif /* CONFIG_HUGETLB_PAGE */ -/* - * FSL Book3E platforms require special gpage handling - the gpages - * are reserved early in the boot process by memblock instead of via - * the .dts as on IBM platforms. - */ -#if defined(CONFIG_HUGETLB_PAGE) && (defined(CONFIG_PPC_FSL_BOOK3E) || \ - defined(CONFIG_PPC_8xx)) -extern void __init reserve_hugetlb_gpages(void); -#else -static inline void reserve_hugetlb_gpages(void) -{ -} -#endif - #endif /* _ASM_POWERPC_HUGETLB_H */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/hvcall.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/hvcall.h @@ -240,6 +240,7 @@ #define H_GET_HCA_INFO 0x1B8 #define H_GET_PERF_COUNT 0x1BC #define H_MANAGE_TRACE 0x1C0 +#define H_GET_CPU_CHARACTERISTICS 0x1C8 #define H_FREE_LOGICAL_LAN_BUFFER 0x1D4 #define H_QUERY_INT_STATE 0x1E4 #define H_POLL_PENDING 0x1D8 @@ -318,6 +319,19 @@ #define H_SIGNAL_SYS_RESET_ALL_OTHERS -2 /* >= 0 values are CPU number */ +/* H_GET_CPU_CHARACTERISTICS return values */ +#define H_GET_CPU_CHAR_CHAR_ORI31_SPEC_BAR PPC_BIT(0) +#define H_GET_CPU_CHAR_CHAR_BCCTR_SERIAL PPC_BIT(1) +#define H_GET_CPU_CHAR_CHAR_ORI30_L1_FLUSH PPC_BIT(2) +#define H_GET_CPU_CHAR_CHAR_MTTRIG2_L1_FLUSH PPC_BIT(3) +#define H_GET_CPU_CHAR_CHAR_L1D_PRIVATE PPC_BIT(4) +#define H_GET_CPU_CHAR_CHAR_BC_HINTS_HONORED PPC_BIT(5) +#define H_GET_CPU_CHAR_CHAR_MTTRID01_THR_CFG PPC_BIT(6) + +#define H_GET_CPU_CHAR_BEHAV_FAV_SEC_VS_PERF PPC_BIT(0) +#define H_GET_CPU_CHAR_BEHAV_L1_FLUSH_LOW_PRIV PPC_BIT(1) +#define H_GET_CPU_CHAR_BEHAV_SPEC_BAR_BNDS_CHK PPC_BIT(2) + /* Flag values used in H_REGISTER_PROC_TBL hcall */ #define PROC_TABLE_OP_MASK 0x18 #define PROC_TABLE_DEREG 0x10 --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/imc-pmu.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/imc-pmu.h @@ -0,0 +1,128 @@ +#ifndef __ASM_POWERPC_IMC_PMU_H +#define __ASM_POWERPC_IMC_PMU_H + +/* + * IMC Nest Performance Monitor counter support. + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ + +#include +#include +#include +#include +#include + +/* + * For static allocation of some of the structures. + */ +#define IMC_MAX_PMUS 32 + +/* + * Compatibility macros for IMC devices + */ +#define IMC_DTB_COMPAT "ibm,opal-in-memory-counters" +#define IMC_DTB_UNIT_COMPAT "ibm,imc-counters" + + +/* + * LDBAR: Counter address and Enable/Disable macro. + * perf/imc-pmu.c has the LDBAR layout information. 
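
imc-pmu.h below describes the in-memory-collection counters; for the thread domain the core streams counter updates into a per-thread buffer whose real address is programmed into the LDBAR SPR using the two masks defined just below. A hedged sketch of how the register value is plausibly composed (the authoritative field layout lives in perf/imc-pmu.c, as the header comment says); note the address field starts at bit 13, implying an 8K-aligned buffer:

    #include <stdint.h>

    #define THREAD_IMC_LDBAR_MASK 0x0003ffffffffe000ULL  /* buffer real address */
    #define THREAD_IMC_ENABLE     0x8000000000000000ULL  /* counting enabled */

    /* Illustrative only: combine a per-thread buffer address (8K aligned,
     * since the field starts at bit 13) with the enable bit. */
    static uint64_t thread_imc_ldbar(uint64_t buf_real_addr)
    {
        return (buf_real_addr & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;
    }
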
+ */ +#define THREAD_IMC_LDBAR_MASK 0x0003ffffffffe000ULL +#define THREAD_IMC_ENABLE 0x8000000000000000ULL + +/* + * Structure to hold memory address information for imc units. + */ +struct imc_mem_info { + u64 *vbase; + u32 id; +}; + +/* + * Place holder for nest pmu events and values. + */ +struct imc_events { + u32 value; + char *name; + char *unit; + char *scale; +}; + +/* Event attribute array index */ +#define IMC_FORMAT_ATTR 0 +#define IMC_EVENT_ATTR 1 +#define IMC_CPUMASK_ATTR 2 +#define IMC_NULL_ATTR 3 + +/* PMU Format attribute macros */ +#define IMC_EVENT_OFFSET_MASK 0xffffffffULL + +/* + * Device tree parser code detects IMC pmu support and + * registers new IMC pmus. This structure will hold the + * pmu functions, events, counter memory information + * and attrs for each imc pmu and will be referenced at + * the time of pmu registration. + */ +struct imc_pmu { + struct pmu pmu; + struct imc_mem_info *mem_info; + struct imc_events **events; + /* + * Attribute groups for the PMU. Slot 0 used for + * format attribute, slot 1 used for cpusmask attribute, + * slot 2 used for event attribute. Slot 3 keep as + * NULL. + */ + const struct attribute_group *attr_groups[4]; + u32 counter_mem_size; + int domain; + /* + * flag to notify whether the memory is mmaped + * or allocated by kernel. + */ + bool imc_counter_mmaped; +}; + +/* + * Structure to hold id, lock and reference count for the imc events which + * are inited. + */ +struct imc_pmu_ref { + struct mutex lock; + unsigned int id; + int refc; +}; + +/* + * In-Memory Collection Counters type. + * Data comes from Device tree. + * Three device type are supported. + */ + +enum { + IMC_TYPE_THREAD = 0x1, + IMC_TYPE_CORE = 0x4, + IMC_TYPE_CHIP = 0x10, +}; + +/* + * Domains for IMC PMUs + */ +#define IMC_DOMAIN_NEST 1 +#define IMC_DOMAIN_CORE 2 +#define IMC_DOMAIN_THREAD 3 + +extern int init_imc_pmu(struct device_node *parent, + struct imc_pmu *pmu_ptr, int pmu_id); +extern void thread_imc_disable(void); +#endif /* __ASM_POWERPC_IMC_PMU_H */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/machdep.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/machdep.h @@ -76,7 +76,6 @@ void __noreturn (*restart)(char *cmd); void __noreturn (*halt)(void); - void (*panic)(char *str); void (*cpu_die)(void); long (*time_init)(void); /* Optional, may be NULL */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/mmu_context.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/mmu_context.h @@ -185,9 +185,10 @@ #endif } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { + return 0; } static inline void arch_exit_mmap(struct mm_struct *mm) --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/opal-api.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/opal-api.h @@ -190,7 +190,10 @@ #define OPAL_NPU_INIT_CONTEXT 146 #define OPAL_NPU_DESTROY_CONTEXT 147 #define OPAL_NPU_MAP_LPAR 148 -#define OPAL_LAST 148 +#define OPAL_IMC_COUNTERS_INIT 149 +#define OPAL_IMC_COUNTERS_START 150 +#define OPAL_IMC_COUNTERS_STOP 151 +#define OPAL_LAST 151 /* Device tree flags */ @@ -1084,6 +1087,13 @@ XIVE_DUMP_EMU_STATE = 5, }; +/* "type" argument options for OPAL_IMC_COUNTERS_* calls */ +enum { + OPAL_IMC_COUNTERS_NEST = 1, + OPAL_IMC_COUNTERS_CORE = 2, +}; + + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/opal.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/opal.h @@ -50,7 +50,7 @@ uint32_t hour_min); 
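
On the firmware side, opal-api.h above adds call tokens 149-151 plus the NEST/CORE type argument, and opal.h just below declares the opal_imc_counters_* wrappers. The expected per-unit sequence is an init call telling OPAL where the counter buffer lives and for which PIR, bracketed by start/stop; a hypothetical caller, with error handling trimmed:

    /* Hypothetical call sequence for one core's IMC unit; the
     * opal_imc_counters_* wrappers are the ones declared in opal.h. */
    static int64_t core_imc_enable(uint64_t buf_phys, uint64_t cpu_pir)
    {
        int64_t rc;

        rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE, buf_phys, cpu_pir);
        if (rc)
            return rc;
        return opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, cpu_pir);
    }

    static int64_t core_imc_disable(uint64_t cpu_pir)
    {
        return opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, cpu_pir);
    }
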
int64_t opal_cec_power_down(uint64_t request); int64_t opal_cec_reboot(void); -int64_t opal_cec_reboot2(uint32_t reboot_type, char *diag); +int64_t opal_cec_reboot2(uint32_t reboot_type, const char *diag); int64_t opal_read_nvram(uint64_t buffer, uint64_t size, uint64_t offset); int64_t opal_write_nvram(uint64_t buffer, uint64_t size, uint64_t offset); int64_t opal_handle_interrupt(uint64_t isn, __be64 *outstanding_event_mask); @@ -268,6 +268,11 @@ int64_t opal_xive_sync(uint32_t type, uint32_t id); int64_t opal_xive_dump(uint32_t type, uint32_t id); +int64_t opal_imc_counters_init(uint32_t type, uint64_t address, + uint64_t cpu_pir); +int64_t opal_imc_counters_start(uint32_t type, uint64_t cpu_pir); +int64_t opal_imc_counters_stop(uint32_t type, uint64_t cpu_pir); + /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, int depth, void *data); --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/paca.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/paca.h @@ -31,6 +31,7 @@ #endif #include #include +#include register struct paca_struct *local_paca asm("r13"); @@ -48,6 +49,9 @@ #define get_lppaca() (get_paca()->lppaca_ptr) #define get_slb_shadow() (get_paca()->slb_shadow_ptr) +/* Maximum number of threads per core. */ +#define MAX_SMT 8 + struct task_struct; /* @@ -183,6 +187,12 @@ struct paca_struct **thread_sibling_pacas; /* The PSSCR value that the kernel requested before going to stop */ u64 requested_psscr; + + /* + * Save area for additional SPRs that need to be + * saved/restored during cpuidle stop. + */ + struct stop_sprs stop_sprs; #endif #ifdef CONFIG_PPC_STD_MMU_64 @@ -191,6 +201,8 @@ u64 exmc[EX_SIZE]; /* used for machine checks */ #endif #ifdef CONFIG_PPC_BOOK3S_64 + void *rfi_flush_fallback_area; + /* Exclusive stacks for system reset and machine check exception. 
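
The new paca fields back the fallback L1-D flush: rfi_flush_fallback_area above points at the displacement buffer, while the exrfi scratch area added just below is forced onto its own 128-byte line so the flush path cannot pull neighbouring paca data back into the cache, and l1d_flush_sets/l1d_flush_congruence give the buffer's geometry. The fallback itself is just "touch one line per set and way"; an illustrative C restatement of what the rfi_flush_fallback assembly does (the 128-byte line size is an assumption of this sketch):

    #include <stdint.h>

    #define L1D_LINE_SIZE 128  /* POWER L1-D line size assumed here */

    /* Displace every L1-D line by loading a dedicated buffer covering
     * sets * ways cache lines; the kernel reads the geometry from the
     * l1d_flush_sets/l1d_flush_congruence paca fields. */
    static void l1d_fallback_flush(const char *buf, uint64_t sets, uint64_t ways)
    {
        volatile const char *p = buf;
        uint64_t i;

        for (i = 0; i < sets * ways; i++)
            (void)p[i * L1D_LINE_SIZE];
    }
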
*/ void *nmi_emergency_sp; void *mc_emergency_sp; @@ -203,6 +215,7 @@ */ u16 in_mce; u8 hmi_event_available; /* HMI event is available */ + u8 hmi_p9_special_emu; /* HMI P9 special emulation */ #endif /* Stuff for accurate time accounting */ @@ -224,6 +237,15 @@ struct sibling_subcore_state *sibling_subcore_state; #endif #endif +#ifdef CONFIG_PPC_BOOK3S_64 + /* + * rfi fallback flush must be in its own cacheline to prevent + * other paca data leaking into the L1d + */ + u64 exrfi[EX_SIZE] __aligned(0x80); + u64 l1d_flush_congruence; + u64 l1d_flush_sets; +#endif }; extern void copy_mm_to_paca(struct mm_struct *mm); --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/plpar_wrappers.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/plpar_wrappers.h @@ -325,4 +325,20 @@ return plpar_hcall_norets(H_SIGNAL_SYS_RESET, cpu); } +static inline long plpar_get_cpu_characteristics(unsigned long *character, + unsigned long *behavior) +{ + long rc; + unsigned long retbuf[PLPAR_HCALL_BUFSIZE]; + + rc = plpar_hcall(H_GET_CPU_CHARACTERISTICS, retbuf); + + if (character) + *character = retbuf[0]; + if (behavior) + *behavior = retbuf[1]; + + return rc; +} + #endif /* _ASM_POWERPC_PLPAR_WRAPPERS_H */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/setup.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/setup.h @@ -23,7 +23,6 @@ void check_for_initrd(void); void initmem_init(void); -void setup_panic(void); #define ARCH_PANIC_TIMEOUT 180 #ifdef CONFIG_PPC_PSERIES @@ -38,6 +37,19 @@ static inline void pseries_little_endian_exceptions(void) {} #endif /* CONFIG_PPC_PSERIES */ +void rfi_flush_enable(bool enable); + +/* These are bit flags */ +enum l1d_flush_type { + L1D_FLUSH_NONE = 0x1, + L1D_FLUSH_FALLBACK = 0x2, + L1D_FLUSH_ORI = 0x4, + L1D_FLUSH_MTTRIG = 0x8, +}; + +void __init setup_rfi_flush(enum l1d_flush_type, bool enable); +void do_rfi_flush_fixups(enum l1d_flush_type types); + #endif /* !__ASSEMBLY__ */ #endif /* _ASM_POWERPC_SETUP_H */ --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/smp.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/smp.h @@ -30,6 +30,7 @@ #include extern int boot_cpuid; +extern int boot_hw_cpuid; extern int spinning_secondaries; extern void cpu_die(void); --- linux-azure-4.13.0.orig/arch/powerpc/include/asm/uaccess.h +++ linux-azure-4.13.0/arch/powerpc/include/asm/uaccess.h @@ -173,6 +173,23 @@ extern long __get_user_bad(void); +/* + * This does an atomic 128 byte aligned load from userspace. + * Upto caller to do enable_kernel_vmx() before calling! 
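
plpar_get_cpu_characteristics() above is what lets a pseries guest ask the hypervisor which flush primitive the core implements. Combined with the l1d_flush_type flags and setup_rfi_flush() from setup.h, the selection presumably reduces to something like this sketch (the real policy also consults the behaviour word and command-line overrides):

    static void pseries_choose_rfi_flush(void)
    {
        unsigned long character = 0, behaviour = 0;
        enum l1d_flush_type types = L1D_FLUSH_FALLBACK;
        long rc;

        rc = plpar_get_cpu_characteristics(&character, &behaviour);
        if (rc == H_SUCCESS) {
            if (character & H_GET_CPU_CHAR_CHAR_ORI30_L1_FLUSH)
                types |= L1D_FLUSH_ORI;
            if (character & H_GET_CPU_CHAR_CHAR_MTTRIG2_L1_FLUSH)
                types |= L1D_FLUSH_MTTRIG;
        }
        /* on failure, keep only the software displacement fallback */
        setup_rfi_flush(types, true);
    }
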
+ */ +#define __get_user_atomic_128_aligned(kaddr, uaddr, err) \ + __asm__ __volatile__( \ + "1: lvx 0,0,%1 # get user\n" \ + " stvx 0,0,%2 # put kernel\n" \ + "2:\n" \ + ".section .fixup,\"ax\"\n" \ + "3: li %0,%3\n" \ + " b 2b\n" \ + ".previous\n" \ + EX_TABLE(1b, 3b) \ + : "=r" (err) \ + : "b" (uaddr), "b" (kaddr), "i" (-EFAULT), "0" (err)) + #define __get_user_asm(x, addr, err, op) \ __asm__ __volatile__( \ "1: "op" %1,0(%2) # get_user\n" \ --- linux-azure-4.13.0.orig/arch/powerpc/kernel/align.c +++ linux-azure-4.13.0/arch/powerpc/kernel/align.c @@ -235,6 +235,28 @@ #define SWIZ_PTR(p) ((unsigned char __user *)((p) ^ swiz)) +#define __get_user_or_set_dar(_regs, _dest, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__get_user_inatomic(_dest, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + +#define __put_user_or_set_dar(_regs, _src, _addr) \ + ({ \ + int rc = 0; \ + typeof(_addr) __addr = (_addr); \ + if (__put_user_inatomic(_src, __addr)) { \ + _regs->dar = (unsigned long)__addr; \ + rc = -EFAULT; \ + } \ + rc; \ + }) + static int emulate_multiple(struct pt_regs *regs, unsigned char __user *addr, unsigned int reg, unsigned int nb, unsigned int flags, unsigned int instr, @@ -263,9 +285,10 @@ } else { unsigned long pc = regs->nip ^ (swiz & 4); - if (__get_user_inatomic(instr, - (unsigned int __user *)pc)) + if (__get_user_or_set_dar(regs, instr, + (unsigned int __user *)pc)) return -EFAULT; + if (swiz == 0 && (flags & SW)) instr = cpu_to_le32(instr); nb = (instr >> 11) & 0x1f; @@ -309,31 +332,31 @@ ((nb0 + 3) / 4) * sizeof(unsigned long)); for (i = 0; i < nb; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__get_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) + if (__get_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } else { for (i = 0; i < nb; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; if (nb0 > 0) { rptr = ®s->gpr[0]; addr += nb; for (i = 0; i < nb0; ++i, ++p) - if (__put_user_inatomic(REG_BYTE(rptr, - i ^ bswiz), - SWIZ_PTR(p))) + if (__put_user_or_set_dar(regs, + REG_BYTE(rptr, i ^ bswiz), + SWIZ_PTR(p))) return -EFAULT; } } @@ -345,29 +368,32 @@ * Only POWER6 has these instructions, and it does true little-endian, * so we don't need the address swizzling. 
*/ -static int emulate_fp_pair(unsigned char __user *addr, unsigned int reg, - unsigned int flags) +static int emulate_fp_pair(struct pt_regs *regs, unsigned char __user *addr, + unsigned int reg, unsigned int flags) { char *ptr0 = (char *) ¤t->thread.TS_FPR(reg); char *ptr1 = (char *) ¤t->thread.TS_FPR(reg+1); - int i, ret, sw = 0; + int i, sw = 0; if (reg & 1) return 0; /* invalid form: FRS/FRT must be even */ if (flags & SW) sw = 7; - ret = 0; + for (i = 0; i < 8; ++i) { if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); + if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); + if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } } - if (ret) - return -EFAULT; + return 1; /* exception handled and fixed up */ } @@ -377,24 +403,27 @@ { char *ptr0 = (char *)®s->gpr[reg]; char *ptr1 = (char *)®s->gpr[reg+1]; - int i, ret, sw = 0; + int i, sw = 0; if (reg & 1) return 0; /* invalid form: GPR must be even */ if (flags & SW) sw = 7; - ret = 0; + for (i = 0; i < 8; ++i) { if (!(flags & ST)) { - ret |= __get_user(ptr0[i^sw], addr + i); - ret |= __get_user(ptr1[i^sw], addr + i + 8); + if (__get_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__get_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } else { - ret |= __put_user(ptr0[i^sw], addr + i); - ret |= __put_user(ptr1[i^sw], addr + i + 8); + if (__put_user_or_set_dar(regs, ptr0[i^sw], addr + i)) + return -EFAULT; + if (__put_user_or_set_dar(regs, ptr1[i^sw], addr + i + 8)) + return -EFAULT; } } - if (ret) - return -EFAULT; + return 1; /* exception handled and fixed up */ } #endif /* CONFIG_PPC64 */ @@ -687,9 +716,14 @@ for (j = 0; j < length; j += elsize) { for (i = 0; i < elsize; ++i) { if (flags & ST) - ret |= __put_user(ptr[i^sw], addr + i); + ret = __put_user_or_set_dar(regs, ptr[i^sw], + addr + i); else - ret |= __get_user(ptr[i^sw], addr + i); + ret = __get_user_or_set_dar(regs, ptr[i^sw], + addr + i); + + if (ret) + return ret; } ptr += elsize; #ifdef __LITTLE_ENDIAN__ @@ -739,7 +773,7 @@ unsigned int dsisr; unsigned char __user *addr; unsigned long p, swiz; - int ret, i; + int i; union data { u64 ll; double dd; @@ -936,7 +970,7 @@ if (flags & F) { /* Special case for 16-byte FP loads and stores */ PPC_WARN_ALIGNMENT(fp_pair, regs); - return emulate_fp_pair(addr, reg, flags); + return emulate_fp_pair(regs, addr, reg, flags); } else { #ifdef CONFIG_PPC64 /* Special case for 16-byte loads and stores */ @@ -966,15 +1000,12 @@ } data.ll = 0; - ret = 0; p = (unsigned long)addr; for (i = 0; i < nb; i++) - ret |= __get_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); - - if (unlikely(ret)) - return -EFAULT; + if (__get_user_or_set_dar(regs, data.v[start + i], + SWIZ_PTR(p++))) + return -EFAULT; } else if (flags & F) { data.ll = current->thread.TS_FPR(reg); @@ -1046,15 +1077,13 @@ break; } - ret = 0; p = (unsigned long)addr; for (i = 0; i < nb; i++) - ret |= __put_user_inatomic(data.v[start + i], - SWIZ_PTR(p++)); + if (__put_user_or_set_dar(regs, data.v[start + i], + SWIZ_PTR(p++))) + return -EFAULT; - if (unlikely(ret)) - return -EFAULT; } else if (flags & F) current->thread.TS_FPR(reg) = data.ll; else --- 
linux-azure-4.13.0.orig/arch/powerpc/kernel/asm-offsets.c +++ linux-azure-4.13.0/arch/powerpc/kernel/asm-offsets.c @@ -237,6 +237,11 @@ OFFSET(PACA_NMI_EMERG_SP, paca_struct, nmi_emergency_sp); OFFSET(PACA_IN_MCE, paca_struct, in_mce); OFFSET(PACA_IN_NMI, paca_struct, in_nmi); + OFFSET(PACA_RFI_FLUSH_FALLBACK_AREA, paca_struct, rfi_flush_fallback_area); + OFFSET(PACA_EXRFI, paca_struct, exrfi); + OFFSET(PACA_L1D_FLUSH_CONGRUENCE, paca_struct, l1d_flush_congruence); + OFFSET(PACA_L1D_FLUSH_SETS, paca_struct, l1d_flush_sets); + #endif OFFSET(PACAHWCPUID, paca_struct, hw_cpu_id); OFFSET(PACAKEXECSTATE, paca_struct, kexec_state); @@ -746,6 +751,14 @@ OFFSET(PACA_SUBCORE_SIBLING_MASK, paca_struct, subcore_sibling_mask); OFFSET(PACA_SIBLING_PACA_PTRS, paca_struct, thread_sibling_pacas); OFFSET(PACA_REQ_PSSCR, paca_struct, requested_psscr); +#define STOP_SPR(x, f) OFFSET(x, paca_struct, stop_sprs.f) + STOP_SPR(STOP_PID, pid); + STOP_SPR(STOP_LDBAR, ldbar); + STOP_SPR(STOP_FSCR, fscr); + STOP_SPR(STOP_HFSCR, hfscr); + STOP_SPR(STOP_MMCR1, mmcr1); + STOP_SPR(STOP_MMCR2, mmcr2); + STOP_SPR(STOP_MMCRA, mmcra); #endif DEFINE(PPC_DBELL_SERVER, PPC_DBELL_SERVER); --- linux-azure-4.13.0.orig/arch/powerpc/kernel/cpu_setup_power.S +++ linux-azure-4.13.0/arch/powerpc/kernel/cpu_setup_power.S @@ -102,6 +102,7 @@ li r0,0 mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 + mtspr SPRN_PID,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) or r3, r3, r4 @@ -126,6 +127,7 @@ li r0,0 mtspr SPRN_PSSCR,r0 mtspr SPRN_LPID,r0 + mtspr SPRN_PID,r0 mfspr r3,SPRN_LPCR LOAD_REG_IMMEDIATE(r4, LPCR_PECEDH | LPCR_PECE_HVEE | LPCR_HVICE | LPCR_HEIC) or r3, r3, r4 --- linux-azure-4.13.0.orig/arch/powerpc/kernel/dt_cpu_ftrs.c +++ linux-azure-4.13.0/arch/powerpc/kernel/dt_cpu_ftrs.c @@ -102,10 +102,10 @@ case PVR_POWER8: case PVR_POWER8E: case PVR_POWER8NVL: - __flush_tlb_power8(POWER8_TLB_SETS); + __flush_tlb_power8(TLB_INVAL_SCOPE_GLOBAL); break; case PVR_POWER9: - __flush_tlb_power9(POWER9_TLB_SETS_HASH); + __flush_tlb_power9(TLB_INVAL_SCOPE_GLOBAL); break; default: pr_err("unknown CPU version for boot TLB flush\n"); @@ -746,6 +746,9 @@ cur_cpu_spec->cpu_features |= CPU_FTR_HVMODE; } + /* Make sure powerpc_base_platform is non-NULL */ + powerpc_base_platform = cur_cpu_spec->platform; + system_registers.lpcr = mfspr(SPRN_LPCR); system_registers.hfscr = mfspr(SPRN_HFSCR); system_registers.fscr = mfspr(SPRN_FSCR); --- linux-azure-4.13.0.orig/arch/powerpc/kernel/eeh.c +++ linux-azure-4.13.0/arch/powerpc/kernel/eeh.c @@ -1018,6 +1018,10 @@ } else if ((ret = eeh_ops->init())) return ret; + /* Initialize PHB PEs */ + list_for_each_entry_safe(hose, tmp, &hose_list, list_node) + eeh_dev_phb_init_dynamic(hose); + /* Initialize EEH event */ ret = eeh_event_init(); if (ret) --- linux-azure-4.13.0.orig/arch/powerpc/kernel/eeh_dev.c +++ linux-azure-4.13.0/arch/powerpc/kernel/eeh_dev.c @@ -83,21 +83,3 @@ /* EEH PE for PHB */ eeh_phb_pe_create(phb); } - -/** - * eeh_dev_phb_init - Create EEH devices for devices included in existing PHBs - * - * Scan all the existing PHBs and create EEH devices for their OF - * nodes and their children OF nodes - */ -static int __init eeh_dev_phb_init(void) -{ - struct pci_controller *phb, *tmp; - - list_for_each_entry_safe(phb, tmp, &hose_list, list_node) - eeh_dev_phb_init_dynamic(phb); - - return 0; -} - -core_initcall(eeh_dev_phb_init); --- linux-azure-4.13.0.orig/arch/powerpc/kernel/entry_64.S +++ linux-azure-4.13.0/arch/powerpc/kernel/entry_64.S @@ -37,6 +37,11 @@ 
#include #include #include +#ifdef CONFIG_PPC_BOOK3S +#include +#else +#include +#endif /* * System calls. @@ -262,13 +267,23 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r13,GPR13(r1) /* only restore r13 if returning to usermode */ + ld r2,GPR2(r1) + ld r1,GPR1(r1) + mtlr r4 + mtcr r5 + mtspr SPRN_SRR0,r7 + mtspr SPRN_SRR1,r8 + RFI_TO_USER + b . /* prevent speculative execution */ + + /* exit to kernel */ 1: ld r2,GPR2(r1) ld r1,GPR1(r1) mtlr r4 mtcr r5 mtspr SPRN_SRR0,r7 mtspr SPRN_SRR1,r8 - RFI + RFI_TO_KERNEL b . /* prevent speculative execution */ .Lsyscall_error: @@ -397,8 +412,7 @@ mtmsrd r10, 1 mtspr SPRN_SRR0, r11 mtspr SPRN_SRR1, r12 - - rfid + RFI_TO_USER b . /* prevent speculative execution */ #endif _ASM_NOKPROBE_SYMBOL(system_call_common); @@ -878,7 +892,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ACCOUNT_CPU_USER_EXIT(r13, r2, r4) REST_GPR(13, r1) -1: + mtspr SPRN_SRR1,r3 ld r2,_CCR(r1) @@ -891,8 +905,22 @@ ld r3,GPR3(r1) ld r4,GPR4(r1) ld r1,GPR1(r1) + RFI_TO_USER + b . /* prevent speculative execution */ - rfid +1: mtspr SPRN_SRR1,r3 + + ld r2,_CCR(r1) + mtcrf 0xFF,r2 + ld r2,_NIP(r1) + mtspr SPRN_SRR0,r2 + + ld r0,GPR0(r1) + ld r2,GPR2(r1) + ld r3,GPR3(r1) + ld r4,GPR4(r1) + ld r1,GPR1(r1) + RFI_TO_KERNEL b . /* prevent speculative execution */ #endif /* CONFIG_PPC_BOOK3E */ @@ -1078,7 +1106,7 @@ mtspr SPRN_SRR0,r5 mtspr SPRN_SRR1,r6 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ rtas_return_loc: @@ -1103,7 +1131,7 @@ mtspr SPRN_SRR0,r3 mtspr SPRN_SRR1,r4 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ _ASM_NOKPROBE_SYMBOL(__enter_rtas) _ASM_NOKPROBE_SYMBOL(rtas_return_loc) @@ -1176,7 +1204,7 @@ LOAD_REG_IMMEDIATE(r12, MSR_SF | MSR_ISF | MSR_LE) andc r11,r11,r12 mtsrr1 r11 - rfid + RFI_TO_KERNEL #endif /* CONFIG_PPC_BOOK3E */ 1: /* Return from OF */ --- linux-azure-4.13.0.orig/arch/powerpc/kernel/exceptions-64s.S +++ linux-azure-4.13.0/arch/powerpc/kernel/exceptions-64s.S @@ -253,7 +253,7 @@ LOAD_HANDLER(r12, machine_check_handle_early) 1: mtspr SPRN_SRR0,r12 mtspr SPRN_SRR1,r11 - rfid + RFI_TO_KERNEL b . /* prevent speculative execution */ 2: /* Stack overflow. Stay on emergency stack and panic. @@ -442,7 +442,7 @@ li r3,MSR_ME andc r10,r10,r3 /* Turn off MSR_ME */ mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 2: /* @@ -460,7 +460,7 @@ */ bl machine_check_queue_event MACHINE_CHECK_HANDLER_WINDUP - rfid + RFI_TO_USER_OR_KERNEL 9: /* Deliver the machine check to host kernel in V mode. */ MACHINE_CHECK_HANDLER_WINDUP @@ -595,6 +595,9 @@ stw r9,PACA_EXSLB+EX_CCR(r13) /* save CR in exc. frame */ std r10,PACA_EXSLB+EX_LR(r13) /* save LR */ + andi. r9,r11,MSR_PR /* check for exception from userspace */ + cmpdi cr4,r9,MSR_PR + /* * Test MSR_RI before calling slb_allocate_realmode, because the * MSR in r11 gets clobbered. However we still want to allocate @@ -621,9 +624,12 @@ /* All done -- return from exception. */ + bne cr4,1f /* returning to kernel */ + .machine push .machine "power4" mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ mtcrf 0x02,r9 /* I/D indication is in cr6 */ mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ @@ -637,9 +643,30 @@ ld r11,PACA_EXSLB+EX_R11(r13) ld r12,PACA_EXSLB+EX_R12(r13) ld r13,PACA_EXSLB+EX_R13(r13) - rfid + RFI_TO_USER + b . 
/* prevent speculative execution */ +1: +.machine push +.machine "power4" + mtcrf 0x80,r9 + mtcrf 0x08,r9 /* MSR[PR] indication is in cr4 */ + mtcrf 0x04,r9 /* MSR[RI] indication is in cr5 */ + mtcrf 0x02,r9 /* I/D indication is in cr6 */ + mtcrf 0x01,r9 /* slb_allocate uses cr0 and cr7 */ +.machine pop + + RESTORE_CTR(r9, PACA_EXSLB) + RESTORE_PPR_PACA(PACA_EXSLB, r9) + mr r3,r12 + ld r9,PACA_EXSLB+EX_R9(r13) + ld r10,PACA_EXSLB+EX_R10(r13) + ld r11,PACA_EXSLB+EX_R11(r13) + ld r12,PACA_EXSLB+EX_R12(r13) + ld r13,PACA_EXSLB+EX_R13(r13) + RFI_TO_KERNEL b . /* prevent speculative execution */ + 2: std r3,PACA_EXSLB+EX_DAR(r13) mr r3,r12 mfspr r11,SPRN_SRR0 @@ -648,7 +675,7 @@ mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . 8: std r3,PACA_EXSLB+EX_DAR(r13) @@ -659,7 +686,7 @@ mtspr SPRN_SRR0,r10 ld r10,PACAKMSR(r13) mtspr SPRN_SRR1,r10 - rfid + RFI_TO_KERNEL b . EXC_COMMON_BEGIN(unrecov_slb) @@ -734,7 +761,29 @@ EXC_VIRT(program_check, 0x4700, 0x100, 0x700) TRAMP_KVM(PACA_EXGEN, 0x700) EXC_COMMON_BEGIN(program_check_common) - EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) + /* + * It's possible to receive a TM Bad Thing type program check with + * userspace register values (in particular r1), but with SRR1 reporting + * that we came from the kernel. Normally that would confuse the bad + * stack logic, and we would report a bad kernel stack pointer. Instead + * we switch to the emergency stack if we're taking a TM Bad Thing from + * the kernel. + */ + li r10,MSR_PR /* Build a mask of MSR_PR .. */ + oris r10,r10,0x200000@h /* .. and SRR1_PROGTM */ + and r10,r10,r12 /* Mask SRR1 with that. */ + srdi r10,r10,8 /* Shift it so we can compare */ + cmpldi r10,(0x200000 >> 8) /* .. with an immediate. */ + bne 1f /* If != go to normal path. */ + + /* SRR1 had PR=0 and SRR1_PROGTM=1, so use the emergency stack */ + andi. r10,r12,MSR_PR; /* Set CR0 correctly for label */ + /* 3 in EXCEPTION_PROLOG_COMMON */ + mr r10,r1 /* Save r1 */ + ld r1,PACAEMERGSP(r13) /* Use emergency stack */ + subi r1,r1,INT_FRAME_SIZE /* alloc stack frame */ + b 3f /* Jump into the macro !! */ +1: EXCEPTION_PROLOG_COMMON(0x700, PACA_EXGEN) bl save_nvgprs RECONCILE_IRQ_STATE(r10, r11) addi r3,r1,STACK_FRAME_OVERHEAD @@ -775,7 +824,7 @@ #endif -EXC_REAL_MASKABLE(decrementer, 0x900, 0x80) +EXC_REAL_OOL_MASKABLE(decrementer, 0x900, 0x80) EXC_VIRT_MASKABLE(decrementer, 0x4900, 0x80, 0x900) TRAMP_KVM(PACA_EXGEN, 0x900) EXC_COMMON_ASYNC(decrementer_common, 0x900, timer_interrupt) @@ -882,7 +931,7 @@ mtspr SPRN_SRR0,r10 ; \ ld r10,PACAKMSR(r13) ; \ mtspr SPRN_SRR1,r10 ; \ - rfid ; \ + RFI_TO_KERNEL ; \ b . ; /* prevent speculative execution */ #define SYSCALL_FASTENDIAN \ @@ -891,7 +940,7 @@ xori r12,r12,MSR_LE ; \ mtspr SPRN_SRR1,r12 ; \ mr r13,r9 ; \ - rfid ; /* return to userspace */ \ + RFI_TO_USER ; /* return to userspace */ \ b . ; /* prevent speculative execution */ #if defined(CONFIG_RELOCATABLE) @@ -1010,6 +1059,8 @@ EXCEPTION_PROLOG_COMMON_3(0xe60) addi r3,r1,STACK_FRAME_OVERHEAD BRANCH_LINK_TO_FAR(hmi_exception_realmode) /* Function call ABI */ + cmpdi cr0,r3,0 + /* Windup the stack. */ /* Move original HSRR0 and HSRR1 into the respective regs */ ld r9,_MSR(r1) @@ -1026,10 +1077,15 @@ REST_8GPRS(2, r1) REST_GPR(10, r1) ld r11,_CCR(r1) + REST_2GPRS(12, r1) + bne 1f mtcr r11 REST_GPR(11, r1) - REST_2GPRS(12, r1) - /* restore original r1. 
*/ + ld r1,GPR1(r1) + HRFI_TO_USER_OR_KERNEL + +1: mtcr r11 + REST_GPR(11, r1) ld r1,GPR1(r1) /* @@ -1042,8 +1098,9 @@ EXCEPTION_PROLOG_0(PACA_EXGEN) b tramp_real_hmi_exception -EXC_COMMON_ASYNC(hmi_exception_common, 0xe60, handle_hmi_exception) - +EXC_COMMON_BEGIN(hmi_exception_common) +EXCEPTION_COMMON(PACA_EXGEN, 0xe60, hmi_exception_common, handle_hmi_exception, + ret_from_except, FINISH_NAP;ADD_NVGPRS;ADD_RECONCILE;RUNLATCH_ON) EXC_REAL_OOL_MASKABLE_HV(h_doorbell, 0xe80, 0x20) EXC_VIRT_OOL_MASKABLE_HV(h_doorbell, 0x4e80, 0x20, 0xe80) @@ -1276,7 +1333,7 @@ ld r11,PACA_EXGEN+EX_R11(r13) ld r12,PACA_EXGEN+EX_R12(r13) ld r13,PACA_EXGEN+EX_R13(r13) - HRFID + HRFI_TO_UNKNOWN b . #endif @@ -1383,10 +1440,94 @@ ld r10,PACA_EXGEN+EX_R10(r13); \ ld r11,PACA_EXGEN+EX_R11(r13); \ GET_SCRATCH0(r13); \ - ##_H##rfid; \ + ##_H##RFI_TO_KERNEL; \ b .; \ MASKED_DEC_HANDLER(_H) +TRAMP_REAL_BEGIN(rfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load addresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + hwsync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + rfid + +TRAMP_REAL_BEGIN(hrfi_flush_fallback) + SET_SCRATCH0(r13); + GET_PACA(r13); + std r9,PACA_EXRFI+EX_R9(r13) + std r10,PACA_EXRFI+EX_R10(r13) + std r11,PACA_EXRFI+EX_R11(r13) + std r12,PACA_EXRFI+EX_R12(r13) + std r8,PACA_EXRFI+EX_R13(r13) + mfctr r9 + ld r10,PACA_RFI_FLUSH_FALLBACK_AREA(r13) + ld r11,PACA_L1D_FLUSH_SETS(r13) + ld r12,PACA_L1D_FLUSH_CONGRUENCE(r13) + /* + * The load addresses are at staggered offsets within cachelines, + * which suits some pipelines better (on others it should not + * hurt). + */ + addi r12,r12,8 + mtctr r11 + DCBT_STOP_ALL_STREAM_IDS(r11) /* Stop prefetch streams */ + + /* order ld/st prior to dcbt stop all streams with flushing */ + hwsync +1: li r8,0 + .rept 8 /* 8-way set associative */ + ldx r11,r10,r8 + add r8,r8,r12 + xor r11,r11,r11 // Ensure r11 is 0 even if fallback area is not + add r8,r8,r11 // Add 0, this creates a dependency on the ldx + .endr + addi r10,r10,128 /* 128 byte cache line */ + bdnz 1b + + mtctr r9 + ld r9,PACA_EXRFI+EX_R9(r13) + ld r10,PACA_EXRFI+EX_R10(r13) + ld r11,PACA_EXRFI+EX_R11(r13) + ld r12,PACA_EXRFI+EX_R12(r13) + ld r8,PACA_EXRFI+EX_R13(r13) + GET_SCRATCH0(r13); + hrfid + /* * Real mode exceptions actually use this too, but alternate * instruction code patches (which end up in the common .text area) @@ -1406,7 +1547,7 @@ addi r13, r13, 4 mtspr SPRN_SRR0, r13 GET_SCRATCH0(r13) - rfid + RFI_TO_KERNEL b . 
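
For reference, the fallback trampolines above displace the entire L1d by loading from every set and way of the pre-allocated fallback area before the rfid/hrfid. A minimal userspace model of that traversal, assuming a 32K, 8-way L1d with 128-byte lines (the kernel instead reads the real geometry from the PACA fields loaded above):

/*
 * Userspace model of the fallback displacement flush. The 32K/8-way
 * geometry is an assumption for illustration; the kernel uses
 * PACA_L1D_FLUSH_SETS and PACA_L1D_FLUSH_CONGRUENCE.
 */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define L1D_SIZE	(32 * 1024)
#define LINE_SIZE	128	/* 128 byte cache line */
#define WAYS		8	/* 8-way set associative */

int main(void)
{
	size_t congruence = L1D_SIZE / WAYS;	/* bytes covered by one way */
	size_t sets = congruence / LINE_SIZE;	/* lines per way */
	size_t stride = congruence + 8;		/* the staggered "addi r12,r12,8" */
	volatile uint8_t *area = calloc(2, L1D_SIZE);	/* 2x L1d, as in setup_rfi_flush() */
	uint64_t sum = 0;

	if (!area)
		return 1;

	/* one load per way per set, mirroring the ".rept 8" / "bdnz" loop */
	for (size_t set = 0; set < sets; set++)
		for (size_t way = 0; way < WAYS; way++)
			sum += area[set * LINE_SIZE + way * stride];

	printf("sets=%zu loads=%zu sum=%llu\n", sets, sets * WAYS,
	       (unsigned long long)sum);
	free((void *)area);
	return 0;
}
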
TRAMP_REAL_BEGIN(kvmppc_skip_Hinterrupt) @@ -1418,7 +1559,7 @@ addi r13, r13, 4 mtspr SPRN_HSRR0, r13 GET_SCRATCH0(r13) - hrfid + HRFI_TO_KERNEL b . #endif --- linux-azure-4.13.0.orig/arch/powerpc/kernel/fadump.c +++ linux-azure-4.13.0/arch/powerpc/kernel/fadump.c @@ -1446,6 +1446,25 @@ return; } +static int fadump_panic_event(struct notifier_block *this, + unsigned long event, void *ptr) +{ + /* + * If firmware-assisted dump has been registered then trigger + * firmware-assisted dump and let firmware handle everything + * else. If this returns, then fadump was not registered, so + * go through the rest of the panic path. + */ + crash_fadump(NULL, ptr); + + return NOTIFY_DONE; +} + +static struct notifier_block fadump_panic_block = { + .notifier_call = fadump_panic_event, + .priority = INT_MIN /* may not return; must be done last */ +}; + /* * Prepare for firmware-assisted dump. */ @@ -1478,6 +1497,9 @@ init_fadump_mem_struct(&fdm, fw_dump.reserve_dump_area_start); fadump_init_files(); + atomic_notifier_chain_register(&panic_notifier_list, + &fadump_panic_block); + return 1; } subsys_initcall(setup_fadump); --- linux-azure-4.13.0.orig/arch/powerpc/kernel/idle_book3s.S +++ linux-azure-4.13.0/arch/powerpc/kernel/idle_book3s.S @@ -85,7 +85,61 @@ std r3,_WORT(r1) mfspr r3,SPRN_WORC std r3,_WORC(r1) +/* + * On POWER9, there are idle states such as stop4, invoked via cpuidle, + * that lose hypervisor resources. In such cases, we need to save + * additional SPRs before entering those idle states so that they can + * be restored to their older values on wakeup from the idle state. + * + * On POWER8, the only such deep idle state is winkle which is used + * only in the context of CPU-Hotplug, where these additional SPRs are + * reinitialized to a sane value. Hence there is no need to save/restore + * these SPRs. + */ +BEGIN_FTR_SECTION + blr +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) +power9_save_additional_sprs: + mfspr r3, SPRN_PID + mfspr r4, SPRN_LDBAR + std r3, STOP_PID(r13) + std r4, STOP_LDBAR(r13) + + mfspr r3, SPRN_FSCR + mfspr r4, SPRN_HFSCR + std r3, STOP_FSCR(r13) + std r4, STOP_HFSCR(r13) + + mfspr r3, SPRN_MMCRA + mfspr r4, SPRN_MMCR1 + std r3, STOP_MMCRA(r13) + std r4, STOP_MMCR1(r13) + + mfspr r3, SPRN_MMCR2 + std r3, STOP_MMCR2(r13) + blr + +power9_restore_additional_sprs: + ld r3,_LPCR(r1) + ld r4, STOP_PID(r13) + mtspr SPRN_LPCR,r3 + mtspr SPRN_PID, r4 + + ld r3, STOP_LDBAR(r13) + ld r4, STOP_FSCR(r13) + mtspr SPRN_LDBAR, r3 + mtspr SPRN_FSCR, r4 + + ld r3, STOP_HFSCR(r13) + ld r4, STOP_MMCRA(r13) + mtspr SPRN_HFSCR, r3 + mtspr SPRN_MMCRA, r4 + /* We have already restored PACA_MMCR0 */ + ld r3, STOP_MMCR1(r13) + ld r4, STOP_MMCR2(r13) + mtspr SPRN_MMCR1, r3 + mtspr SPRN_MMCR2, r4 blr /* @@ -809,9 +863,16 @@ mtctr r12 bctrl +/* + * On POWER9, we can come here on wakeup from a cpuidle stop state. + * Hence restore the additional SPRs to the saved value. + * + * On POWER8, we come here only on winkle. Since winkle is used + * only in the case of CPU-Hotplug, we don't need to restore + * the additional SPRs. 
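
In C terms, the save/restore pair above copies a handful of SPRs into the per-CPU stop_sprs area before a deep stop state and copies them back on wakeup. A rough userspace sketch, with a plain array standing in for mfspr/mtspr and field names mirroring the STOP_SPR offsets from asm-offsets.c (all names here are illustrative):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* stand-in for the real SPR file; indices are illustrative */
enum { SPR_PID, SPR_LDBAR, SPR_FSCR, SPR_HFSCR,
       SPR_MMCR1, SPR_MMCR2, SPR_MMCRA, NR_SPRS };
static uint64_t spr[NR_SPRS];

/* mirrors the stop_sprs save area embedded in the paca */
struct stop_sprs { uint64_t v[NR_SPRS]; };

static void save_additional_sprs(struct stop_sprs *s)
{
	memcpy(s->v, spr, sizeof(s->v));	/* power9_save_additional_sprs */
}

static void restore_additional_sprs(const struct stop_sprs *s)
{
	memcpy(spr, s->v, sizeof(s->v));	/* power9_restore_additional_sprs */
}

int main(void)
{
	struct stop_sprs paca_save;

	spr[SPR_PID] = 42;
	save_additional_sprs(&paca_save);	/* before requesting stop4 */
	memset(spr, 0, sizeof(spr));		/* the deep stop state loses them */
	restore_additional_sprs(&paca_save);	/* wakeup path */
	printf("PID after wakeup: %llu\n", (unsigned long long)spr[SPR_PID]);
	return 0;
}
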
+ */ BEGIN_FTR_SECTION - ld r4,_LPCR(r1) - mtspr SPRN_LPCR,r4 + bl power9_restore_additional_sprs END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) hypervisor_state_restored: --- linux-azure-4.13.0.orig/arch/powerpc/kernel/kprobes.c +++ linux-azure-4.13.0/arch/powerpc/kernel/kprobes.c @@ -600,7 +600,12 @@ unsigned long arch_deref_entry_point(void *entry) { - return ppc_global_function_entry(entry); +#ifdef PPC64_ELF_ABI_v1 + if (!kernel_text_address((unsigned long)entry)) + return ppc_global_function_entry(entry); + else +#endif + return (unsigned long)entry; } NOKPROBE_SYMBOL(arch_deref_entry_point); --- linux-azure-4.13.0.orig/arch/powerpc/kernel/mce.c +++ linux-azure-4.13.0/arch/powerpc/kernel/mce.c @@ -22,11 +22,14 @@ #undef DEBUG #define pr_fmt(fmt) "mce: " fmt +#include #include #include #include #include #include + +#include #include static DEFINE_PER_CPU(int, mce_nest_count); @@ -446,3 +449,61 @@ return 0; } EXPORT_SYMBOL(get_mce_fault_addr); + +/* + * This function is called in real mode. Strictly no printk's please. + * + * regs->nip and regs->msr contain srr0 and srr1. + */ +long machine_check_early(struct pt_regs *regs) +{ + long handled = 0; + + __this_cpu_inc(irq_stat.mce_exceptions); + + if (cur_cpu_spec && cur_cpu_spec->machine_check_early) + handled = cur_cpu_spec->machine_check_early(regs); + return handled; +} + +long hmi_exception_realmode(struct pt_regs *regs) +{ + __this_cpu_inc(irq_stat.hmi_exceptions); + +#ifdef CONFIG_PPC_BOOK3S_64 + /* Workaround for P9 vector CI loads (see p9_hmi_special_emu) */ + if (pvr_version_is(PVR_POWER9)) { + unsigned long hmer = mfspr(SPRN_HMER); + + /* Do we have the debug bit set */ + if (hmer & PPC_BIT(17)) { + hmer &= ~PPC_BIT(17); + mtspr(SPRN_HMER, hmer); + + /* + * Now to avoid problems with soft-disable we + * only do the emulation if we are coming from + * user space + */ + if (user_mode(regs)) + local_paca->hmi_p9_special_emu = 1; + + /* + * Don't bother going to OPAL if that's the + * only relevant bit. + */ + if (!(hmer & mfspr(SPRN_HMEER))) + return local_paca->hmi_p9_special_emu; + } + } +#endif /* CONFIG_PPC_BOOK3S_64 */ + + wait_for_subcore_guest_exit(); + + if (ppc_md.hmi_exception_early) + ppc_md.hmi_exception_early(regs); + + wait_for_tb_resync(); + + return 1; +} --- linux-azure-4.13.0.orig/arch/powerpc/kernel/paca.c +++ linux-azure-4.13.0/arch/powerpc/kernel/paca.c @@ -198,6 +198,7 @@ { u64 limit; int cpu; + int nr_cpus; limit = ppc64_rma_size; @@ -210,20 +211,32 @@ limit = min(0x10000000ULL, limit); #endif - paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids); + /* + * Always align up the nr_cpu_ids to SMT threads and allocate + * the paca. This will help us to prepare for a situation where + * boot cpu id > nr_cpu_ids. We will use the last nthreads + * slots (nthreads == threads per core) to accommodate a core + * that contains boot cpu thread. + * + * Do not change nr_cpu_ids value here. Let us do that in + * early_init_dt_scan_cpus() where we know the exact value + * of threads per core. 
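
The remapping deferred to early_init_dt_scan_cpus() is plain modular arithmetic, implemented by adjust_boot_cpuid() further below. A standalone sketch of that arithmetic, using the SMT=8, nr_cpus=1, boot cpu 33 example from the prom.c comment:

#include <stdio.h>

#define ALIGN_UP(x, a)	((((x) + (a) - 1) / (a)) * (a))	/* models _ALIGN_UP */

int main(void)
{
	int nthreads = 8;			/* threads per core (MAX_SMT) */
	int nr_cpu_ids = ALIGN_UP(1, nthreads);	/* nr_cpus=1 aligned up -> 8 */
	int boot_cpuid = 33;			/* logical id found in the DT walk */

	if (boot_cpuid >= nr_cpu_ids)
		boot_cpuid = (boot_cpuid % nthreads) + (nr_cpu_ids - nthreads);

	/* prints "nr_cpu_ids=8 boot_cpuid=1": thread 1 of the last core slot */
	printf("nr_cpu_ids=%d boot_cpuid=%d\n", nr_cpu_ids, boot_cpuid);
	return 0;
}
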
+ */ + nr_cpus = _ALIGN_UP(nr_cpu_ids, MAX_SMT); + paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpus); paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); memset(paca, 0, paca_size); printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", - paca_size, nr_cpu_ids, paca); + paca_size, nr_cpus, paca); - allocate_lppacas(nr_cpu_ids, limit); + allocate_lppacas(nr_cpus, limit); - allocate_slb_shadows(nr_cpu_ids, limit); + allocate_slb_shadows(nr_cpus, limit); /* Can't use for_each_*_cpu, as they aren't functional yet */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (cpu = 0; cpu < nr_cpus; cpu++) initialise_paca(&paca[cpu], cpu); } --- linux-azure-4.13.0.orig/arch/powerpc/kernel/pci-common.c +++ linux-azure-4.13.0/arch/powerpc/kernel/pci-common.c @@ -331,6 +331,7 @@ } return NULL; } +EXPORT_SYMBOL(pci_find_hose_for_OF_device); /* * Reads the interrupt pin to determine if interrupt is use by card. @@ -1649,6 +1650,7 @@ { return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); } +EXPORT_SYMBOL_GPL(early_find_capability); struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) { --- linux-azure-4.13.0.orig/arch/powerpc/kernel/prom.c +++ linux-azure-4.13.0/arch/powerpc/kernel/prom.c @@ -302,6 +302,29 @@ } } +/* + * Adjust the logical id of a boot cpu to fall under nr_cpu_ids. Map it to + * last core slot in the allocated paca array. + * + * e.g. on SMT=8 system, kernel booted with nr_cpus=1 and boot cpu = 33, + * align nr_cpu_ids to MAX_SMT value 8. Allocate paca array to hold up to + * MAX_SMT=8 cpus. Since boot cpu 33 is greater than nr_cpus (8), adjust + * its logical id so that new id becomes less than nr_cpu_ids. Make sure + * that boot cpu's new logical id is aligned to its thread id and falls + * under last nthreads slots available in paca array. In this case the + * boot cpu 33 is adjusted to new boot cpu id 1. + * + */ +static inline void adjust_boot_cpuid(int nthreads, int phys_id) +{ + boot_hw_cpuid = phys_id; + if (boot_cpuid >= nr_cpu_ids) { + boot_cpuid = (boot_cpuid % nthreads) + (nr_cpu_ids - nthreads); + pr_info("Adjusted logical boot cpu id: logical %d physical %d\n", + boot_cpuid, phys_id); + } +} + static int __init early_init_dt_scan_cpus(unsigned long node, const char *uname, int depth, void *data) @@ -325,6 +348,18 @@ nthreads = len / sizeof(int); +#ifdef CONFIG_SMP + /* + * Now that we know threads per core, let's align nr_cpu_ids to + * the correct SMT value. + */ + if (nr_cpu_ids % nthreads) { + nr_cpu_ids = _ALIGN_UP(nr_cpu_ids, nthreads); + pr_info("Aligned nr_cpus to SMT=%d, nr_cpu_ids = %d\n", + nthreads, nr_cpu_ids); + } +#endif + /* * Now see if any of these threads match our boot cpu. * NOTE: This must match the parsing done in smp_setup_cpu_maps. @@ -363,7 +398,9 @@ DBG("boot cpu: logical %d physical %d\n", found, be32_to_cpu(intserv[found_thread])); boot_cpuid = found; - set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread])); + adjust_boot_cpuid(nthreads, be32_to_cpu(intserv[found_thread])); + set_hard_smp_processor_id(boot_cpuid, + be32_to_cpu(intserv[found_thread])); /* * PAPR defines "logical" PVR values for cpus that --- linux-azure-4.13.0.orig/arch/powerpc/kernel/ptrace.c +++ linux-azure-4.13.0/arch/powerpc/kernel/ptrace.c @@ -131,7 +131,7 @@ * in the appropriate thread structures from live. 
*/ - if (tsk != current) + if ((!cpu_has_feature(CPU_FTR_TM)) || (tsk != current)) return; if (MSR_TM_SUSPENDED(mfmsr())) { --- linux-azure-4.13.0.orig/arch/powerpc/kernel/setup-common.c +++ linux-azure-4.13.0/arch/powerpc/kernel/setup-common.c @@ -85,6 +85,7 @@ EXPORT_SYMBOL(machine_id); int boot_cpuid = -1; +int boot_hw_cpuid = -1; EXPORT_SYMBOL_GPL(boot_cpuid); /* @@ -473,6 +474,7 @@ struct device_node *dn = NULL; int cpu = 0; int nthreads = 1; + bool boot_cpu_added = false; DBG("smp_setup_cpu_maps()\n"); @@ -499,6 +501,24 @@ } nthreads = len / sizeof(int); + /* + * If boot cpu hasn't been added to paca and there are only + * last nthreads slots available in paca array then wait + * for boot cpu to show up. + */ + if (!boot_cpu_added && (cpu + nthreads) >= nr_cpu_ids) { + int found = 0; + + DBG("Holding last nthreads paca slots for boot cpu\n"); + for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { + if (boot_hw_cpuid == be32_to_cpu(intserv[j])) { + found = 1; + break; + } + } + if (!found) + continue; + } for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { bool avail; @@ -514,6 +534,11 @@ set_cpu_present(cpu, avail); set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j])); set_cpu_possible(cpu, true); + if (boot_hw_cpuid == be32_to_cpu(intserv[j])) { + DBG("Boot cpu %d (hard id %d) added to paca\n", + cpu, be32_to_cpu(intserv[j])); + boot_cpu_added = true; + } cpu++; } } @@ -704,30 +729,6 @@ } EXPORT_SYMBOL(check_legacy_ioport); -static int ppc_panic_event(struct notifier_block *this, - unsigned long event, void *ptr) -{ - /* - * If firmware-assisted dump has been registered then trigger - * firmware-assisted dump and let firmware handle everything else. - */ - crash_fadump(NULL, ptr); - ppc_md.panic(ptr); /* May not return */ - return NOTIFY_DONE; -} - -static struct notifier_block ppc_panic_block = { - .notifier_call = ppc_panic_event, - .priority = INT_MIN /* may not return; must be done last */ -}; - -void __init setup_panic(void) -{ - if (!ppc_md.panic) - return; - atomic_notifier_chain_register(&panic_notifier_list, &ppc_panic_block); -} - #ifdef CONFIG_CHECK_CACHE_COHERENCY /* * For platforms that have configurable cache-coherency. This function @@ -872,9 +873,6 @@ /* Probe the machine type, establish ppc_md. */ probe_machine(); - /* Setup panic notifier if requested by the platform. */ - setup_panic(); - /* * Configure ppc_md.power_save (ppc32 only, 64-bit machines do * it from their respective probe() function. @@ -916,13 +914,6 @@ /* Reserve large chunks of memory for use by CMA for KVM. */ kvm_cma_reserve(); - /* - * Reserve any gigantic pages requested on the command line. - * memblock needs to have been initialized by the time this is - * called since this will reserve memory. 
- */ - reserve_hugetlb_gpages(); - klp_init_thread_info(&init_thread_info); init_mm.start_code = (unsigned long)_stext; --- linux-azure-4.13.0.orig/arch/powerpc/kernel/setup.h +++ linux-azure-4.13.0/arch/powerpc/kernel/setup.h @@ -55,4 +55,6 @@ static inline void kvm_cma_reserve(void) { }; #endif +extern bool rfi_flush; + #endif /* __ARCH_POWERPC_KERNEL_SETUP_H */ --- linux-azure-4.13.0.orig/arch/powerpc/kernel/setup_64.c +++ linux-azure-4.13.0/arch/powerpc/kernel/setup_64.c @@ -751,3 +751,100 @@ struct ppc_pci_io ppc_pci_io; EXPORT_SYMBOL(ppc_pci_io); #endif + +#ifdef CONFIG_PPC_BOOK3S_64 +static enum l1d_flush_type enabled_flush_types; +static void *l1d_flush_fallback_area; +static bool no_rfi_flush; +bool rfi_flush; + +static int __init handle_no_rfi_flush(char *p) +{ + pr_info("rfi-flush: disabled on command line."); + no_rfi_flush = true; + return 0; +} +early_param("no_rfi_flush", handle_no_rfi_flush); + +/* + * The RFI flush is not KPTI, but because users will see doco that says to use + * nopti we hijack that option here to also disable the RFI flush. + */ +static int __init handle_no_pti(char *p) +{ + pr_info("rfi-flush: disabling due to 'nopti' on command line.\n"); + handle_no_rfi_flush(NULL); + return 0; +} +early_param("nopti", handle_no_pti); + +static void do_nothing(void *unused) +{ + /* + * We don't need to do the flush explicitly, just enter+exit kernel is + * sufficient, the RFI exit handlers will do the right thing. + */ +} + +void rfi_flush_enable(bool enable) +{ + if (rfi_flush == enable) + return; + + if (enable) { + do_rfi_flush_fixups(enabled_flush_types); + on_each_cpu(do_nothing, NULL, 1); + } else + do_rfi_flush_fixups(L1D_FLUSH_NONE); + + rfi_flush = enable; +} + +void __init setup_rfi_flush(enum l1d_flush_type types, bool enable) +{ + if (types & L1D_FLUSH_FALLBACK) { + int cpu; + u64 l1d_size = ppc64_caches.l1d.size; + u64 limit = min(safe_stack_limit(), ppc64_rma_size); + + pr_info("rfi-flush: Using fallback displacement flush\n"); + + /* + * Align to L1d size, and size it at 2x L1d size, to + * catch possible hardware prefetch runoff. We don't + * have a recipe for load patterns to reliably avoid + * the prefetcher. + */ + l1d_flush_fallback_area = + __va(memblock_alloc_base(l1d_size * 2, l1d_size, limit)); + memset(l1d_flush_fallback_area, 0, l1d_size * 2); + + for_each_possible_cpu(cpu) { + /* + * The fallback flush is currently coded for 8-way + * associativity. Different associativity is possible, + * but it will be treated as 8-way and may not evict + * the lines as effectively. + * + * 128 byte lines are mandatory. + */ + u64 c = l1d_size / 8; + + paca[cpu].rfi_flush_fallback_area = l1d_flush_fallback_area; + paca[cpu].l1d_flush_congruence = c; + paca[cpu].l1d_flush_sets = c / 128; + } + } + + if (types & L1D_FLUSH_ORI) + pr_info("rfi-flush: Using ori type flush\n"); + + if (types & L1D_FLUSH_MTTRIG) + pr_info("rfi-flush: Using mttrig type flush\n"); + + enabled_flush_types = types; + + if (!no_rfi_flush) + rfi_flush_enable(enable); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ --- linux-azure-4.13.0.orig/arch/powerpc/kernel/signal_64.c +++ linux-azure-4.13.0/arch/powerpc/kernel/signal_64.c @@ -452,9 +452,20 @@ if (MSR_TM_RESV(msr)) return -EINVAL; - /* pull in MSR TM from user context */ + /* pull in MSR TS bits from user context */ regs->msr = (regs->msr & ~MSR_TS_MASK) | (msr & MSR_TS_MASK); + /* + * Ensure that TM is enabled in regs->msr before we leave the signal + * handler. 
It could be the case that (a) user disabled the TM bit + * through the manipulation of the MSR bits in uc_mcontext or (b) the + * TM bit was disabled because a sufficient number of context switches + * happened whilst in the signal handler and load_tm overflowed, + * disabling the TM bit. In either case we can end up with an illegal + * TM state leading to a TM Bad Thing when we return to userspace. + */ + regs->msr |= MSR_TM; + /* pull in MSR LE from user context */ regs->msr = (regs->msr & ~MSR_LE) | (msr & MSR_LE); --- linux-azure-4.13.0.orig/arch/powerpc/kernel/sysfs.c +++ linux-azure-4.13.0/arch/powerpc/kernel/sysfs.c @@ -18,8 +18,10 @@ #include #include #include +#include #include "cacheinfo.h" +#include "setup.h" #ifdef CONFIG_PPC64 #include @@ -496,6 +498,43 @@ static DEVICE_ATTR(purr, 0400, show_purr, store_purr); static DEVICE_ATTR(pir, 0400, show_pir, NULL); +#ifdef CONFIG_PPC_BOOK3S_64 +static ssize_t show_rfi_flush(struct device *dev, + struct device_attribute *attr, char *buf) +{ + return sprintf(buf, "%d\n", rfi_flush ? 1 : 0); +} + +static ssize_t __used store_rfi_flush(struct device *dev, + struct device_attribute *attr, const char *buf, + size_t count) +{ + int val; + int ret = 0; + + ret = sscanf(buf, "%d", &val); + if (ret != 1) + return -EINVAL; + + if (val == 1) + rfi_flush_enable(true); + else if (val == 0) + rfi_flush_enable(false); + else + return -EINVAL; + + return count; +} + +static DEVICE_ATTR(rfi_flush, 0600, + show_rfi_flush, store_rfi_flush); + +static void sysfs_create_rfi_flush(void) +{ + device_create_file(cpu_subsys.dev_root, &dev_attr_rfi_flush); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + /* * This is the system wide DSCR register default value. Any * change to this default value through the sysfs interface @@ -1036,6 +1075,9 @@ WARN_ON(r < 0); #ifdef CONFIG_PPC64 sysfs_create_dscr_default(); +#ifdef CONFIG_PPC_BOOK3S + sysfs_create_rfi_flush(); +#endif #endif /* CONFIG_PPC64 */ return 0; --- linux-azure-4.13.0.orig/arch/powerpc/kernel/traps.c +++ linux-azure-4.13.0/arch/powerpc/kernel/traps.c @@ -37,6 +37,7 @@ #include #include #include +#include #include #include @@ -312,39 +313,6 @@ /* What should we do here? We could issue a shutdown or hard reset. */ } -#ifdef CONFIG_PPC64 -/* - * This function is called in real mode. Strictly no printk's please. - * - * regs->nip and regs->msr contains srr0 and ssr1. - */ -long machine_check_early(struct pt_regs *regs) -{ - long handled = 0; - - __this_cpu_inc(irq_stat.mce_exceptions); - - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) - handled = cur_cpu_spec->machine_check_early(regs); - return handled; -} - -long hmi_exception_realmode(struct pt_regs *regs) -{ - __this_cpu_inc(irq_stat.hmi_exceptions); - - wait_for_subcore_guest_exit(); - - if (ppc_md.hmi_exception_early) - ppc_md.hmi_exception_early(regs); - - wait_for_tb_resync(); - - return 0; -} - -#endif - /* * I/O accesses can cause machine checks on powermacs. 
* Check if the NIP corresponds to the address of a sync @@ -794,6 +762,187 @@ die("System Management Interrupt", regs, SIGABRT); } +#ifdef CONFIG_VSX +static void p9_hmi_special_emu(struct pt_regs *regs) +{ + unsigned int ra, rb, t, i, sel, instr, rc; + const void __user *addr; + u8 vbuf[16], *vdst; + unsigned long ea, msr, msr_mask; + bool swap; + + if (__get_user_inatomic(instr, (unsigned int __user *)regs->nip)) + return; + + /* + * lxvb16x opcode: 0x7c0006d8 + * lxvd2x opcode: 0x7c000698 + * lxvh8x opcode: 0x7c000658 + * lxvw4x opcode: 0x7c000618 + */ + if ((instr & 0xfc00073e) != 0x7c000618) { + pr_devel("HMI vec emu: not vector CI %i:%s[%d] nip=%016lx" + " instr=%08x\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr); + return; + } + + /* Grab vector registers into the task struct */ + msr = regs->msr; /* Grab msr before we flush the bits */ + flush_vsx_to_thread(current); + enable_kernel_altivec(); + + /* + * Is userspace running with a different endian (this is rare but + * not impossible) + */ + swap = (msr & MSR_LE) != (MSR_KERNEL & MSR_LE); + + /* Decode the instruction */ + ra = (instr >> 16) & 0x1f; + rb = (instr >> 11) & 0x1f; + t = (instr >> 21) & 0x1f; + if (instr & 1) + vdst = (u8 *)¤t->thread.vr_state.vr[t]; + else + vdst = (u8 *)¤t->thread.fp_state.fpr[t][0]; + + /* Grab the vector address */ + ea = regs->gpr[rb] + (ra ? regs->gpr[ra] : 0); + if (is_32bit_task()) + ea &= 0xfffffffful; + addr = (__force const void __user *)ea; + + /* Check it */ + if (!access_ok(VERIFY_READ, addr, 16)) { + pr_devel("HMI vec emu: bad access %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + /* Read the vector */ + rc = 0; + if ((unsigned long)addr & 0xfUL) + /* unaligned case */ + rc = __copy_from_user_inatomic(vbuf, addr, 16); + else + __get_user_atomic_128_aligned(vbuf, addr, rc); + if (rc) { + pr_devel("HMI vec emu: page fault %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, (unsigned long)addr); + return; + } + + pr_devel("HMI vec emu: emulated vector CI %i:%s[%d] nip=%016lx" + " instr=%08x addr=%016lx\n", + smp_processor_id(), current->comm, current->pid, regs->nip, + instr, (unsigned long) addr); + + /* Grab instruction "selector" */ + sel = (instr >> 6) & 3; + + /* + * Check to make sure the facility is actually enabled. This + * could happen if we get a false positive hit. + * + * lxvd2x/lxvw4x always check MSR VSX sel = 0,2 + * lxvh8x/lxvb16x check MSR VSX or VEC depending on VSR used sel = 1,3 + */ + msr_mask = MSR_VSX; + if ((sel & 1) && (instr & 1)) /* lxvh8x & lxvb16x + VSR >= 32 */ + msr_mask = MSR_VEC; + if (!(msr & msr_mask)) { + pr_devel("HMI vec emu: MSR fac clear %i:%s[%d] nip=%016lx" + " instr=%08x msr:%016lx\n", + smp_processor_id(), current->comm, current->pid, + regs->nip, instr, msr); + return; + } + + /* Do logging here before we modify sel based on endian */ + switch (sel) { + case 0: /* lxvw4x */ + PPC_WARN_EMULATED(lxvw4x, regs); + break; + case 1: /* lxvh8x */ + PPC_WARN_EMULATED(lxvh8x, regs); + break; + case 2: /* lxvd2x */ + PPC_WARN_EMULATED(lxvd2x, regs); + break; + case 3: /* lxvb16x */ + PPC_WARN_EMULATED(lxvb16x, regs); + break; + } + +#ifdef __LITTLE_ENDIAN__ + /* + * An LE kernel stores the vector in the task struct as an LE + * byte array (effectively swapping both the components and + * the content of the components). 
Those instructions expect + * the components to remain in ascending address order, so we + * swap them back. + * + * If we are running a BE user space, the expectation is that + * of a simple memcpy, so forcing the emulation to look like + * a lxvb16x should do the trick. + */ + if (swap) + sel = 3; + + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = ((u32 *)vbuf)[3-i]; + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = ((u16 *)vbuf)[7-i]; + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = ((u64 *)vbuf)[1-i]; + break; + case 3: /* lxvb16x */ + for (i = 0; i < 16; i++) + vdst[i] = vbuf[15-i]; + break; + } +#else /* __LITTLE_ENDIAN__ */ + /* On a big endian kernel, a BE userspace only needs a memcpy */ + if (!swap) + sel = 3; + + /* Otherwise, we need to swap the content of the components */ + switch (sel) { + case 0: /* lxvw4x */ + for (i = 0; i < 4; i++) + ((u32 *)vdst)[i] = cpu_to_le32(((u32 *)vbuf)[i]); + break; + case 1: /* lxvh8x */ + for (i = 0; i < 8; i++) + ((u16 *)vdst)[i] = cpu_to_le16(((u16 *)vbuf)[i]); + break; + case 2: /* lxvd2x */ + for (i = 0; i < 2; i++) + ((u64 *)vdst)[i] = cpu_to_le64(((u64 *)vbuf)[i]); + break; + case 3: /* lxvb16x */ + memcpy(vdst, vbuf, 16); + break; + } +#endif /* !__LITTLE_ENDIAN__ */ + + /* Go to next instruction */ + regs->nip += 4; +} +#endif /* CONFIG_VSX */ + void handle_hmi_exception(struct pt_regs *regs) { struct pt_regs *old_regs; @@ -801,6 +950,21 @@ old_regs = set_irq_regs(regs); irq_enter(); +#ifdef CONFIG_VSX + /* Real mode flagged P9 special emu is needed */ + if (local_paca->hmi_p9_special_emu) { + local_paca->hmi_p9_special_emu = 0; + + /* + * We don't want to take page faults while doing the + * emulation, we just replay the instruction if necessary. + */ + pagefault_disable(); + p9_hmi_special_emu(regs); + pagefault_enable(); + } +#endif /* CONFIG_VSX */ + if (ppc_md.handle_hmi_exception) ppc_md.handle_hmi_exception(regs); @@ -2037,6 +2201,10 @@ WARN_EMULATED_SETUP(mfdscr), WARN_EMULATED_SETUP(mtdscr), WARN_EMULATED_SETUP(lq_stq), + WARN_EMULATED_SETUP(lxvw4x), + WARN_EMULATED_SETUP(lxvh8x), + WARN_EMULATED_SETUP(lxvd2x), + WARN_EMULATED_SETUP(lxvb16x), #endif }; --- linux-azure-4.13.0.orig/arch/powerpc/kernel/vmlinux.lds.S +++ linux-azure-4.13.0/arch/powerpc/kernel/vmlinux.lds.S @@ -131,6 +131,15 @@ /* Read-only data */ RO_DATA(PAGE_SIZE) +#ifdef CONFIG_PPC64 + . = ALIGN(8); + __rfi_flush_fixup : AT(ADDR(__rfi_flush_fixup) - LOAD_OFFSET) { + __start___rfi_flush_fixup = .; + *(__rfi_flush_fixup) + __stop___rfi_flush_fixup = .; + } +#endif + EXCEPTION_TABLE(0) NOTES :kernel :notes --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_64_mmu_hv.c +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_64_mmu_hv.c @@ -645,6 +645,16 @@ hnow_v = hpte_new_to_old_v(hnow_v, hnow_r); hnow_r = hpte_new_to_old_r(hnow_r); } + + /* + * If the HPT is being resized, don't update the HPTE, + * instead let the guest retry after the resize operation is complete. + * The synchronization for hpte_setup_done test vs. set is provided + * by the HPTE lock. 
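
The guard described above and implemented just below follows a common pattern: re-check the setup flag under the same lock the resize path holds, and back out so the guest simply retries. A minimal pthread model of that pattern (names are illustrative, not the KVM API):

#include <stdbool.h>
#include <stdio.h>
#include <pthread.h>

static pthread_mutex_t hpte_lock = PTHREAD_MUTEX_INITIALIZER;
static bool hpte_setup_done;	/* cleared while the HPT is being resized */

/* returns true if the HPTE was updated, false if the guest must retry */
static bool try_update_hpte(void)
{
	bool updated = false;

	pthread_mutex_lock(&hpte_lock);
	if (hpte_setup_done) {
		/* ... write the new HPTE here ... */
		updated = true;
	}
	pthread_mutex_unlock(&hpte_lock);
	return updated;
}

int main(void)
{
	printf("during resize: %s\n", try_update_hpte() ? "updated" : "guest retries");
	hpte_setup_done = true;
	printf("after resize:  %s\n", try_update_hpte() ? "updated" : "guest retries");
	return 0;
}
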
+ */ + if (!kvm->arch.hpte_setup_done) + goto out_unlock; + if ((hnow_v & ~HPTE_V_HVLOCK) != hpte[0] || hnow_r != hpte[1] || rev->guest_rpte != hpte[2]) /* HPTE has been changed under us; let the guest retry */ --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_64_vio.c +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_64_vio.c @@ -479,28 +479,30 @@ return ret; dir = iommu_tce_direction(tce); + + idx = srcu_read_lock(&vcpu->kvm->srcu); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, - tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) - return H_PARAMETER; + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) { + ret = H_PARAMETER; + goto unlock_exit; + } entry = ioba >> stt->page_shift; list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { - if (dir == DMA_NONE) { + if (dir == DMA_NONE) ret = kvmppc_tce_iommu_unmap(vcpu->kvm, stit->tbl, entry); - } else { - idx = srcu_read_lock(&vcpu->kvm->srcu); + else ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl, entry, ua, dir); - srcu_read_unlock(&vcpu->kvm->srcu, idx); - } if (ret == H_SUCCESS) continue; if (ret == H_TOO_HARD) - return ret; + goto unlock_exit; WARN_ON_ONCE(1); kvmppc_clear_tce(stit->tbl, entry); @@ -508,7 +510,10 @@ kvmppc_tce_put(stt, entry, tce); - return H_SUCCESS; +unlock_exit: + srcu_read_unlock(&vcpu->kvm->srcu, idx); + + return ret; } EXPORT_SYMBOL_GPL(kvmppc_h_put_tce); --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_hv.c +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_hv.c @@ -2688,11 +2688,14 @@ * Hard-disable interrupts, and check resched flag and signals. * If we need to reschedule or deliver a signal, clean up * and return without going into the guest(s). + * If the hpte_setup_done flag has been cleared, don't go into the + * guest because that means a HPT resize operation is in progress. */ local_irq_disable(); hard_irq_disable(); if (lazy_irq_pending() || need_resched() || - recheck_signals(&core_info)) { + recheck_signals(&core_info) || + (!kvm_is_radix(vc->kvm) && !vc->kvm->arch.hpte_setup_done)) { local_irq_enable(); vc->vcore_state = VCORE_INACTIVE; /* Unlock all except the primary vcore */ @@ -2812,7 +2815,7 @@ */ trace_hardirqs_on(); - guest_enter(); + guest_enter_irqoff(); srcu_idx = srcu_read_lock(&vc->kvm->srcu); @@ -2820,8 +2823,6 @@ srcu_read_unlock(&vc->kvm->srcu, srcu_idx); - guest_exit(); - trace_hardirqs_off(); set_irq_happened(trap); @@ -2855,6 +2856,7 @@ kvmppc_set_host_core(pcpu); local_irq_enable(); + guest_exit(); /* Let secondaries go back to the offline loop */ for (i = 0; i < controlled_threads; ++i) { @@ -3061,7 +3063,7 @@ static int kvmppc_run_vcpu(struct kvm_run *kvm_run, struct kvm_vcpu *vcpu) { - int n_ceded, i; + int n_ceded, i, r; struct kvmppc_vcore *vc; struct kvm_vcpu *v; @@ -3115,6 +3117,20 @@ while (vcpu->arch.state == KVMPPC_VCPU_RUNNABLE && !signal_pending(current)) { + /* See if the HPT and VRMA are ready to go */ + if (!kvm_is_radix(vcpu->kvm) && + !vcpu->kvm->arch.hpte_setup_done) { + spin_unlock(&vc->lock); + r = kvmppc_hv_setup_htab_rma(vcpu); + spin_lock(&vc->lock); + if (r) { + kvm_run->exit_reason = KVM_EXIT_FAIL_ENTRY; + kvm_run->fail_entry.hardware_entry_failure_reason = 0; + vcpu->arch.ret = r; + break; + } + } + if (vc->vcore_state == VCORE_PREEMPT && vc->runner == NULL) kvmppc_vcore_end_preempt(vc); @@ -3232,13 +3248,6 @@ /* Order vcpus_running vs. 
hpte_setup_done, see kvmppc_alloc_reset_hpt */ smp_mb(); - /* On the first time here, set up HTAB and VRMA */ - if (!kvm_is_radix(vcpu->kvm) && !vcpu->kvm->arch.hpte_setup_done) { - r = kvmppc_hv_setup_htab_rma(vcpu); - if (r) - goto out; - } - flush_all_to_thread(current); /* Save userspace EBB and other register values */ @@ -3286,7 +3295,6 @@ } mtspr(SPRN_VRSAVE, user_vrsave); - out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); return r; @@ -4187,11 +4195,13 @@ if ((cfg->process_table & PRTS_MASK) > 24) return -EINVAL; + mutex_lock(&kvm->lock); kvm->arch.process_table = cfg->process_table; kvmppc_setup_partition_table(kvm); lpcr = (cfg->flags & KVM_PPC_MMUV3_GTSE) ? LPCR_GTSE : 0; kvmppc_update_lpcr(kvm, lpcr, LPCR_GTSE); + mutex_unlock(&kvm->lock); return 0; } --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_hv_rm_xive.c +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_hv_rm_xive.c @@ -38,7 +38,6 @@ #define __x_tima get_tima_phys() #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_page)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_page)) -#define __x_readb __raw_rm_readb #define __x_writeb __raw_rm_writeb #define __x_readw __raw_rm_readw #define __x_readq __raw_rm_readq --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -78,7 +78,7 @@ mtmsrd r0,1 /* clear RI in MSR */ mtsrr0 r5 mtsrr1 r6 - RFI + RFI_TO_KERNEL kvmppc_call_hv_entry: ld r4, HSTATE_KVM_VCPU(r13) @@ -185,7 +185,7 @@ mtmsrd r6, 1 /* Clear RI in MSR */ mtsrr0 r8 mtsrr1 r7 - RFI + RFI_TO_KERNEL /* Virtual-mode return */ .Lvirt_return: @@ -765,6 +765,9 @@ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1115,7 +1118,7 @@ ld r0, VCPU_GPR(R0)(r4) ld r4, VCPU_GPR(R4)(r4) - hrfid + HRFI_TO_GUEST b . secondary_too_late: @@ -1293,6 +1296,7 @@ bne 3f BEGIN_FTR_SECTION PPC_MSGSYNC + lwsync END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 @@ -1623,6 +1627,9 @@ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -1742,7 +1749,10 @@ /* * Are we running hash or radix ? 
*/ - beq cr2,3f + ld r5, VCPU_KVM(r9) + lbz r0, KVM_RADIX(r5) + cmpwi cr2, r0, 0 + beq cr2, 3f /* Radix: Handle the case where the guest used an illegal PID */ LOAD_REG_ADDR(r4, mmu_base_pid) @@ -2459,6 +2469,9 @@ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ ld r9, HSTATE_KVM_VCPU(r13) bl kvmppc_save_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) @@ -2569,6 +2582,9 @@ #ifdef CONFIG_PPC_TRANSACTIONAL_MEM BEGIN_FTR_SECTION + /* + * NOTE THAT THIS TRASHES ALL NON-VOLATILE REGISTERS INCLUDING CR + */ bl kvmppc_restore_tm END_FTR_SECTION_IFSET(CPU_FTR_TM) #endif @@ -2752,6 +2768,10 @@ PPC_MSGCLR(6) /* see if it's a host IPI */ li r3, 1 +BEGIN_FTR_SECTION + PPC_MSGSYNC + lwsync +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) lbz r0, HSTATE_HOST_IPI(r13) cmpwi r0, 0 bnelr --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_rmhandlers.S +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_rmhandlers.S @@ -46,6 +46,9 @@ #define FUNC(name) name +#define RFI_TO_KERNEL RFI +#define RFI_TO_GUEST RFI + .macro INTERRUPT_TRAMPOLINE intno .global kvmppc_trampoline_\intno @@ -141,7 +144,7 @@ GET_SCRATCH0(r13) /* And get back into the code */ - RFI + RFI_TO_KERNEL #endif /* @@ -164,6 +167,6 @@ ori r5, r5, MSR_EE mtsrr0 r7 mtsrr1 r6 - RFI + RFI_TO_KERNEL #include "book3s_segment.S" --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_segment.S +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_segment.S @@ -156,7 +156,7 @@ PPC_LL r9, SVCPU_R9(r3) PPC_LL r3, (SVCPU_R3)(r3) - RFI + RFI_TO_GUEST kvmppc_handler_trampoline_enter_end: @@ -407,5 +407,5 @@ cmpwi r12, BOOK3S_INTERRUPT_DOORBELL beqa BOOK3S_INTERRUPT_DOORBELL - RFI + RFI_TO_KERNEL kvmppc_handler_trampoline_exit_end: --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_xive.c +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_xive.c @@ -48,7 +48,6 @@ #define __x_tima xive_tima #define __x_eoi_page(xd) ((void __iomem *)((xd)->eoi_mmio)) #define __x_trig_page(xd) ((void __iomem *)((xd)->trig_mmio)) -#define __x_readb __raw_readb #define __x_writeb __raw_writeb #define __x_readw __raw_readw #define __x_readq __raw_readq @@ -623,7 +622,7 @@ return -EINVAL; state = &sb->irq_state[idx]; arch_spin_lock(&sb->lock); - *server = state->guest_server; + *server = state->act_server; *priority = state->guest_priority; arch_spin_unlock(&sb->lock); @@ -1332,7 +1331,7 @@ xive->saved_src_count++; /* Convert saved state into something compatible with xics */ - val = state->guest_server; + val = state->act_server; prio = state->saved_scan_prio; if (prio == MASKED) { @@ -1508,7 +1507,6 @@ /* First convert prio and mark interrupt as untargetted */ act_prio = xive_prio_from_guest(guest_prio); state->act_priority = MASKED; - state->guest_server = server; /* * We need to drop the lock due to the mutex below. Hopefully --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_xive.h +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_xive.h @@ -35,7 +35,6 @@ struct xive_irq_data *pt_data; /* XIVE Pass-through associated data */ /* Targetting as set by guest */ - u32 guest_server; /* Current guest selected target */ u8 guest_priority; /* Guest set priority */ u8 saved_priority; /* Saved priority when masking */ --- linux-azure-4.13.0.orig/arch/powerpc/kvm/book3s_xive_template.c +++ linux-azure-4.13.0/arch/powerpc/kvm/book3s_xive_template.c @@ -28,7 +28,8 @@ * bit. 
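
The change just below replaces the byte-sized MMIO read of PIPR with a quadword read of TM_QW1_OS, taking the PIPR byte after a byte swap. A small host-side model of that extraction, assuming a little-endian host and a made-up register value:

#include <stdint.h>
#include <stdio.h>
#include <string.h>

int main(void)
{
	/* the OS quadword as it sits in big-endian MMIO space; the PIPR
	 * byte is the last one (the value 0x05 is made up) */
	uint8_t qw1_bytes[8] = { 0, 0, 0, 0, 0, 0, 0, 0x05 };
	uint64_t qw1;

	memcpy(&qw1, qw1_bytes, sizeof(qw1));		/* __x_readq(__x_tima + TM_QW1_OS) */
	uint8_t pipr = __builtin_bswap64(qw1) & 0xff;	/* be64_to_cpu(qw1) & 0xff */

	printf("pipr=%u\n", pipr);			/* prints pipr=5 */
	return 0;
}
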
*/ if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { - u8 pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr >= xc->hw_cppr) return; } @@ -336,7 +337,6 @@ struct kvmppc_xive_vcpu *xc = vcpu->arch.xive_vcpu; u8 pending = xc->pending; u32 hirq; - u8 pipr; pr_devel("H_IPOLL(server=%ld)\n", server); @@ -353,7 +353,8 @@ pending = 0xff; } else { /* Grab pending interrupt if any */ - pipr = __x_readb(__x_tima + TM_QW1_OS + TM_PIPR); + __be64 qw1 = __x_readq(__x_tima + TM_QW1_OS); + u8 pipr = be64_to_cpu(qw1) & 0xff; if (pipr < 8) pending |= 1 << pipr; } --- linux-azure-4.13.0.orig/arch/powerpc/kvm/powerpc.c +++ linux-azure-4.13.0/arch/powerpc/kvm/powerpc.c @@ -639,8 +639,7 @@ break; #endif case KVM_CAP_PPC_HTM: - r = cpu_has_feature(CPU_FTR_TM_COMP) && - is_kvmppc_hv_enabled(kvm); + r = cpu_has_feature(CPU_FTR_TM_COMP) && hv_enabled; break; default: r = 0; --- linux-azure-4.13.0.orig/arch/powerpc/lib/feature-fixups.c +++ linux-azure-4.13.0/arch/powerpc/lib/feature-fixups.c @@ -116,6 +116,47 @@ } } +#ifdef CONFIG_PPC_BOOK3S_64 +void do_rfi_flush_fixups(enum l1d_flush_type types) +{ + unsigned int instrs[3], *dest; + long *start, *end; + int i; + + start = PTRRELOC(&__start___rfi_flush_fixup), + end = PTRRELOC(&__stop___rfi_flush_fixup); + + instrs[0] = 0x60000000; /* nop */ + instrs[1] = 0x60000000; /* nop */ + instrs[2] = 0x60000000; /* nop */ + + if (types & L1D_FLUSH_FALLBACK) + /* b .+16 to fallback flush */ + instrs[0] = 0x48000010; + + i = 0; + if (types & L1D_FLUSH_ORI) { + instrs[i++] = 0x63ff0000; /* ori 31,31,0 speculation barrier */ + instrs[i++] = 0x63de0000; /* ori 30,30,0 L1d flush*/ + } + + if (types & L1D_FLUSH_MTTRIG) + instrs[i++] = 0x7c12dba6; /* mtspr TRIG2,r0 (SPR #882) */ + + for (i = 0; start < end; start++, i++) { + dest = (void *)start + *start; + + pr_devel("patching dest %lx\n", (unsigned long)dest); + + patch_instruction(dest, instrs[0]); + patch_instruction(dest + 1, instrs[1]); + patch_instruction(dest + 2, instrs[2]); + } + + printk(KERN_DEBUG "rfi-flush: patched %d locations\n", i); +} +#endif /* CONFIG_PPC_BOOK3S_64 */ + void do_lwsync_fixups(unsigned long value, void *fixup_start, void *fixup_end) { long *start, *end; --- linux-azure-4.13.0.orig/arch/powerpc/lib/sstep.c +++ linux-azure-4.13.0/arch/powerpc/lib/sstep.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include --- linux-azure-4.13.0.orig/arch/powerpc/mm/hash_utils_64.c +++ linux-azure-4.13.0/arch/powerpc/mm/hash_utils_64.c @@ -509,7 +509,7 @@ phys_addr, block_size, expected_pages); if (phys_addr + (16 * GB) <= memblock_end_of_DRAM()) { memblock_reserve(phys_addr, block_size * expected_pages); - add_gpage(phys_addr, block_size, expected_pages); + pseries_add_gpage(phys_addr, block_size, expected_pages); } return 0; } @@ -1019,6 +1019,7 @@ __kernel_virt_size = H_KERN_VIRT_SIZE; __vmalloc_start = H_VMALLOC_START; __vmalloc_end = H_VMALLOC_END; + __kernel_io_start = H_KERN_IO_START; vmemmap = (struct page *)H_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; --- linux-azure-4.13.0.orig/arch/powerpc/mm/hugetlbpage.c +++ linux-azure-4.13.0/arch/powerpc/mm/hugetlbpage.c @@ -36,26 +36,6 @@ unsigned int HPAGE_SHIFT; EXPORT_SYMBOL(HPAGE_SHIFT); -/* - * Tracks gpages after the device tree is scanned and before the - * huge_boot_pages list is ready. On non-Freescale implementations, this is - * just used to track 16G pages and so is a single array. 
FSL-based - * implementations may have more than one gpage size, so we need multiple - * arrays - */ -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -#define MAX_NUMBER_GPAGES 128 -struct psize_gpages { - u64 gpage_list[MAX_NUMBER_GPAGES]; - unsigned int nr_gpages; -}; -static struct psize_gpages gpage_freearray[MMU_PAGE_COUNT]; -#else -#define MAX_NUMBER_GPAGES 1024 -static u64 gpage_freearray[MAX_NUMBER_GPAGES]; -static unsigned nr_gpages; -#endif - #define hugepd_none(hpd) (hpd_val(hpd) == 0) pte_t *huge_pte_offset(struct mm_struct *mm, unsigned long addr, unsigned long sz) @@ -210,145 +190,20 @@ return hugepte_offset(*hpdp, addr, pdshift); } -#if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) -/* Build list of addresses of gigantic pages. This function is used in early - * boot before the buddy allocator is setup. - */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) -{ - unsigned int idx = shift_to_mmu_psize(__ffs(page_size)); - int i; - - if (addr == 0) - return; - - gpage_freearray[idx].nr_gpages = number_of_pages; - - for (i = 0; i < number_of_pages; i++) { - gpage_freearray[idx].gpage_list[i] = addr; - addr += page_size; - } -} - -/* - * Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) -{ - struct huge_bootmem_page *m; - int idx = shift_to_mmu_psize(huge_page_shift(hstate)); - int nr_gpages = gpage_freearray[idx].nr_gpages; - - if (nr_gpages == 0) - return 0; - -#ifdef CONFIG_HIGHMEM - /* - * If gpages can be in highmem we can't use the trick of storing the - * data structure in the page; allocate space for this - */ - m = memblock_virt_alloc(sizeof(struct huge_bootmem_page), 0); - m->phys = gpage_freearray[idx].gpage_list[--nr_gpages]; -#else - m = phys_to_virt(gpage_freearray[idx].gpage_list[--nr_gpages]); -#endif - - list_add(&m->list, &huge_boot_pages); - gpage_freearray[idx].nr_gpages = nr_gpages; - gpage_freearray[idx].gpage_list[nr_gpages] = 0; - m->hstate = hstate; - - return 1; -} +#ifdef CONFIG_PPC_BOOK3S_64 /* - * Scan the command line hugepagesz= options for gigantic pages; store those in - * a list that we use to allocate the memory once all options are parsed. + * Tracks gpages after the device tree is scanned and before the + * huge_boot_pages list is ready on pseries. */ - -unsigned long gpage_npages[MMU_PAGE_COUNT]; - -static int __init do_gpage_early_setup(char *param, char *val, - const char *unused, void *arg) -{ - static phys_addr_t size; - unsigned long npages; - - /* - * The hugepagesz and hugepages cmdline options are interleaved. We - * use the size variable to keep track of whether or not this was done - * properly and skip over instances where it is incorrect. Other - * command-line parsing code will issue warnings, so we don't need to. 
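This hunk removes the FSL-side parser, but the pairing rule it implemented is worth seeing concretely: each hugepages= count binds to the hugepagesz= that precedes it, and a count with no fresh size is skipped. A small host-side sketch of that rule, with a made-up command line:

/* cc -o gpages gpages.c && ./gpages */
#include <stdio.h>

int main(void)
{
	/* Options arrive interleaved, size first, count second. */
	const char *argv[] = { "hugepagesz=16G", "hugepages=2",
			       "hugepagesz=1G",  "hugepages=4" };
	unsigned long long size = 0;

	for (size_t i = 0; i < sizeof(argv) / sizeof(argv[0]); i++) {
		unsigned long long npages;
		char unit;

		if (sscanf(argv[i], "hugepagesz=%llu%c", &size, &unit) == 2) {
			size <<= (unit == 'G') ? 30 : (unit == 'M') ? 20 : 10;
		} else if (sscanf(argv[i], "hugepages=%llu", &npages) == 1) {
			if (size)	/* only honour a count that follows a size */
				printf("%llu pages of %llu MB\n",
				       npages, size >> 20);
			size = 0;	/* a count without a fresh size is skipped */
		}
	}
	return 0;
}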
- * - */ - if ((strcmp(param, "default_hugepagesz") == 0) || - (strcmp(param, "hugepagesz") == 0)) { - size = memparse(val, NULL); - } else if (strcmp(param, "hugepages") == 0) { - if (size != 0) { - if (sscanf(val, "%lu", &npages) <= 0) - npages = 0; - if (npages > MAX_NUMBER_GPAGES) { - pr_warn("MMU: %lu pages requested for page " -#ifdef CONFIG_PHYS_ADDR_T_64BIT - "size %llu KB, limiting to " -#else - "size %u KB, limiting to " -#endif - __stringify(MAX_NUMBER_GPAGES) "\n", - npages, size / 1024); - npages = MAX_NUMBER_GPAGES; - } - gpage_npages[shift_to_mmu_psize(__ffs(size))] = npages; - size = 0; - } - } - return 0; -} - +#define MAX_NUMBER_GPAGES 1024 +__initdata static u64 gpage_freearray[MAX_NUMBER_GPAGES]; +__initdata static unsigned nr_gpages; /* - * This function allocates physical space for pages that are larger than the - * buddy allocator can handle. We want to allocate these in highmem because - * the amount of lowmem is limited. This means that this function MUST be - * called before lowmem_end_addr is set up in MMU_init() in order for the lmb - * allocate to grab highmem. - */ -void __init reserve_hugetlb_gpages(void) -{ - static __initdata char cmdline[COMMAND_LINE_SIZE]; - phys_addr_t size, base; - int i; - - strlcpy(cmdline, boot_command_line, COMMAND_LINE_SIZE); - parse_args("hugetlb gpages", cmdline, NULL, 0, 0, 0, - NULL, &do_gpage_early_setup); - - /* - * Walk gpage list in reverse, allocating larger page sizes first. - * Skip over unsupported sizes, or sizes that have 0 gpages allocated. - * When we reach the point in the list where pages are no longer - * considered gpages, we're done. - */ - for (i = MMU_PAGE_COUNT-1; i >= 0; i--) { - if (mmu_psize_defs[i].shift == 0 || gpage_npages[i] == 0) - continue; - else if (mmu_psize_to_shift(i) < (MAX_ORDER + PAGE_SHIFT)) - break; - - size = (phys_addr_t)(1ULL << mmu_psize_to_shift(i)); - base = memblock_alloc_base(size * gpage_npages[i], size, - MEMBLOCK_ALLOC_ANYWHERE); - add_gpage(base, size, gpage_npages[i]); - } -} - -#else /* !PPC_FSL_BOOK3E */ - -/* Build list of addresses of gigantic pages. This function is used in early + * Build list of addresses of gigantic pages. This function is used in early * boot before the buddy allocator is setup. */ -void add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) +void __init pseries_add_gpage(u64 addr, u64 page_size, unsigned long number_of_pages) { if (!addr) return; @@ -360,10 +215,7 @@ } } -/* Moves the gigantic page addresses from the temporary list to the - * huge_boot_pages list. - */ -int alloc_bootmem_huge_page(struct hstate *hstate) +int __init pseries_alloc_bootmem_huge_page(struct hstate *hstate) { struct huge_bootmem_page *m; if (nr_gpages == 0) @@ -376,6 +228,17 @@ } #endif + +int __init alloc_bootmem_huge_page(struct hstate *h) +{ + +#ifdef CONFIG_PPC_BOOK3S_64 + if (firmware_has_feature(FW_FEATURE_LPAR) && !radix_enabled()) + return pseries_alloc_bootmem_huge_page(h); +#endif + return __alloc_bootmem_huge_page(h); +} + #if defined(CONFIG_PPC_FSL_BOOK3E) || defined(CONFIG_PPC_8xx) #define HUGEPD_FREELIST_SIZE \ ((PAGE_SIZE - sizeof(struct hugepd_freelist)) / sizeof(pte_t)) --- linux-azure-4.13.0.orig/arch/powerpc/mm/init_32.c +++ linux-azure-4.13.0/arch/powerpc/mm/init_32.c @@ -132,8 +132,6 @@ * Reserve gigantic pages for hugetlb. This MUST occur before * lowmem_end_addr is initialized below. 
*/ - reserve_hugetlb_gpages(); - if (memblock.memory.cnt > 1) { #ifndef CONFIG_WII memblock_enforce_memory_limit(memblock.memory.regions[0].size); --- linux-azure-4.13.0.orig/arch/powerpc/mm/mem.c +++ linux-azure-4.13.0/arch/powerpc/mm/mem.c @@ -127,14 +127,18 @@ return -ENODEV; } -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + struct pglist_data *pgdata; + struct zone *zone; unsigned long start_pfn = start >> PAGE_SHIFT; unsigned long nr_pages = size >> PAGE_SHIFT; int rc; resize_hpt_for_hotplug(memblock_phys_mem_size()); + pgdata = NODE_DATA(nid); + start = (unsigned long)__va(start); rc = create_section_mapping(start, start + size); if (rc) { @@ -144,7 +148,11 @@ return -EFAULT; } - return __add_pages(nid, start_pfn, nr_pages, want_memblock); + /* this should work for most non-highmem platforms */ + zone = pgdata->node_zones + + zone_for_memory(nid, start, size, 0, for_device); + + return __add_pages(nid, zone, start_pfn, nr_pages, !for_device); } #ifdef CONFIG_MEMORY_HOTREMOVE --- linux-azure-4.13.0.orig/arch/powerpc/mm/pgtable-radix.c +++ linux-azure-4.13.0/arch/powerpc/mm/pgtable-radix.c @@ -526,6 +526,7 @@ __kernel_virt_size = RADIX_KERN_VIRT_SIZE; __vmalloc_start = RADIX_VMALLOC_START; __vmalloc_end = RADIX_VMALLOC_END; + __kernel_io_start = RADIX_KERN_IO_START; vmemmap = (struct page *)RADIX_VMEMMAP_BASE; ioremap_bot = IOREMAP_BASE; @@ -836,9 +837,12 @@ */ pmd = *pmdp; pmd_clear(pmdp); + /*FIXME!! Verify whether we need this kick below */ kick_all_cpus_sync(); - flush_tlb_range(vma, address, address + HPAGE_PMD_SIZE); + + radix__flush_tlb_collapsed_pmd(vma->vm_mm, address); + return pmd; } --- linux-azure-4.13.0.orig/arch/powerpc/mm/pgtable_64.c +++ linux-azure-4.13.0/arch/powerpc/mm/pgtable_64.c @@ -104,6 +104,8 @@ EXPORT_SYMBOL(__vmalloc_start); unsigned long __vmalloc_end; EXPORT_SYMBOL(__vmalloc_end); +unsigned long __kernel_io_start; +EXPORT_SYMBOL(__kernel_io_start); struct page *vmemmap; EXPORT_SYMBOL(vmemmap); unsigned long __pte_frag_nr; --- linux-azure-4.13.0.orig/arch/powerpc/mm/slb_low.S +++ linux-azure-4.13.0/arch/powerpc/mm/slb_low.S @@ -121,12 +121,25 @@ 1: #endif /* CONFIG_SPARSEMEM_VMEMMAP */ - /* vmalloc mapping gets the encoding from the PACA as the mapping - * can be demoted from 64K -> 4K dynamically on some machines + /* + * r10 contains the ESID, which is the original faulting EA shifted + * right by 28 bits. We need to compare that with (H_VMALLOC_END >> 28) + * which is 0xd00038000. That can't be used as an immediate, even if we + * ignored the 0xd, so we have to load it into a register, and we only + * have one register free. So we must load all of (H_VMALLOC_END >> 28) + * into a register and compare ESID against that. + */ + lis r11,(H_VMALLOC_END >> 32)@h // r11 = 0xffffffffd0000000 + ori r11,r11,(H_VMALLOC_END >> 32)@l // r11 = 0xffffffffd0003800 + // Rotate left 4, then mask with 0xffffffff0 + rldic r11,r11,4,28 // r11 = 0xd00038000 + cmpld r10,r11 // if r10 >= r11 + bge 5f // goto io_mapping + + /* + * vmalloc mapping gets the encoding from the PACA as the mapping + * can be demoted from 64K -> 4K dynamically on some machines. 
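The new slb_low.S comment above is dense enough to merit a cross-check. Below is a userspace model of the three instructions, with H_VMALLOC_END assumed to be 0xd000380000000000 so that H_VMALLOC_END >> 28 is the 0xd00038000 the comment quotes:

/* cc -o esid_check esid_check.c && ./esid_check */
#include <assert.h>
#include <stdint.h>
#include <stdio.h>

#define H_VMALLOC_END 0xd000380000000000ULL	/* assumed value */

static uint64_t rotl64(uint64_t v, unsigned int n)
{
	return (v << n) | (v >> (64 - n));
}

int main(void)
{
	/* lis r11,(H_VMALLOC_END >> 32)@h : loads a sign-extended 16-bit half */
	uint64_t r11 = (uint64_t)(int64_t)(int16_t)(H_VMALLOC_END >> 48) << 16;
	/* ori r11,r11,(H_VMALLOC_END >> 32)@l */
	r11 |= (H_VMALLOC_END >> 32) & 0xffff;
	assert(r11 == 0xffffffffd0003800ULL);

	/* rldic r11,r11,4,28 : rotate left 4, keep bits 28..59 (IBM numbering) */
	r11 = rotl64(r11, 4) & (~0ULL >> 28) & (~0ULL << 4);
	assert(r11 == (H_VMALLOC_END >> 28));	/* 0xd00038000 */

	printf("ESID bound = 0x%llx\n", (unsigned long long)r11);
	return 0;
}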
*/ - clrldi r11,r10,48 - cmpldi r11,(H_VMALLOC_SIZE >> 28) - 1 - bgt 5f lhz r11,PACAVMALLOCSLLP(r13) b 6f 5: --- linux-azure-4.13.0.orig/arch/powerpc/mm/tlb-radix.c +++ linux-azure-4.13.0/arch/powerpc/mm/tlb-radix.c @@ -54,23 +54,15 @@ */ __tlbiel_pid(pid, 0, ric); - if (ric == RIC_FLUSH_ALL) - /* For the remaining sets, just flush the TLB */ - ric = RIC_FLUSH_TLB; + /* For PWC, only one flush is needed */ + if (ric == RIC_FLUSH_PWC) { + asm volatile("ptesync": : :"memory"); + return; + } + /* For the remaining sets, just flush the TLB */ for (set = 1; set < POWER9_TLB_SETS_RADIX ; set++) - __tlbiel_pid(pid, set, ric); - - asm volatile("ptesync": : :"memory"); - asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); -} - -static inline void tlbiel_pwc(unsigned long pid) -{ - asm volatile("ptesync": : :"memory"); - - /* For PWC flush, we don't look at set number */ - __tlbiel_pid(pid, 0, RIC_FLUSH_PWC); + __tlbiel_pid(pid, set, RIC_FLUSH_TLB); asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); @@ -146,31 +138,23 @@ preempt_disable(); pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); preempt_enable(); } EXPORT_SYMBOL(radix__local_flush_tlb_mm); -void radix__local_flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +#ifndef CONFIG_SMP +static void radix__local_flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. - */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (pid != MMU_NO_CONTEXT) - tlbiel_pwc(pid); - + _tlbiel_pid(pid, RIC_FLUSH_ALL); preempt_enable(); } -EXPORT_SYMBOL(radix__local_flush_tlb_pwc); +#endif /* CONFIG_SMP */ void radix__local_flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, int psize) @@ -208,38 +192,35 @@ goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_ALL); + _tlbie_pid(pid, RIC_FLUSH_TLB); else - _tlbiel_pid(pid, RIC_FLUSH_ALL); + _tlbiel_pid(pid, RIC_FLUSH_TLB); no_context: preempt_enable(); } EXPORT_SYMBOL(radix__flush_tlb_mm); -void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +static void radix__flush_all_mm(struct mm_struct *mm) { unsigned long pid; - struct mm_struct *mm = tlb->mm; - /* - * If we are doing a full mm flush, we will do a tlb flush - * with RIC_FLUSH_ALL later. 
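For readers following the _tlbiel_pid() rework above: the RIC field selects what a single tlbiel invalidates, and only the TLB proper is spread across the 128 congruence-class sets; page-walk-cache state is not per-set, so a PWC flush completes with the set-0 operation alone. A host-side model of the new loop, with the RIC encodings assumed:

/* cc -o ric ric.c && ./ric */
#include <stdio.h>

enum { RIC_FLUSH_TLB = 0, RIC_FLUSH_PWC = 1, RIC_FLUSH_ALL = 2 };
#define POWER9_TLB_SETS_RADIX 128

static void tlbiel(int pid, int set, int ric)
{
	printf("tlbiel pid=%d set=%3d ric=%d\n", pid, set, ric);
}

/* Mirrors the reworked _tlbiel_pid(): set 0 carries the caller's RIC
 * (and so any PWC invalidation); a pure PWC request stops there, while
 * the remaining sets are only ever swept with RIC_FLUSH_TLB. */
static void _tlbiel_pid(int pid, int ric)
{
	tlbiel(pid, 0, ric);
	if (ric == RIC_FLUSH_PWC)
		return;
	for (int set = 1; set < POWER9_TLB_SETS_RADIX; set++)
		tlbiel(pid, set, RIC_FLUSH_TLB);
}

int main(void)
{
	_tlbiel_pid(1, RIC_FLUSH_PWC);	/* one operation */
	_tlbiel_pid(1, RIC_FLUSH_ALL);	/* set 0 does PWC+TLB, then 127 TLB-only */
	return 0;
}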
- */ - if (tlb->fullmm) - return; preempt_disable(); - pid = mm->context.id; if (unlikely(pid == MMU_NO_CONTEXT)) goto no_context; if (!mm_is_thread_local(mm)) - _tlbie_pid(pid, RIC_FLUSH_PWC); + _tlbie_pid(pid, RIC_FLUSH_ALL); else - tlbiel_pwc(pid); + _tlbiel_pid(pid, RIC_FLUSH_ALL); no_context: preempt_enable(); } + +void radix__flush_tlb_pwc(struct mmu_gather *tlb, unsigned long addr) +{ + tlb->need_flush_all = 1; +} EXPORT_SYMBOL(radix__flush_tlb_pwc); void radix__flush_tlb_page_psize(struct mm_struct *mm, unsigned long vmaddr, @@ -271,6 +252,8 @@ } EXPORT_SYMBOL(radix__flush_tlb_page); +#else /* CONFIG_SMP */ +#define radix__flush_all_mm radix__local_flush_all_mm #endif /* CONFIG_SMP */ void radix__flush_tlb_kernel_range(unsigned long start, unsigned long end) @@ -288,6 +271,7 @@ { struct mm_struct *mm = vma->vm_mm; + radix__flush_tlb_mm(mm); } EXPORT_SYMBOL(radix__flush_tlb_range); @@ -319,7 +303,10 @@ */ if (psize != -1 && !tlb->fullmm && !tlb->need_flush_all) radix__flush_tlb_range_psize(mm, tlb->start, tlb->end, psize); - else + else if (tlb->need_flush_all) { + tlb->need_flush_all = 0; + radix__flush_all_mm(mm); + } else radix__flush_tlb_mm(mm); } @@ -364,6 +351,43 @@ preempt_enable(); } +#ifdef CONFIG_TRANSPARENT_HUGEPAGE +void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr) +{ + int local = mm_is_thread_local(mm); + unsigned long ap = mmu_get_ap(mmu_virtual_psize); + unsigned long pid, end; + + + pid = mm ? mm->context.id : 0; + if (unlikely(pid == MMU_NO_CONTEXT)) + goto no_context; + + /* 4k page size, just blow the world */ + if (PAGE_SIZE == 0x1000) { + radix__flush_all_mm(mm); + return; + } + + /* Otherwise first do the PWC */ + if (local) + _tlbiel_pid(pid, RIC_FLUSH_PWC); + else + _tlbie_pid(pid, RIC_FLUSH_PWC); + + /* Then iterate the pages */ + end = addr + HPAGE_PMD_SIZE; + for (; addr < end; addr += PAGE_SIZE) { + if (local) + _tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB); + else + _tlbie_va(addr, pid, ap, RIC_FLUSH_TLB); + } +no_context: + preempt_enable(); +} +#endif /* CONFIG_TRANSPARENT_HUGEPAGE */ + void radix__flush_tlb_lpid_va(unsigned long lpid, unsigned long gpa, unsigned long page_size) { --- linux-azure-4.13.0.orig/arch/powerpc/perf/Makefile +++ linux-azure-4.13.0/arch/powerpc/perf/Makefile @@ -8,6 +8,7 @@ isa207-common.o power8-pmu.o power9-pmu.o obj32-$(CONFIG_PPC_PERF_CTRS) += mpc7450-pmu.o +obj-$(CONFIG_PPC_POWERNV) += imc-pmu.o obj-$(CONFIG_FSL_EMB_PERF_EVENT) += core-fsl-emb.o obj-$(CONFIG_FSL_EMB_PERF_EVENT_E500) += e500-pmu.o e6500-pmu.o --- linux-azure-4.13.0.orig/arch/powerpc/perf/core-book3s.c +++ linux-azure-4.13.0/arch/powerpc/perf/core-book3s.c @@ -1410,7 +1410,7 @@ int n = 0; struct perf_event *event; - if (!is_software_event(group)) { + if (group->pmu->task_ctx_nr == perf_hw_context) { if (n >= max_count) return -1; ctrs[n] = group; @@ -1418,7 +1418,7 @@ events[n++] = group->hw.config; } list_for_each_entry(event, &group->sibling_list, group_entry) { - if (!is_software_event(event) && + if (event->pmu->task_ctx_nr == perf_hw_context && event->state != PERF_EVENT_STATE_OFF) { if (n >= max_count) return -1; --- linux-azure-4.13.0.orig/arch/powerpc/perf/imc-pmu.c +++ linux-azure-4.13.0/arch/powerpc/perf/imc-pmu.c @@ -0,0 +1,1335 @@ +/* + * In-Memory Collection (IMC) Performance Monitor counter support. + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. 
+ * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. + */ +#include +#include +#include +#include +#include +#include +#include + +/* Nest IMC data structures and variables */ + +/* + * Used to avoid races in counting the nest-pmu units during hotplug + * register and unregister + */ +static DEFINE_MUTEX(nest_init_lock); +static DEFINE_PER_CPU(struct imc_pmu_ref *, local_nest_imc_refc); +static struct imc_pmu *per_nest_pmu_arr[IMC_MAX_PMUS]; +static cpumask_t nest_imc_cpumask; +struct imc_pmu_ref *nest_imc_refc; +static int nest_pmus; + +/* Core IMC data structures and variables */ + +static cpumask_t core_imc_cpumask; +struct imc_pmu_ref *core_imc_refc; +static struct imc_pmu *core_imc_pmu; + +/* Thread IMC data structures and variables */ + +static DEFINE_PER_CPU(u64 *, thread_imc_mem); +static struct imc_pmu *thread_imc_pmu; +static int thread_imc_mem_size; + +struct imc_pmu *imc_event_to_pmu(struct perf_event *event) +{ + return container_of(event->pmu, struct imc_pmu, pmu); +} + +PMU_FORMAT_ATTR(event, "config:0-40"); +PMU_FORMAT_ATTR(offset, "config:0-31"); +PMU_FORMAT_ATTR(rvalue, "config:32"); +PMU_FORMAT_ATTR(mode, "config:33-40"); +static struct attribute *imc_format_attrs[] = { + &format_attr_event.attr, + &format_attr_offset.attr, + &format_attr_rvalue.attr, + &format_attr_mode.attr, + NULL, +}; + +static struct attribute_group imc_format_group = { + .name = "format", + .attrs = imc_format_attrs, +}; + +/* Get the cpumask printed to a buffer "buf" */ +static ssize_t imc_pmu_cpumask_get_attr(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct pmu *pmu = dev_get_drvdata(dev); + struct imc_pmu *imc_pmu = container_of(pmu, struct imc_pmu, pmu); + cpumask_t *active_mask; + + switch(imc_pmu->domain){ + case IMC_DOMAIN_NEST: + active_mask = &nest_imc_cpumask; + break; + case IMC_DOMAIN_CORE: + active_mask = &core_imc_cpumask; + break; + default: + return 0; + } + + return cpumap_print_to_pagebuf(true, buf, active_mask); +} + +static DEVICE_ATTR(cpumask, S_IRUGO, imc_pmu_cpumask_get_attr, NULL); + +static struct attribute *imc_pmu_cpumask_attrs[] = { + &dev_attr_cpumask.attr, + NULL, +}; + +static struct attribute_group imc_pmu_cpumask_attr_group = { + .attrs = imc_pmu_cpumask_attrs, +}; + +/* device_str_attr_create : Populate event "name" and string "str" in attribute */ +static struct attribute *device_str_attr_create(const char *name, const char *str) +{ + struct perf_pmu_events_attr *attr; + + attr = kzalloc(sizeof(*attr), GFP_KERNEL); + if (!attr) + return NULL; + sysfs_attr_init(&attr->attr.attr); + + attr->event_str = str; + attr->attr.attr.name = name; + attr->attr.attr.mode = 0444; + attr->attr.show = perf_event_sysfs_show; + + return &attr->attr.attr; +} + +struct imc_events *imc_parse_event(struct device_node *np, const char *scale, + const char *unit, const char *prefix, u32 base) +{ + struct imc_events *event; + const char *s; + u32 reg; + + event = kzalloc(sizeof(struct imc_events), GFP_KERNEL); + if (!event) + return NULL; + + if (of_property_read_u32(np, "reg", ®)) + goto error; + /* Add the base_reg value to the "reg" */ + event->value = base + reg; + + if (of_property_read_string(np, "event-name", &s)) + goto error; + + event->name = kasprintf(GFP_KERNEL, "%s%s", prefix, s); + if (!event->name) + goto error; + + if (of_property_read_string(np, "scale", &s)) + 
s = scale; + + if (s) { + event->scale = kstrdup(s, GFP_KERNEL); + if (!event->scale) + goto error; + } + + if (of_property_read_string(np, "unit", &s)) + s = unit; + + if (s) { + event->unit = kstrdup(s, GFP_KERNEL); + if (!event->unit) + goto error; + } + + return event; +error: + kfree(event->unit); + kfree(event->scale); + kfree(event->name); + kfree(event); + + return NULL; +} + +/* + * update_events_in_group: Update the "events" information in an attr_group + * and assign the attr_group to the pmu "pmu". + */ +static int update_events_in_group(struct device_node *node, struct imc_pmu *pmu) +{ + struct attribute_group *attr_group; + struct attribute **attrs, *dev_str; + struct device_node *np, *pmu_events; + struct imc_events *ev; + u32 handle, base_reg; + int i=0, j=0, ct; + const char *prefix, *g_scale, *g_unit; + const char *ev_val_str, *ev_scale_str, *ev_unit_str; + + if (!of_property_read_u32(node, "events", &handle)) + pmu_events = of_find_node_by_phandle(handle); + else + return 0; + + /* Did not find any node with a given phandle */ + if (!pmu_events) + return 0; + + /* Get a count of number of child nodes */ + ct = of_get_child_count(pmu_events); + + /* Get the event prefix */ + if (of_property_read_string(node, "events-prefix", &prefix)) + return 0; + + /* Get a global unit and scale data if available */ + if (of_property_read_string(node, "scale", &g_scale)) + g_scale = NULL; + + if (of_property_read_string(node, "unit", &g_unit)) + g_unit = NULL; + + /* "reg" property gives out the base offset of the counters data */ + of_property_read_u32(node, "reg", &base_reg); + + /* Allocate memory for the events */ + pmu->events = kcalloc(ct, sizeof(struct imc_events), GFP_KERNEL); + if (!pmu->events) + return -ENOMEM; + + ct = 0; + /* Parse the events and update the struct */ + for_each_child_of_node(pmu_events, np) { + ev = imc_parse_event(np, g_scale, g_unit, prefix, base_reg); + if (ev) + pmu->events[ct++] = ev; + } + + /* Allocate memory for attribute group */ + attr_group = kzalloc(sizeof(*attr_group), GFP_KERNEL); + if (!attr_group) + return -ENOMEM; + + /* + * Allocate memory for attributes. + * Since we have count of events for this pmu, we also allocate + * memory for the scale and unit attribute for now. + * "ct" has the total event structs added from the events-parent node. + * So allocate three times the "ct" (this includes event, event_scale and + * event_unit). 
+ */ + attrs = kcalloc(((ct * 3) + 1), sizeof(struct attribute *), GFP_KERNEL); + if (!attrs) { + kfree(attr_group); + kfree(pmu->events); + return -ENOMEM; + } + + attr_group->name = "events"; + attr_group->attrs = attrs; + do { + ev_val_str = kasprintf(GFP_KERNEL, "event=0x%x", pmu->events[i]->value); + dev_str = device_str_attr_create(pmu->events[i]->name, ev_val_str); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + if (pmu->events[i]->scale) { + ev_scale_str = kasprintf(GFP_KERNEL, "%s.scale",pmu->events[i]->name); + dev_str = device_str_attr_create(ev_scale_str, pmu->events[i]->scale); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + } + + if (pmu->events[i]->unit) { + ev_unit_str = kasprintf(GFP_KERNEL, "%s.unit",pmu->events[i]->name); + dev_str = device_str_attr_create(ev_unit_str, pmu->events[i]->unit); + if (!dev_str) + continue; + + attrs[j++] = dev_str; + } + } while (++i < ct); + + /* Save the event attribute */ + pmu->attr_groups[IMC_EVENT_ATTR] = attr_group; + + kfree(pmu->events); + return 0; +} + +/* get_nest_pmu_ref: Return the imc_pmu_ref struct for the given node */ +static struct imc_pmu_ref *get_nest_pmu_ref(int cpu) +{ + return per_cpu(local_nest_imc_refc, cpu); +} + +static void nest_change_cpu_context(int old_cpu, int new_cpu) +{ + struct imc_pmu **pn = per_nest_pmu_arr; + int i; + + if (old_cpu < 0 || new_cpu < 0) + return; + + for (i = 0; *pn && i < IMC_MAX_PMUS; i++, pn++) + perf_pmu_migrate_context(&(*pn)->pmu, old_cpu, new_cpu); +} + +static int ppc_nest_imc_cpu_offline(unsigned int cpu) +{ + int nid, target = -1; + const struct cpumask *l_cpumask; + struct imc_pmu_ref *ref; + + /* + * Check in the designated list for this cpu. Dont bother + * if not one of them. + */ + if (!cpumask_test_and_clear_cpu(cpu, &nest_imc_cpumask)) + return 0; + + /* + * Now that this cpu is one of the designated, + * find a next cpu a) which is online and b) in same chip. + */ + nid = cpu_to_node(cpu); + l_cpumask = cpumask_of_node(nid); + target = cpumask_any_but(l_cpumask, cpu); + + /* + * Update the cpumask with the target cpu and + * migrate the context if needed + */ + if (target >= 0 && target < nr_cpu_ids) { + cpumask_set_cpu(target, &nest_imc_cpumask); + nest_change_cpu_context(cpu, target); + } else { + opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + /* + * If this is the last cpu in this chip then, skip the reference + * count mutex lock and make the reference count on this chip zero. + */ + ref = get_nest_pmu_ref(cpu); + if (!ref) + return -EINVAL; + + ref->refc = 0; + } + return 0; +} + +static int ppc_nest_imc_cpu_online(unsigned int cpu) +{ + const struct cpumask *l_cpumask; + static struct cpumask tmp_mask; + int res; + + /* Get the cpumask of this node */ + l_cpumask = cpumask_of_node(cpu_to_node(cpu)); + + /* + * If this is not the first online CPU on this node, then + * just return. + */ + if (cpumask_and(&tmp_mask, l_cpumask, &nest_imc_cpumask)) + return 0; + + /* + * If this is the first online cpu on this node + * disable the nest counters by making an OPAL call. 
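From userspace, the nest and core IMC PMUs registered by this file behave as counting-only, CPU-bound event sources: the event_init paths below reject sampling and a negative event->cpu, and the cpumask attribute above advertises the designated CPU. A hypothetical perf_event_open() consumer; the type and config values are placeholders that real code would read from /sys/bus/event_source/devices/<pmu>/:

/* cc -o imc_count imc_count.c && ./imc_count */
#include <stdio.h>
#include <stdint.h>
#include <string.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/perf_event.h>

int main(void)
{
	struct perf_event_attr attr;
	uint64_t count;

	memset(&attr, 0, sizeof(attr));
	attr.size = sizeof(attr);
	attr.type = 10;		/* placeholder: read .../devices/<pmu>/type */
	attr.config = 0x118;	/* placeholder: an offset from events/ */
	/* No sample_period: IMC events are counting-only. */

	/* cpu must be a CPU from the PMU's cpumask file; pid must be -1. */
	int fd = syscall(__NR_perf_event_open, &attr, -1 /* pid */,
			 0 /* cpu */, -1 /* group */, 0 /* flags */);
	if (fd < 0) {
		perror("perf_event_open");
		return 1;
	}
	sleep(1);
	if (read(fd, &count, sizeof(count)) == sizeof(count))
		printf("count = %llu\n", (unsigned long long)count);
	close(fd);
	return 0;
}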
+ */ + res = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + if (res) + return res; + + /* Make this CPU the designated target for counter collection */ + cpumask_set_cpu(cpu, &nest_imc_cpumask); + return 0; +} + +static int nest_pmu_cpumask_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE, + "perf/powerpc/imc:online", + ppc_nest_imc_cpu_online, + ppc_nest_imc_cpu_offline); +} + +static void nest_imc_counters_release(struct perf_event *event) +{ + int rc, node_id; + struct imc_pmu_ref *ref; + + if (event->cpu < 0) + return; + + node_id = cpu_to_node(event->cpu); + + /* + * See if we need to disable the nest PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * enable or disable the nest counters. + */ + ref = get_nest_pmu_ref(event->cpu); + if (!ref) + return; + + /* Take the mutex lock for this node and then decrement the reference count */ + mutex_lock(&ref->lock); + if (ref->refc == 0) { + /* + * The scenario where this is true is, when perf session is + * started, followed by offlining of all cpus in a given node. + * + * In the cpuhotplug offline path, ppc_nest_imc_cpu_offline() + * function set the ref->count to zero, if the cpu which is + * about to offline is the last cpu in a given node and make + * an OPAL call to disable the engine in that node. + * + */ + mutex_unlock(&ref->lock); + return; + } + ref->refc--; + if (ref->refc == 0) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("nest-imc: Unable to stop the counters for core %d\n", node_id); + return; + } + } else if (ref->refc < 0) { + WARN(1, "nest-imc: Invalid event reference count\n"); + ref->refc = 0; + } + mutex_unlock(&ref->lock); +} + +static int nest_imc_event_init(struct perf_event *event) +{ + int chip_id, rc, node_id; + u32 l_config, config = event->attr.config; + struct imc_mem_info *pcni; + struct imc_pmu *pmu; + struct imc_pmu_ref *ref; + bool flag = false; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + pmu = imc_event_to_pmu(event); + + /* Sanity check for config (event offset) */ + if ((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size) + return -EINVAL; + + /* + * Nest HW counter memory resides in a per-chip reserve-memory (HOMER). + * Get the base memory addresss for this cpu. + */ + chip_id = topology_physical_package_id(event->cpu); + pcni = pmu->mem_info; + do { + if (pcni->id == chip_id) { + flag = true; + break; + } + pcni++; + } while (pcni); + + if (!flag) + return -ENODEV; + + /* + * Add the event offset to the base address. + */ + l_config = config & IMC_EVENT_OFFSET_MASK; + event->hw.event_base = (u64)pcni->vbase + l_config; + node_id = cpu_to_node(event->cpu); + + /* + * Get the imc_pmu_ref struct for this node. + * Take the mutex lock and then increment the count of nest pmu events + * inited. 
+ */ + ref = get_nest_pmu_ref(event->cpu); + if (!ref) + return -EINVAL; + + mutex_lock(&ref->lock); + if (ref->refc == 0) { + rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("nest-imc: Unable to start the counters for node %d\n", + node_id); + return rc; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + event->destroy = nest_imc_counters_release; + return 0; +} + +/* + * core_imc_mem_init : Initializes memory for the current core. + * + * Uses alloc_pages_node() and uses the returned address as an argument to + * an opal call to configure the pdbar. The address sent as an argument is + * converted to physical address before the opal call is made. This is the + * base address at which the core imc counters are populated. + */ +static int core_imc_mem_init(int cpu, int size) +{ + int phys_id, rc = 0, core_id = (cpu / threads_per_core); + struct imc_mem_info *mem_info; + + /* + * alloc_pages_node() will allocate memory for core in the + * local node only. + */ + phys_id = topology_physical_package_id(cpu); + mem_info = &core_imc_pmu->mem_info[core_id]; + mem_info->id = core_id; + + /* We need only vbase for core counters */ + mem_info->vbase = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!mem_info->vbase) + return -ENOMEM; + + /* Init the mutex */ + core_imc_refc[core_id].id = core_id; + mutex_init(&core_imc_refc[core_id].lock); + + rc = opal_imc_counters_init(OPAL_IMC_COUNTERS_CORE, + __pa((void *)mem_info->vbase), + get_hard_smp_processor_id(cpu)); + if (rc) { + free_pages((u64)mem_info->vbase, get_order(size)); + mem_info->vbase = NULL; + } + + return rc; +} + +static bool is_core_imc_mem_inited(int cpu) +{ + struct imc_mem_info *mem_info; + int core_id = (cpu / threads_per_core); + + mem_info = &core_imc_pmu->mem_info[core_id]; + if (!mem_info->vbase) + return false; + + return true; +} + +static int ppc_core_imc_cpu_online(unsigned int cpu) +{ + const struct cpumask *l_cpumask; + static struct cpumask tmp_mask; + int ret = 0; + + /* Get the cpumask for this core */ + l_cpumask = cpu_sibling_mask(cpu); + + /* If a cpu for this core is already set, then, don't do anything */ + if (cpumask_and(&tmp_mask, l_cpumask, &core_imc_cpumask)) + return 0; + + if (!is_core_imc_mem_inited(cpu)) { + ret = core_imc_mem_init(cpu, core_imc_pmu->counter_mem_size); + if (ret) { + pr_info("core_imc memory allocation for cpu %d failed\n", cpu); + return ret; + } + } + + /* set the cpu in the mask */ + cpumask_set_cpu(cpu, &core_imc_cpumask); + return 0; +} + +static int ppc_core_imc_cpu_offline(unsigned int cpu) +{ + unsigned int ncpu, core_id; + struct imc_pmu_ref *ref; + + /* + * clear this cpu out of the mask, if not present in the mask, + * don't bother doing anything. + */ + if (!cpumask_test_and_clear_cpu(cpu, &core_imc_cpumask)) + return 0; + + /* Find any online cpu in that core except the current "cpu" */ + ncpu = cpumask_any_but(cpu_sibling_mask(cpu), cpu); + + if (ncpu >= 0 && ncpu < nr_cpu_ids) { + cpumask_set_cpu(ncpu, &core_imc_cpumask); + perf_pmu_migrate_context(&core_imc_pmu->pmu, cpu, ncpu); + } else { + /* + * If this is the last cpu in this core then, skip taking refernce + * count mutex lock for this core and directly zero "refc" for + * this core. 
+ */ + opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(cpu)); + core_id = cpu / threads_per_core; + ref = &core_imc_refc[core_id]; + if (!ref) + return -EINVAL; + + ref->refc = 0; + } + return 0; +} + +static int core_imc_pmu_cpumask_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE, + "perf/powerpc/imc_core:online", + ppc_core_imc_cpu_online, + ppc_core_imc_cpu_offline); +} + +static void core_imc_counters_release(struct perf_event *event) +{ + int rc, core_id; + struct imc_pmu_ref *ref; + + if (event->cpu < 0) + return; + /* + * See if we need to disable the IMC PMU. + * If no events are currently in use, then we have to take a + * mutex to ensure that we don't race with another task doing + * enable or disable the core counters. + */ + core_id = event->cpu / threads_per_core; + + /* Take the mutex lock and decrement the refernce count for this core */ + ref = &core_imc_refc[core_id]; + if (!ref) + return; + + mutex_lock(&ref->lock); + if (ref->refc == 0) { + /* + * The scenario where this is true is, when perf session is + * started, followed by offlining of all cpus in a given core. + * + * In the cpuhotplug offline path, ppc_core_imc_cpu_offline() + * function set the ref->count to zero, if the cpu which is + * about to offline is the last cpu in a given core and make + * an OPAL call to disable the engine in that core. + * + */ + mutex_unlock(&ref->lock); + return; + } + ref->refc--; + if (ref->refc == 0) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("IMC: Unable to stop the counters for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + WARN(1, "core-imc: Invalid event reference count\n"); + ref->refc = 0; + } + mutex_unlock(&ref->lock); +} + +static int core_imc_event_init(struct perf_event *event) +{ + int core_id, rc; + u64 config = event->attr.config; + struct imc_mem_info *pcmi; + struct imc_pmu *pmu; + struct imc_pmu_ref *ref; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + /* unsupported modes and filters */ + if (event->attr.exclude_user || + event->attr.exclude_kernel || + event->attr.exclude_hv || + event->attr.exclude_idle || + event->attr.exclude_host || + event->attr.exclude_guest) + return -EINVAL; + + if (event->cpu < 0) + return -EINVAL; + + event->hw.idx = -1; + pmu = imc_event_to_pmu(event); + + /* Sanity check for config (event offset) */ + if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)) + return -EINVAL; + + if (!is_core_imc_mem_inited(event->cpu)) + return -ENODEV; + + core_id = event->cpu / threads_per_core; + pcmi = &core_imc_pmu->mem_info[core_id]; + if ((!pcmi->vbase)) + return -ENODEV; + + /* Get the core_imc mutex for this core */ + ref = &core_imc_refc[core_id]; + if (!ref) + return -EINVAL; + + /* + * Core pmu units are enabled only when it is used. + * See if this is triggered for the first time. + * If yes, take the mutex lock and enable the core counters. + * If not, just increment the count in core_imc_refc struct. 
+ */ + mutex_lock(&ref->lock); + if (ref->refc == 0) { + rc = opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(event->cpu)); + if (rc) { + mutex_unlock(&ref->lock); + pr_err("core-imc: Unable to start the counters for core %d\n", + core_id); + return rc; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + + event->hw.event_base = (u64)pcmi->vbase + (config & IMC_EVENT_OFFSET_MASK); + event->destroy = core_imc_counters_release; + return 0; +} + +/* + * Allocates a page of memory for each of the online cpus, and write the + * physical base address of that page to the LDBAR for that cpu. + * + * LDBAR Register Layout: + * + * 0 4 8 12 16 20 24 28 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * | | [ ] [ Counter Address [8:50] + * | * Mode | + * | * PB Scope + * * Enable/Disable + * + * 32 36 40 44 48 52 56 60 + * | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | - - - - | + * Counter Address [8:50] ] + * + */ +static int thread_imc_mem_alloc(int cpu_id, int size) +{ + u64 ldbar_value, *local_mem = per_cpu(thread_imc_mem, cpu_id); + int phys_id = topology_physical_package_id(cpu_id); + + if (!local_mem) { + /* + * This case could happen only once at start, since we dont + * free the memory in cpu offline path. + */ + local_mem = page_address(alloc_pages_node(phys_id, + GFP_KERNEL | __GFP_ZERO | __GFP_THISNODE | + __GFP_NOWARN, get_order(size))); + if (!local_mem) + return -ENOMEM; + + per_cpu(thread_imc_mem, cpu_id) = local_mem; + } + + ldbar_value = ((u64)local_mem & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE; + + mtspr(SPRN_LDBAR, ldbar_value); + return 0; +} + +static int ppc_thread_imc_cpu_online(unsigned int cpu) +{ + return thread_imc_mem_alloc(cpu, thread_imc_mem_size); +} + +static int ppc_thread_imc_cpu_offline(unsigned int cpu) +{ + mtspr(SPRN_LDBAR, 0); + return 0; +} + +static int thread_imc_cpu_init(void) +{ + return cpuhp_setup_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE, + "perf/powerpc/imc_thread:online", + ppc_thread_imc_cpu_online, + ppc_thread_imc_cpu_offline); +} + +void thread_imc_pmu_sched_task(struct perf_event_context *ctx, + bool sched_in) +{ + int core_id; + struct imc_pmu_ref *ref; + + if (!is_core_imc_mem_inited(smp_processor_id())) + return; + + core_id = smp_processor_id() / threads_per_core; + /* + * imc pmus are enabled only when it is used. + * See if this is triggered for the first time. + * If yes, take the mutex lock and enable the counters. + * If not, just increment the count in ref count struct. 
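To make the LDBAR diagram above concrete, here is the same composition that thread_imc_mem_alloc() performs, done in userspace with assumed mask and enable constants: bit 0 of the register enables the engine and the counter address occupies the middle field, so the page address is masked in and the top bit is OR-ed on:

/* cc -o ldbar ldbar.c && ./ldbar */
#include <stdio.h>
#include <stdint.h>

/* Constants assumed for illustration. */
#define THREAD_IMC_LDBAR_MASK 0x0003ffffffffe000ULL
#define THREAD_IMC_ENABLE     0x8000000000000000ULL

int main(void)
{
	uint64_t vbase = 0x0000200012340000ULL;	/* made-up page address */
	uint64_t ldbar = (vbase & THREAD_IMC_LDBAR_MASK) | THREAD_IMC_ENABLE;

	printf("LDBAR = 0x%016llx\n", (unsigned long long)ldbar);
	return 0;
}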
+ */ + ref = &core_imc_refc[core_id]; + if (!ref) + return; + + if (sched_in) { + mutex_lock(&ref->lock); + if (ref->refc == 0) { + if (opal_imc_counters_start(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("thread-imc: Unable to start the counter\ + for core %d\n", core_id); + return; + } + } + ++ref->refc; + mutex_unlock(&ref->lock); + } else { + mutex_lock(&ref->lock); + ref->refc--; + if (ref->refc == 0) { + if (opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(smp_processor_id()))) { + mutex_unlock(&ref->lock); + pr_err("thread-imc: Unable to stop the counters\ + for core %d\n", core_id); + return; + } + } else if (ref->refc < 0) { + ref->refc = 0; + } + mutex_unlock(&ref->lock); + } + + return; +} + +static int thread_imc_event_init(struct perf_event *event) +{ + u32 config = event->attr.config; + struct task_struct *target; + struct imc_pmu *pmu; + + if (event->attr.type != event->pmu->type) + return -ENOENT; + + /* Sampling not supported */ + if (event->hw.sample_period) + return -EINVAL; + + event->hw.idx = -1; + pmu = imc_event_to_pmu(event); + + /* Sanity check for config offset */ + if (((config & IMC_EVENT_OFFSET_MASK) > pmu->counter_mem_size)) + return -EINVAL; + + target = event->hw.target; + if (!target) + return -EINVAL; + + event->pmu->task_ctx_nr = perf_sw_context; + return 0; +} + +static bool is_thread_imc_pmu(struct perf_event *event) +{ + if (!strncmp(event->pmu->name, "thread_imc", strlen("thread_imc"))) + return true; + + return false; +} + +static u64 * get_event_base_addr(struct perf_event *event) +{ + u64 addr; + + if (is_thread_imc_pmu(event)) { + addr = (u64)per_cpu(thread_imc_mem, smp_processor_id()); + return (u64 *)(addr + (event->attr.config & IMC_EVENT_OFFSET_MASK)); + } + + return (u64 *)event->hw.event_base; +} + +static void thread_imc_pmu_start_txn(struct pmu *pmu, + unsigned int txn_flags) +{ + if (txn_flags & ~PERF_PMU_TXN_ADD) + return; + perf_pmu_disable(pmu); +} + +static void thread_imc_pmu_cancel_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); +} + +static int thread_imc_pmu_commit_txn(struct pmu *pmu) +{ + perf_pmu_enable(pmu); + return 0; +} + +static u64 imc_read_counter(struct perf_event *event) +{ + u64 *addr, data; + + /* + * In-Memory Collection (IMC) counters are free flowing counters. + * So we take a snapshot of the counter value on enable and save it + * to calculate the delta at later stage to present the event counter + * value. + */ + addr = get_event_base_addr(event); + data = be64_to_cpu(READ_ONCE(*addr)); + local64_set(&event->hw.prev_count, data); + + return data; +} + +static void imc_event_update(struct perf_event *event) +{ + u64 counter_prev, counter_new, final_count; + + counter_prev = local64_read(&event->hw.prev_count); + counter_new = imc_read_counter(event); + final_count = counter_new - counter_prev; + + /* Update the delta to the event count */ + local64_add(final_count, &event->count); +} + +static void imc_event_start(struct perf_event *event, int flags) +{ + /* + * In Memory Counters are free flowing counters. HW or the microcode + * keeps adding to the counter offset in memory. To get event + * counter value, we snapshot the value here and we calculate + * delta at later point. + */ + imc_read_counter(event); +} + +static void imc_event_stop(struct perf_event *event, int flags) +{ + /* + * Take a snapshot and calculate the delta and update + * the event counter values. 
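The prev_count/delta dance in imc_read_counter() and imc_event_update() above is the standard pattern for free-running counters; a tiny demonstration that unsigned subtraction also gives the right answer across a counter wrap:

/* cc -o delta delta.c && ./delta */
#include <stdio.h>
#include <stdint.h>

/* Free-running counter: the event value is always now - prev, and
 * modulo-2^64 arithmetic keeps the delta correct across a wrap. */
static uint64_t delta(uint64_t prev, uint64_t now)
{
	return now - prev;
}

int main(void)
{
	printf("%llu\n", (unsigned long long)delta(100, 250));	/* 150 */
	printf("%llu\n", (unsigned long long)
	       delta(0xfffffffffffffff0ULL, 0x10ULL));		/* 32 */
	return 0;
}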
+ */ + imc_event_update(event); +} + +static int imc_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + imc_event_start(event, flags); + + return 0; +} + +static int thread_imc_event_add(struct perf_event *event, int flags) +{ + if (flags & PERF_EF_START) + imc_event_start(event, flags); + + /* Enable the sched_task to start the engine */ + perf_sched_cb_inc(event->ctx->pmu); + return 0; +} + +static void thread_imc_event_del(struct perf_event *event, int flags) +{ + /* + * Take a snapshot and calculate the delta and update + * the event counter values. + */ + imc_event_update(event); + perf_sched_cb_dec(event->ctx->pmu); +} + +/* update_pmu_ops : Populate the appropriate operations for "pmu" */ +static int update_pmu_ops(struct imc_pmu *pmu) +{ + pmu->pmu.task_ctx_nr = perf_invalid_context; + pmu->pmu.add = imc_event_add; + pmu->pmu.del = imc_event_stop; + pmu->pmu.start = imc_event_start; + pmu->pmu.stop = imc_event_stop; + pmu->pmu.read = imc_event_update; + pmu->pmu.attr_groups = pmu->attr_groups; + pmu->attr_groups[IMC_FORMAT_ATTR] = &imc_format_group; + + switch (pmu->domain) { + case IMC_DOMAIN_NEST: + pmu->pmu.event_init = nest_imc_event_init; + pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; + break; + case IMC_DOMAIN_CORE: + pmu->pmu.event_init = core_imc_event_init; + pmu->attr_groups[IMC_CPUMASK_ATTR] = &imc_pmu_cpumask_attr_group; + break; + case IMC_DOMAIN_THREAD: + pmu->pmu.event_init = thread_imc_event_init; + pmu->pmu.sched_task = thread_imc_pmu_sched_task; + pmu->pmu.add = thread_imc_event_add; + pmu->pmu.del = thread_imc_event_del; + pmu->pmu.start_txn = thread_imc_pmu_start_txn; + pmu->pmu.cancel_txn = thread_imc_pmu_cancel_txn; + pmu->pmu.commit_txn = thread_imc_pmu_commit_txn; + break; + default: + break; + } + + return 0; +} + +/* init_nest_pmu_ref: Initialize the imc_pmu_ref struct for all the nodes */ +static int init_nest_pmu_ref(void) +{ + int nid, i, cpu; + + nest_imc_refc = kcalloc(num_possible_nodes(), sizeof(*nest_imc_refc), + GFP_KERNEL); + + if (!nest_imc_refc) + return -ENOMEM; + + i = 0; + for_each_node(nid) { + /* + * Mutex lock to avoid races while tracking the number of + * sessions using the chip's nest pmu units. + */ + mutex_init(&nest_imc_refc[i].lock); + + /* + * Loop to init the "id" with the node_id. Variable "i" initialized to + * 0 and will be used as index to the array. "i" will not go off the + * end of the array since the "for_each_node" loops for "N_POSSIBLE" + * nodes only. + */ + nest_imc_refc[i++].id = nid; + } + + /* + * Loop to init the per_cpu "local_nest_imc_refc" with the proper + * "nest_imc_refc" index. This makes get_nest_pmu_ref() alot simple. + */ + for_each_possible_cpu(cpu) { + nid = cpu_to_node(cpu); + for (i = 0; i < num_possible_nodes(); i++) { + if (nest_imc_refc[i].id == nid) { + per_cpu(local_nest_imc_refc, cpu) = &nest_imc_refc[i]; + break; + } + } + } + return 0; +} + +static void cleanup_all_core_imc_memory(void) +{ + int i, nr_cores = num_present_cpus() / threads_per_core; + struct imc_mem_info *ptr = core_imc_pmu->mem_info; + int size = core_imc_pmu->counter_mem_size; + + /* mem_info will never be NULL */ + for (i = 0; i < nr_cores; i++) { + if (ptr[i].vbase) + free_pages((u64)ptr->vbase, get_order(size)); + } + + kfree(ptr); + kfree(core_imc_refc); +} + +static void thread_imc_ldbar_disable(void *dummy) +{ + /* + * By Zeroing LDBAR, we disable thread-imc + * updates. 
+ */ + mtspr(SPRN_LDBAR, 0); +} + +void thread_imc_disable(void) +{ + on_each_cpu(thread_imc_ldbar_disable, NULL, 1); +} + +static void cleanup_all_thread_imc_memory(void) +{ + int i, order = get_order(thread_imc_mem_size); + + for_each_online_cpu(i) { + if (per_cpu(thread_imc_mem, i)) + free_pages((u64)per_cpu(thread_imc_mem, i), order); + + } +} + +/* + * Common function to unregister cpu hotplug callback and + * free the memory. + * TODO: Need to handle pmu unregistering, which will be + * done in followup series. + */ +static void imc_common_cpuhp_mem_free(struct imc_pmu *pmu_ptr) +{ + if (pmu_ptr->domain == IMC_DOMAIN_NEST) { + mutex_lock(&nest_init_lock); + if (nest_pmus == 1) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_NEST_IMC_ONLINE); + kfree(nest_imc_refc); + } + + if (nest_pmus > 0) + nest_pmus--; + mutex_unlock(&nest_init_lock); + } + + /* Free core_imc memory */ + if (pmu_ptr->domain == IMC_DOMAIN_CORE) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_CORE_IMC_ONLINE); + cleanup_all_core_imc_memory(); + } + + /* Free thread_imc memory */ + if (pmu_ptr->domain == IMC_DOMAIN_THREAD) { + cpuhp_remove_state(CPUHP_AP_PERF_POWERPC_THREAD_IMC_ONLINE); + cleanup_all_thread_imc_memory(); + } + + /* Only free the attr_groups which are dynamically allocated */ + if (pmu_ptr->attr_groups[IMC_EVENT_ATTR]) + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]->attrs); + kfree(pmu_ptr->attr_groups[IMC_EVENT_ATTR]); + kfree(pmu_ptr); + return; +} + + +/* + * imc_mem_init : Function to support memory allocation for core imc. + */ +static int imc_mem_init(struct imc_pmu *pmu_ptr, struct device_node *parent, + int pmu_index) +{ + const char *s; + int nr_cores, cpu, res; + + if (of_property_read_string(parent, "name", &s)) + return -ENODEV; + + switch (pmu_ptr->domain) { + case IMC_DOMAIN_NEST: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s_imc", "nest_", s); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + /* Needed for hotplug/migration */ + per_nest_pmu_arr[pmu_index] = pmu_ptr; + break; + case IMC_DOMAIN_CORE: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + nr_cores = num_present_cpus() / threads_per_core; + pmu_ptr->mem_info = kcalloc(nr_cores, sizeof(struct imc_mem_info), + GFP_KERNEL); + + if (!pmu_ptr->mem_info) + return -ENOMEM; + + core_imc_refc = kcalloc(nr_cores, sizeof(struct imc_pmu_ref), + GFP_KERNEL); + + if (!core_imc_refc) + return -ENOMEM; + + core_imc_pmu = pmu_ptr; + break; + case IMC_DOMAIN_THREAD: + /* Update the pmu name */ + pmu_ptr->pmu.name = kasprintf(GFP_KERNEL, "%s%s", s, "_imc"); + if (!pmu_ptr->pmu.name) + return -ENOMEM; + + thread_imc_mem_size = pmu_ptr->counter_mem_size; + for_each_online_cpu(cpu) { + res = thread_imc_mem_alloc(cpu, pmu_ptr->counter_mem_size); + if (res) + return res; + } + + thread_imc_pmu = pmu_ptr; + break; + default: + return -EINVAL; + } + + return 0; +} + +/* + * init_imc_pmu : Setup and register the IMC pmu device. + * + * @parent: Device tree unit node + * @pmu_ptr: memory allocated for this pmu + * @pmu_idx: Count of nest pmc registered + * + * init_imc_pmu() setup pmu cpumask and registers for a cpu hotplug callback. + * Handles failure cases and accordingly frees memory. 
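The nest teardown in imc_common_cpuhp_mem_free() above and the registration path in init_imc_pmu() below share a counted first-in/last-out pattern under nest_init_lock: the first nest PMU sets up the shared refcounts and cpuhp state, the last one tears them down. A compact userspace sketch of that pattern, with a pthread mutex standing in for the kernel mutex:

/* cc -pthread -o nestref nestref.c && ./nestref */
#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t nest_init_lock = PTHREAD_MUTEX_INITIALIZER;
static int nest_pmus;

/* The first registered nest PMU initializes the shared state... */
static void nest_register(void)
{
	pthread_mutex_lock(&nest_init_lock);
	if (nest_pmus == 0)
		printf("init refc + cpuhp state\n");
	nest_pmus++;
	pthread_mutex_unlock(&nest_init_lock);
}

/* ...and the last unregistered one tears it down. */
static void nest_unregister(void)
{
	pthread_mutex_lock(&nest_init_lock);
	if (nest_pmus == 1)
		printf("remove cpuhp state + free refc\n");
	if (nest_pmus > 0)
		nest_pmus--;
	pthread_mutex_unlock(&nest_init_lock);
}

int main(void)
{
	nest_register();	/* init happens here */
	nest_register();	/* only the count changes */
	nest_unregister();
	nest_unregister();	/* teardown happens here */
	return 0;
}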
+ */ +int init_imc_pmu(struct device_node *parent, struct imc_pmu *pmu_ptr, int pmu_idx) +{ + int ret; + + ret = imc_mem_init(pmu_ptr, parent, pmu_idx); + if (ret) + goto err_free; + + switch (pmu_ptr->domain) { + case IMC_DOMAIN_NEST: + /* + * Nest imc pmu need only one cpu per chip, we initialize the + * cpumask for the first nest imc pmu and use the same for the + * rest. To handle the cpuhotplug callback unregister, we track + * the number of nest pmus in "nest_pmus". + */ + mutex_lock(&nest_init_lock); + if (nest_pmus == 0) { + ret = init_nest_pmu_ref(); + if (ret) { + mutex_unlock(&nest_init_lock); + goto err_free; + } + /* Register for cpu hotplug notification. */ + ret = nest_pmu_cpumask_init(); + if (ret) { + mutex_unlock(&nest_init_lock); + goto err_free; + } + } + nest_pmus++; + mutex_unlock(&nest_init_lock); + break; + case IMC_DOMAIN_CORE: + ret = core_imc_pmu_cpumask_init(); + if (ret) { + cleanup_all_core_imc_memory(); + return ret; + } + + break; + case IMC_DOMAIN_THREAD: + ret = thread_imc_cpu_init(); + if (ret) { + cleanup_all_thread_imc_memory(); + return ret; + } + + break; + default: + return -1; /* Unknown domain */ + } + + ret = update_events_in_group(parent, pmu_ptr); + if (ret) + goto err_free; + + ret = update_pmu_ops(pmu_ptr); + if (ret) + goto err_free; + + ret = perf_pmu_register(&pmu_ptr->pmu, pmu_ptr->pmu.name, -1); + if (ret) + goto err_free; + + pr_info("%s performance monitor hardware support registered\n", + pmu_ptr->pmu.name); + + return 0; + +err_free: + imc_common_cpuhp_mem_free(pmu_ptr); + return ret; +} --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/Makefile +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/Makefile @@ -12,3 +12,4 @@ obj-$(CONFIG_MEMORY_FAILURE) += opal-memory-errors.o obj-$(CONFIG_TRACEPOINTS) += opal-tracepoints.o obj-$(CONFIG_OPAL_PRD) += opal-prd.o +obj-$(CONFIG_PERF_EVENTS) += opal-imc.o --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/idle.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/idle.c @@ -69,7 +69,7 @@ * all cpus at boot. Get these reg values of current cpu and use the * same across all cpus. */ - uint64_t lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + uint64_t lpcr_val = mfspr(SPRN_LPCR); uint64_t hid0_val = mfspr(SPRN_HID0); uint64_t hid1_val = mfspr(SPRN_HID1); uint64_t hid4_val = mfspr(SPRN_HID4); @@ -388,6 +388,20 @@ } #ifdef CONFIG_HOTPLUG_CPU +static void pnv_program_cpu_hotplug_lpcr(unsigned int cpu, u64 lpcr_val) +{ + u64 pir = get_hard_smp_processor_id(cpu); + + mtspr(SPRN_LPCR, lpcr_val); + + /* + * Program the LPCR via stop-api only for deepest stop state + * can lose hypervisor context. + */ + if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT) + opal_slw_set_reg(pir, SPRN_LPCR, lpcr_val); +} + /* * pnv_cpu_offline: A function that puts the CPU into the deepest * available platform idle state on a CPU-Offline. @@ -397,6 +411,20 @@ { unsigned long srr1; u32 idle_states = pnv_get_supported_cpuidle_states(); + u64 lpcr_val; + + /* + * We don't want to take decrementer interrupts while we are + * offline, so clear LPCR:PECE1. We keep PECE2 (and + * LPCR_PECE_HVEE on P9) enabled as to let IPIs in. + * + * If the CPU gets woken up by a special wakeup, ensure that + * the SLW engine sets LPCR with decrementer bit cleared, else + * the CPU will come back to the kernel due to a spurious + * wakeup. 
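A small model of the LPCR handling this hunk adds around CPU hotplug: clear PECE1 going offline, restore it coming back online, and mirror each value through the stop-api whenever the deepest stop state can lose hypervisor context. The bit value and state flag below are assumed for illustration:

/* cc -o lpcr lpcr.c && ./lpcr */
#include <stdio.h>
#include <stdint.h>

#define LPCR_PECE1 0x2000ULL			/* bit value assumed */
#define OPAL_PM_LOSE_FULL_CONTEXT 0x1

static uint64_t lpcr = 0xe000ULL;		/* made-up register image */
static int supported_cpuidle_states = OPAL_PM_LOSE_FULL_CONTEXT;

static void mtspr_lpcr(uint64_t v)
{
	lpcr = v;
	printf("LPCR <- 0x%llx\n", (unsigned long long)v);
}

static void slw_set_reg(uint64_t v)
{
	printf("stop-api LPCR <- 0x%llx\n", (unsigned long long)v);
}

/* Mirror of pnv_program_cpu_hotplug_lpcr(): program the live value,
 * and also the SLW image when full context can be lost in stop. */
static void program_lpcr(uint64_t v)
{
	mtspr_lpcr(v);
	if (supported_cpuidle_states & OPAL_PM_LOSE_FULL_CONTEXT)
		slw_set_reg(v);
}

int main(void)
{
	program_lpcr(lpcr & ~LPCR_PECE1);	/* going offline */
	program_lpcr(lpcr |  LPCR_PECE1);	/* coming back online */
	return 0;
}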
+ */ + lpcr_val = mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); __ppc64_runlatch_off(); @@ -428,6 +456,16 @@ __ppc64_runlatch_on(); + /* + * Re-enable decrementer interrupts in LPCR. + * + * Further, we want stop states to be woken up by decrementer + * for non-hotplug cases. So program the LPCR via stop api as + * well. + */ + lpcr_val = mfspr(SPRN_LPCR) | (u64)LPCR_PECE1; + pnv_program_cpu_hotplug_lpcr(cpu, lpcr_val); + return srr1; } #endif --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/npu-dma.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/npu-dma.c @@ -546,6 +546,12 @@ unsigned long pid = npu_context->mm->context.id; /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + /* * Loop over all the NPUs this process is active on and launch * an invalidate. */ @@ -576,12 +582,6 @@ } } - /* - * Unfortunately the nest mmu does not support flushing specific - * addresses so we have to flush the whole mm. - */ - flush_tlb_mm(npu_context->mm); - mmio_invalidate_wait(mmio_atsd_reg, flush); if (flush) /* Wait for the flush to complete */ --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/opal-hmi.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/opal-hmi.c @@ -30,6 +30,8 @@ #include #include +#include "powernv.h" + static int opal_hmi_handler_nb_init; struct OpalHmiEvtNode { struct list_head list; @@ -267,8 +269,6 @@ spin_unlock_irqrestore(&opal_hmi_evt_lock, flags); if (unrecoverable) { - int ret; - /* Pull all HMI events from OPAL before we panic. */ while (opal_get_msg(__pa(&msg), sizeof(msg)) == OPAL_SUCCESS) { u32 type; @@ -284,23 +284,7 @@ print_hmi_event_info(hmi_evt); } - /* - * Unrecoverable HMI exception. We need to inform BMC/OCC - * about this error so that it can collect relevant data - * for error analysis before rebooting. - */ - ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, - "Unrecoverable HMI exception"); - if (ret == OPAL_UNSUPPORTED) { - pr_emerg("Reboot type %d not supported\n", - OPAL_REBOOT_PLATFORM_ERROR); - } - - /* - * Fall through and panic if opal_cec_reboot2() returns - * OPAL_UNSUPPORTED. - */ - panic("Unrecoverable HMI exception"); + pnv_platform_error_reboot(NULL, "Unrecoverable HMI exception"); } } --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/opal-imc.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/opal-imc.c @@ -0,0 +1,226 @@ +/* + * OPAL IMC interface detection driver + * Supported on POWERNV platform + * + * Copyright (C) 2017 Madhavan Srinivasan, IBM Corporation. + * (C) 2017 Anju T Sudhakar, IBM Corporation. + * (C) 2017 Hemant K Shaw, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or later version. 
+ */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * imc_get_mem_addr_nest: Function to get nest counter memory region + * for each chip + */ +static int imc_get_mem_addr_nest(struct device_node *node, + struct imc_pmu *pmu_ptr, + u32 offset) +{ + int nr_chips = 0, i; + u64 *base_addr_arr, baddr; + u32 *chipid_arr; + + nr_chips = of_property_count_u32_elems(node, "chip-id"); + if (nr_chips <= 0) + return -ENODEV; + + base_addr_arr = kcalloc(nr_chips, sizeof(u64), GFP_KERNEL); + if (!base_addr_arr) + return -ENOMEM; + + chipid_arr = kcalloc(nr_chips, sizeof(u32), GFP_KERNEL); + if (!chipid_arr) + return -ENOMEM; + + if (of_property_read_u32_array(node, "chip-id", chipid_arr, nr_chips)) + goto error; + + if (of_property_read_u64_array(node, "base-addr", base_addr_arr, + nr_chips)) + goto error; + + pmu_ptr->mem_info = kcalloc(nr_chips, sizeof(struct imc_mem_info), + GFP_KERNEL); + if (!pmu_ptr->mem_info) + goto error; + + for (i = 0; i < nr_chips; i++) { + pmu_ptr->mem_info[i].id = chipid_arr[i]; + baddr = base_addr_arr[i] + offset; + pmu_ptr->mem_info[i].vbase = phys_to_virt(baddr); + } + + pmu_ptr->imc_counter_mmaped = true; + kfree(base_addr_arr); + kfree(chipid_arr); + return 0; + +error: + kfree(pmu_ptr->mem_info); + kfree(base_addr_arr); + kfree(chipid_arr); + return -1; +} + +/* + * imc_pmu_create : Takes the parent device which is the pmu unit, pmu_index + * and domain as the inputs. + * Allocates memory for the struct imc_pmu, sets up its domain, size and offsets + */ +static int imc_pmu_create(struct device_node *parent, int pmu_index, int domain) +{ + int ret = 0; + struct imc_pmu *pmu_ptr; + u32 offset; + + /* memory for pmu */ + pmu_ptr = kzalloc(sizeof(struct imc_pmu), GFP_KERNEL); + if (!pmu_ptr) + return -ENOMEM; + + /* Set the domain */ + pmu_ptr->domain = domain; + + ret = of_property_read_u32(parent, "size", &pmu_ptr->counter_mem_size); + if (ret) { + ret = -EINVAL; + goto free_pmu; + } + + if (!of_property_read_u32(parent, "offset", &offset)) { + if (imc_get_mem_addr_nest(parent, pmu_ptr, offset)) { + ret = -EINVAL; + goto free_pmu; + } + } + + /* Function to register IMC pmu */ + ret = init_imc_pmu(parent, pmu_ptr, pmu_index); + if (ret) + pr_err("IMC PMU %s Register failed\n", pmu_ptr->pmu.name); + + return 0; + +free_pmu: + kfree(pmu_ptr); + return ret; +} + +static void disable_nest_pmu_counters(void) +{ + int nid, cpu; + const struct cpumask *l_cpumask; + + get_online_cpus(); + for_each_online_node(nid) { + l_cpumask = cpumask_of_node(nid); + cpu = cpumask_first(l_cpumask); + opal_imc_counters_stop(OPAL_IMC_COUNTERS_NEST, + get_hard_smp_processor_id(cpu)); + } + put_online_cpus(); +} + +static void disable_core_pmu_counters(void) +{ + cpumask_t cores_map; + int cpu, rc; + + get_online_cpus(); + /* Disable the IMC Core functions */ + cores_map = cpu_online_cores_map(); + for_each_cpu(cpu, &cores_map) { + rc = opal_imc_counters_stop(OPAL_IMC_COUNTERS_CORE, + get_hard_smp_processor_id(cpu)); + if (rc) + pr_err("%s: Failed to stop Core (cpu = %d)\n", + __FUNCTION__, cpu); + } + put_online_cpus(); +} + +static int opal_imc_counters_probe(struct platform_device *pdev) +{ + struct device_node *imc_dev = pdev->dev.of_node; + int pmu_count = 0, domain; + u32 type; + + /* + * Check whether this is kdump kernel. If yes, force the engines to + * stop and return. 
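imc_get_mem_addr_nest() above pairs the chip-id and base-addr device-tree arrays element by element and applies the per-unit offset to each base. A userspace sketch of that mapping with made-up addresses; the kernel additionally runs each result through phys_to_virt():

/* cc -o nestmem nestmem.c && ./nestmem */
#include <stdio.h>
#include <stdint.h>

struct imc_mem_info {
	uint32_t id;
	uint64_t vbase;
};

int main(void)
{
	/* Stand-ins for the "chip-id" and "base-addr" DT arrays. */
	uint32_t chip_id[]   = { 0, 8 };
	uint64_t base_addr[] = { 0x200000000000ULL, 0x201000000000ULL };
	uint32_t offset = 0x180000;	/* per-unit "offset" property */
	struct imc_mem_info mem[2];

	/* One counter region per chip, each at base-addr[i] + offset. */
	for (int i = 0; i < 2; i++) {
		mem[i].id = chip_id[i];
		mem[i].vbase = base_addr[i] + offset;
		printf("chip %u: counters at 0x%llx\n",
		       mem[i].id, (unsigned long long)mem[i].vbase);
	}
	return 0;
}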
+ */ + if (is_kdump_kernel()) { + disable_nest_pmu_counters(); + disable_core_pmu_counters(); + return -ENODEV; + } + + for_each_compatible_node(imc_dev, NULL, IMC_DTB_UNIT_COMPAT) { + if (of_property_read_u32(imc_dev, "type", &type)) { + pr_warn("IMC Device without type property\n"); + continue; + } + + switch (type) { + case IMC_TYPE_CHIP: + domain = IMC_DOMAIN_NEST; + break; + case IMC_TYPE_CORE: + domain = IMC_DOMAIN_CORE; + break; + case IMC_TYPE_THREAD: + domain = IMC_DOMAIN_THREAD; + break; + default: + pr_warn("IMC Unknown Device type\n"); + domain = -1; + break; + } + + if (!imc_pmu_create(imc_dev, pmu_count, domain)) + pmu_count++; + } + + return 0; +} + +static void opal_imc_counters_shutdown(struct platform_device *pdev) +{ + /* + * This function only stops the engines, which is the bare minimum. + * TODO: Need to handle proper memory cleanup and pmu + * unregister. + */ + disable_nest_pmu_counters(); + disable_core_pmu_counters(); +} + +static const struct of_device_id opal_imc_match[] = { + { .compatible = IMC_DTB_COMPAT }, + {}, +}; + +static struct platform_driver opal_imc_driver = { + .driver = { + .name = "opal-imc-counters", + .of_match_table = opal_imc_match, + }, + .probe = opal_imc_counters_probe, + .shutdown = opal_imc_counters_shutdown, +}; + +builtin_platform_driver(opal_imc_driver); --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/opal-wrappers.S +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -310,3 +310,6 @@ OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); +OPAL_CALL(opal_imc_counters_init, OPAL_IMC_COUNTERS_INIT); +OPAL_CALL(opal_imc_counters_start, OPAL_IMC_COUNTERS_START); +OPAL_CALL(opal_imc_counters_stop, OPAL_IMC_COUNTERS_STOP); --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/opal.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/opal.c @@ -16,6 +16,7 @@ #include #include #include +#include #include #include #include @@ -25,11 +26,16 @@ #include #include #include +#include +#include +#include +#include #include #include #include #include +#include #include "powernv.h" @@ -436,10 +442,55 @@ return recovered; } +void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) +{ + /* + * This is mostly taken from kernel/panic.c, but tries to do + * relatively minimal work. Don't use delay functions (TB may + * be broken), don't crash dump (need to set a firmware log), + * don't run notifiers. We do want to get some information to + * the Linux console. + */ + console_verbose(); + bust_spinlocks(1); + pr_emerg("Hardware platform error: %s\n", msg); + if (regs) + show_regs(regs); + smp_send_stop(); + printk_safe_flush_on_panic(); + kmsg_dump(KMSG_DUMP_PANIC); + bust_spinlocks(0); + debug_locks_off(); + console_flush_on_panic(); + + /* + * Don't bother to shut things down because this will + * xstop the system. + */ + if (opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, msg) + == OPAL_UNSUPPORTED) { + pr_emerg("Reboot type %d not supported for %s\n", + OPAL_REBOOT_PLATFORM_ERROR, msg); + } + + /* + * We reached here. There can be three possibilities: + * 1. We are running on a firmware level that does not support + * opal_cec_reboot2() + * 2. We are running on a firmware level that does not support + * OPAL_REBOOT_PLATFORM_ERROR reboot type. + * 3. We are running on an FSP based system that does not need + * opal to trigger checkstop explicitly for error analysis.
+ * The FSP PRD component would have already got notified + * about this error through other channels. + */ + + ppc_md.restart(NULL); +} + int opal_machine_check(struct pt_regs *regs) { struct machine_check_event evt; - int ret; if (!get_mce_event(&evt, MCE_EVENT_RELEASE)) return 0; @@ -455,43 +506,7 @@ if (opal_recover_mce(regs, &evt)) return 1; - /* - * Unrecovered machine check, we are heading to panic path. - * - * We may have hit this MCE in very early stage of kernel - * initialization even before opal-prd has started running. If - * this is the case then this MCE error may go un-noticed or - * un-analyzed if we go down panic path. We need to inform - * BMC/OCC about this error so that they can collect relevant - * data for error analysis before rebooting. - * Use opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR) to do so. - * This function may not return on BMC based system. - */ - ret = opal_cec_reboot2(OPAL_REBOOT_PLATFORM_ERROR, - "Unrecoverable Machine Check exception"); - if (ret == OPAL_UNSUPPORTED) { - pr_emerg("Reboot type %d not supported\n", - OPAL_REBOOT_PLATFORM_ERROR); - } - - /* - * We reached here. There can be three possibilities: - * 1. We are running on a firmware level that do not support - * opal_cec_reboot2() - * 2. We are running on a firmware level that do not support - * OPAL_REBOOT_PLATFORM_ERROR reboot type. - * 3. We are running on FSP based system that does not need opal - * to trigger checkstop explicitly for error analysis. The FSP - * PRD component would have already got notified about this - * error through other channels. - * - * If hardware marked this as an unrecoverable MCE, we are - * going to panic anyway. Even if it didn't, it's not safe to - * continue at this point, so we should explicitly panic. - */ - - panic("PowerNV Unrecovered Machine Check"); - return 0; + pnv_platform_error_reboot(regs, "Unrecoverable Machine Check exception"); } /* Early hmi handler called in real mode. 
*/ @@ -720,6 +735,15 @@ of_platform_device_create(np, NULL, NULL); } +static void __init opal_imc_init_dev(void) +{ + struct device_node *np; + + np = of_find_compatible_node(NULL, NULL, IMC_DTB_COMPAT); + if (np) + of_platform_device_create(np, NULL, NULL); +} + static int kopald(void *unused) { unsigned long timeout = msecs_to_jiffies(opal_heartbeat) + 1; @@ -793,6 +817,9 @@ /* Setup a heartbeat thread if requested by OPAL */ opal_init_heartbeat(); + /* Detect In-Memory Collection counters and create devices */ + opal_imc_init_dev(); + /* Create leds platform devices */ leds = of_find_node_by_path("/ibm,opal/leds"); if (leds) { --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/powernv.h +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/powernv.h @@ -7,6 +7,8 @@ static inline void pnv_smp_init(void) { } #endif +extern void pnv_platform_error_reboot(struct pt_regs *regs, const char *msg) __noreturn; + struct pci_dev; #ifdef CONFIG_PCI --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/setup.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/setup.c @@ -36,13 +36,62 @@ #include #include #include +#include #include "powernv.h" +static void pnv_setup_rfi_flush(void) +{ + struct device_node *np, *fw_features; + enum l1d_flush_type type; + int enable; + + /* Default to fallback in case fw-features are not available */ + type = L1D_FLUSH_FALLBACK; + enable = 1; + + np = of_find_node_by_name(NULL, "ibm,opal"); + fw_features = of_get_child_by_name(np, "fw-features"); + of_node_put(np); + + if (fw_features) { + np = of_get_child_by_name(fw_features, "inst-l1d-flush-trig2"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_MTTRIG; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "inst-l1d-flush-ori30,30,0"); + if (np && of_property_read_bool(np, "enabled")) + type = L1D_FLUSH_ORI; + + of_node_put(np); + + /* Enable unless firmware says NOT to */ + enable = 2; + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-hv-1-to-0"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + + np = of_get_child_by_name(fw_features, "needs-l1d-flush-msr-pr-0-to-1"); + if (np && of_property_read_bool(np, "disabled")) + enable--; + + of_node_put(np); + of_node_put(fw_features); + } + + setup_rfi_flush(type, enable > 0); +} + static void __init pnv_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); + pnv_setup_rfi_flush(); + /* Initialize SMP */ pnv_smp_init(); @@ -272,7 +321,15 @@ #ifdef CONFIG_MEMORY_HOTPLUG_SPARSE static unsigned long pnv_memory_block_size(void) { - return 256UL * 1024 * 1024; + /* + * We map the kernel linear region with 1GB large pages on radix. For + * memory hot unplug to work our memory block size must be at least + * this size. + */ + if (radix_enabled()) + return 1UL * 1024 * 1024 * 1024; + else + return 256UL * 1024 * 1024; } #endif --- linux-azure-4.13.0.orig/arch/powerpc/platforms/powernv/smp.c +++ linux-azure-4.13.0/arch/powerpc/platforms/powernv/smp.c @@ -49,6 +49,13 @@ static void pnv_smp_setup_cpu(int cpu) { + /* + * P9 workaround for CI vector load (see traps.c), + * enable the corresponding HMI interrupt + */ + if (pvr_version_is(PVR_POWER9)) + mtspr(SPRN_HMEER, mfspr(SPRN_HMEER) | PPC_BIT(17)); + if (xive_enabled()) xive_smp_setup_cpu(); else if (cpu != boot_cpuid) @@ -164,12 +171,6 @@ if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - /* We don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1.
We keep PECE2 (and LPCR_PECE_HVEE on P9) - enabled as to let IPIs in. - */ - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) & ~(u64)LPCR_PECE1); - while (!generic_check_cpu_restart(cpu)) { /* * Clear IPI flag, since we don't handle IPIs while @@ -219,8 +220,6 @@ } - /* Re-enable decrementer interrupts */ - mtspr(SPRN_LPCR, mfspr(SPRN_LPCR) | LPCR_PECE1); DBG("CPU%d coming online...\n", cpu); } --- linux-azure-4.13.0.orig/arch/powerpc/platforms/ps3/setup.c +++ linux-azure-4.13.0/arch/powerpc/platforms/ps3/setup.c @@ -104,20 +104,6 @@ ps3_sys_manager_halt(); /* never returns */ } -static void ps3_panic(char *str) -{ - DBG("%s:%d %s\n", __func__, __LINE__, str); - - smp_send_stop(); - printk("\n"); - printk(" System does not reboot automatically.\n"); - printk(" Please press POWER button.\n"); - printk("\n"); - - while(1) - lv1_pause(1); -} - #if defined(CONFIG_FB_PS3) || defined(CONFIG_FB_PS3_MODULE) || \ defined(CONFIG_PS3_FLASH) || defined(CONFIG_PS3_FLASH_MODULE) static void __init prealloc(struct ps3_prealloc *p) @@ -269,7 +255,6 @@ .probe = ps3_probe, .setup_arch = ps3_setup_arch, .init_IRQ = ps3_init_IRQ, - .panic = ps3_panic, .get_boot_time = ps3_get_boot_time, .set_dabr = ps3_set_dabr, .calibrate_decr = ps3_calibrate_decr, --- linux-azure-4.13.0.orig/arch/powerpc/platforms/pseries/hotplug-memory.c +++ linux-azure-4.13.0/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -817,6 +817,9 @@ return -EINVAL; for (i = 0; i < num_lmbs && lmbs_to_add != lmbs_added; i++) { + if (lmbs[i].flags & DRCONF_MEM_ASSIGNED) + continue; + rc = dlpar_acquire_drc(lmbs[i].drc_index); if (rc) continue; @@ -859,6 +862,7 @@ lmbs[i].base_addr, lmbs[i].drc_index); lmbs[i].reserved = 0; } + rc = 0; } return rc; --- linux-azure-4.13.0.orig/arch/powerpc/platforms/pseries/mobility.c +++ linux-azure-4.13.0/arch/powerpc/platforms/pseries/mobility.c @@ -226,8 +226,10 @@ return -ENOENT; dn = dlpar_configure_connector(drc_index, parent_dn); - if (!dn) + if (!dn) { + of_node_put(parent_dn); return -ENOENT; + } rc = dlpar_attach_node(dn); if (rc) --- linux-azure-4.13.0.orig/arch/powerpc/platforms/pseries/setup.c +++ linux-azure-4.13.0/arch/powerpc/platforms/pseries/setup.c @@ -455,6 +455,38 @@ of_pci_check_probe_only(); } +static void pSeries_setup_rfi_flush(void) +{ + unsigned long character, behaviour, rc; + enum l1d_flush_type types; + bool enable; + + /* Enable by default */ + enable = true; + + rc = plpar_get_cpu_characteristics(&character, &behaviour); + if (rc == H_SUCCESS) { + types = L1D_FLUSH_NONE; + + if (character & H_GET_CPU_CHAR_CHAR_MTTRIG2_L1_FLUSH) + types |= L1D_FLUSH_MTTRIG; + if (character & H_GET_CPU_CHAR_CHAR_ORI30_L1_FLUSH) + types |= L1D_FLUSH_ORI; + + /* Use fallback if nothing set in hcall */ + if (types == L1D_FLUSH_NONE) + types = L1D_FLUSH_FALLBACK; + + if (!(behaviour & H_GET_CPU_CHAR_BEHAV_L1_FLUSH_LOW_PRIV)) + enable = false; + } else { + /* Default to fallback in case the hcall is not available */ + types = L1D_FLUSH_FALLBACK; + } + + setup_rfi_flush(types, enable); +} + static void __init pSeries_setup_arch(void) { set_arch_panic_timeout(10, ARCH_PANIC_TIMEOUT); @@ -472,6 +504,8 @@ fwnmi_init(); + pSeries_setup_rfi_flush(); + /* By default, only probe PCI (can be overridden by rtas_pci) */ pci_add_flags(PCI_PROBE_ONLY); @@ -722,7 +756,6 @@ .pcibios_fixup = pSeries_final_fixup, .restart = rtas_restart, .halt = rtas_halt, - .panic = rtas_os_term, .get_boot_time = rtas_get_boot_time, .get_rtc_time = rtas_get_rtc_time, .set_rtc_time = rtas_set_rtc_time, --- 
linux-azure-4.13.0.orig/arch/powerpc/sysdev/xive/common.c +++ linux-azure-4.13.0/arch/powerpc/sysdev/xive/common.c @@ -447,7 +447,7 @@ int cpu, first, num, i; /* Pick up a starting point CPU in the mask based on fuzz */ - num = cpumask_weight(mask); + num = min_t(int, cpumask_weight(mask), nr_cpu_ids); first = fuzz % num; /* Locate it */ --- linux-azure-4.13.0.orig/arch/s390/Kconfig +++ linux-azure-4.13.0/arch/s390/Kconfig @@ -519,6 +519,23 @@ If unsure, say Y. +config KERNEL_NOBP + def_bool n + prompt "Enable modified branch prediction for the kernel by default" + help + If this option is selected the kernel will switch to a modified + branch prediction mode if the firmware interface is available. + The modified branch prediction mode improves the behaviour in + regard to speculative execution. + + With the option enabled the kernel parameter "nobp=0" or "nospec" + can be used to run the kernel in the normal branch prediction mode. + + With the option disabled the modified branch prediction mode is + enabled with the "nobp=1" kernel parameter. + + If unsure, say N. + endmenu menu "Memory setup" @@ -925,3 +942,11 @@ need this. endmenu + +config KMSG_IDS + def_bool y + prompt "Kernel message numbers" + help + Select this option if you want to include a message number to the + prefix for kernel messages issued by the s390 architecture and + driver code. See "Documentation/s390/kmsg.txt" for more details. --- linux-azure-4.13.0.orig/arch/s390/configs/zfcpdump_defconfig +++ linux-azure-4.13.0/arch/s390/configs/zfcpdump_defconfig @@ -59,6 +59,7 @@ # CONFIG_NETWORK_FILESYSTEMS is not set CONFIG_PRINTK_TIME=y CONFIG_DEBUG_INFO=y +CONFIG_DEBUG_FS=y CONFIG_DEBUG_KERNEL=y CONFIG_PANIC_ON_OOPS=y # CONFIG_SCHED_DEBUG is not set --- linux-azure-4.13.0.orig/arch/s390/include/asm/alternative.h +++ linux-azure-4.13.0/arch/s390/include/asm/alternative.h @@ -0,0 +1,149 @@ +#ifndef _ASM_S390_ALTERNATIVE_H +#define _ASM_S390_ALTERNATIVE_H + +#ifndef __ASSEMBLY__ + +#include +#include +#include + +struct alt_instr { + s32 instr_offset; /* original instruction */ + s32 repl_offset; /* offset to replacement instruction */ + u16 facility; /* facility bit set for replacement */ + u8 instrlen; /* length of original instruction */ + u8 replacementlen; /* length of new instruction */ +} __packed; + +extern void apply_alternative_instructions(void); +extern void apply_alternatives(struct alt_instr *start, struct alt_instr *end); + +/* + * |661: |662: |6620 |663: + * +-----------+---------------------+ + * | oldinstr | oldinstr_padding | + * | +----------+----------+ + * | | | | + * | | >6 bytes |6/4/2 nops| + * | |6 bytes jg-----------> + * +-----------+---------------------+ + * ^^ static padding ^^ + * + * .altinstr_replacement section + * +---------------------+-----------+ + * |6641: |6651: + * | alternative instr 1 | + * +-----------+---------+- - - - - -+ + * |6642: |6652: | + * | alternative instr 2 | padding + * +---------------------+- - - - - -+ + * ^ runtime ^ + * + * .altinstructions section + * +---------------------------------+ + * | alt_instr entries for each | + * | alternative instr | + * +---------------------------------+ + */ + +#define b_altinstr(num) "664"#num +#define e_altinstr(num) "665"#num + +#define e_oldinstr_pad_end "663" +#define oldinstr_len "662b-661b" +#define oldinstr_total_len e_oldinstr_pad_end"b-661b" +#define altinstr_len(num) e_altinstr(num)"b-"b_altinstr(num)"b" +#define oldinstr_pad_len(num) \ + "-(((" altinstr_len(num) ")-(" oldinstr_len ")) > 0) * " \ + "((" 
altinstr_len(num) ")-(" oldinstr_len "))" + +#define INSTR_LEN_SANITY_CHECK(len) \ + ".if " len " > 254\n" \ + "\t.error \"cpu alternatives does not support instructions " \ + "blocks > 254 bytes\"\n" \ + ".endif\n" \ + ".if (" len ") %% 2\n" \ + "\t.error \"cpu alternatives instructions length is odd\"\n" \ + ".endif\n" + +#define OLDINSTR_PADDING(oldinstr, num) \ + ".if " oldinstr_pad_len(num) " > 6\n" \ + "\tjg " e_oldinstr_pad_end "f\n" \ + "6620:\n" \ + "\t.fill (" oldinstr_pad_len(num) " - (6620b-662b)) / 2, 2, 0x0700\n" \ + ".else\n" \ + "\t.fill " oldinstr_pad_len(num) " / 6, 6, 0xc0040000\n" \ + "\t.fill " oldinstr_pad_len(num) " %% 6 / 4, 4, 0x47000000\n" \ + "\t.fill " oldinstr_pad_len(num) " %% 6 %% 4 / 2, 2, 0x0700\n" \ + ".endif\n" + +#define OLDINSTR(oldinstr, num) \ + "661:\n\t" oldinstr "\n662:\n" \ + OLDINSTR_PADDING(oldinstr, num) \ + e_oldinstr_pad_end ":\n" \ + INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define OLDINSTR_2(oldinstr, num1, num2) \ + "661:\n\t" oldinstr "\n662:\n" \ + ".if " altinstr_len(num1) " < " altinstr_len(num2) "\n" \ + OLDINSTR_PADDING(oldinstr, num2) \ + ".else\n" \ + OLDINSTR_PADDING(oldinstr, num1) \ + ".endif\n" \ + e_oldinstr_pad_end ":\n" \ + INSTR_LEN_SANITY_CHECK(oldinstr_len) + +#define ALTINSTR_ENTRY(facility, num) \ + "\t.long 661b - .\n" /* old instruction */ \ + "\t.long " b_altinstr(num)"b - .\n" /* alt instruction */ \ + "\t.word " __stringify(facility) "\n" /* facility bit */ \ + "\t.byte " oldinstr_total_len "\n" /* source len */ \ + "\t.byte " altinstr_len(num) "\n" /* alt instruction len */ + +#define ALTINSTR_REPLACEMENT(altinstr, num) /* replacement */ \ + b_altinstr(num)":\n\t" altinstr "\n" e_altinstr(num) ":\n" \ + INSTR_LEN_SANITY_CHECK(altinstr_len(num)) + +/* alternative assembly primitive: */ +#define ALTERNATIVE(oldinstr, altinstr, facility) \ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(altinstr, 1) \ + ".popsection\n" \ + OLDINSTR(oldinstr, 1) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(facility, 1) \ + ".popsection\n" + +#define ALTERNATIVE_2(oldinstr, altinstr1, facility1, altinstr2, facility2)\ + ".pushsection .altinstr_replacement, \"ax\"\n" \ + ALTINSTR_REPLACEMENT(altinstr1, 1) \ + ALTINSTR_REPLACEMENT(altinstr2, 2) \ + ".popsection\n" \ + OLDINSTR_2(oldinstr, 1, 2) \ + ".pushsection .altinstructions,\"a\"\n" \ + ALTINSTR_ENTRY(facility1, 1) \ + ALTINSTR_ENTRY(facility2, 2) \ + ".popsection\n" + +/* + * Alternative instructions for different CPU types or capabilities. + * + * This allows to use optimized instructions even on generic binary + * kernels. + * + * oldinstr is padded with jump and nops at compile time if altinstr is + * longer. altinstr is padded with jump and nops at run-time during patching. + * + * For non barrier like inlines please define new variants + * without volatile and memory clobber. + */ +#define alternative(oldinstr, altinstr, facility) \ + asm volatile(ALTERNATIVE(oldinstr, altinstr, facility) : : : "memory") + +#define alternative_2(oldinstr, altinstr1, facility1, altinstr2, facility2) \ + asm volatile(ALTERNATIVE_2(oldinstr, altinstr1, facility1, \ + altinstr2, facility2) ::: "memory") + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_S390_ALTERNATIVE_H */ --- linux-azure-4.13.0.orig/arch/s390/include/asm/barrier.h +++ linux-azure-4.13.0/arch/s390/include/asm/barrier.h @@ -7,6 +7,8 @@ #ifndef __ASM_BARRIER_H #define __ASM_BARRIER_H +#include + /* * Force strict CPU ordering. 
* And yes, this is required on UP too when we're talking @@ -22,6 +24,14 @@ #define mb() do { asm volatile(__ASM_BARRIER : : : "memory"); } while (0) +static inline void osb(void) +{ + asm volatile( + ALTERNATIVE("", ".long 0xb2e8f000", 81) + : : : "memory"); +} +#define osb osb + #define rmb() barrier() #define wmb() barrier() #define dma_rmb() mb() --- linux-azure-4.13.0.orig/arch/s390/include/asm/facility.h +++ linux-azure-4.13.0/arch/s390/include/asm/facility.h @@ -14,6 +14,24 @@ #define MAX_FACILITY_BIT (sizeof(((struct lowcore *)0)->stfle_fac_list) * 8) +static inline void __set_facility(unsigned long nr, void *facilities) +{ + unsigned char *ptr = (unsigned char *) facilities; + + if (nr >= MAX_FACILITY_BIT) + return; + ptr[nr >> 3] |= 0x80 >> (nr & 7); +} + +static inline void __clear_facility(unsigned long nr, void *facilities) +{ + unsigned char *ptr = (unsigned char *) facilities; + + if (nr >= MAX_FACILITY_BIT) + return; + ptr[nr >> 3] &= ~(0x80 >> (nr & 7)); +} + static inline int __test_facility(unsigned long nr, void *facilities) { unsigned char *ptr; --- linux-azure-4.13.0.orig/arch/s390/include/asm/futex.h +++ linux-azure-4.13.0/arch/s390/include/asm/futex.h @@ -21,17 +21,12 @@ : "0" (-EFAULT), "d" (oparg), "a" (uaddr), \ "m" (*uaddr) : "cc"); -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, newval, ret; load_kernel_asce(); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; pagefault_disable(); switch (op) { @@ -60,17 +55,9 @@ } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/s390/include/asm/kvm_host.h +++ linux-azure-4.13.0/arch/s390/include/asm/kvm_host.h @@ -210,7 +210,8 @@ __u16 ipa; /* 0x0056 */ __u32 ipb; /* 0x0058 */ __u32 scaoh; /* 0x005c */ - __u8 reserved60; /* 0x0060 */ +#define FPF_BPBC 0x20 + __u8 fpf; /* 0x0060 */ #define ECB_GS 0x40 #define ECB_TE 0x10 #define ECB_SRSI 0x04 --- linux-azure-4.13.0.orig/arch/s390/include/asm/lowcore.h +++ linux-azure-4.13.0/arch/s390/include/asm/lowcore.h @@ -154,7 +154,8 @@ __u8 pad_0x0e20[0x0f00-0x0e20]; /* 0x0e20 */ /* Extended facility list */ - __u64 stfle_fac_list[32]; /* 0x0f00 */ + __u64 stfle_fac_list[16]; /* 0x0f00 */ + __u64 alt_stfle_fac_list[16]; /* 0x0f80 */ __u8 pad_0x1000[0x11b0-0x1000]; /* 0x1000 */ /* Pointer to the machine check extended save area */ --- linux-azure-4.13.0.orig/arch/s390/include/asm/mmu.h +++ linux-azure-4.13.0/arch/s390/include/asm/mmu.h @@ -5,6 +5,7 @@ #include typedef struct { + spinlock_t lock; cpumask_t cpu_attach_mask; atomic_t flush_count; unsigned int flush_mm; @@ -27,6 +28,7 @@ } mm_context_t; #define INIT_MM_CONTEXT(name) \ + .context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \ .context.pgtable_lock = \ __SPIN_LOCK_UNLOCKED(name.context.pgtable_lock), \ .context.pgtable_list = 
LIST_HEAD_INIT(name.context.pgtable_list), \ --- linux-azure-4.13.0.orig/arch/s390/include/asm/mmu_context.h +++ linux-azure-4.13.0/arch/s390/include/asm/mmu_context.h @@ -12,10 +12,12 @@ #include #include #include +#include static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { + spin_lock_init(&mm->context.lock); spin_lock_init(&mm->context.pgtable_lock); INIT_LIST_HEAD(&mm->context.pgtable_list); spin_lock_init(&mm->context.gmap_lock); @@ -102,7 +104,6 @@ if (prev == next) return; cpumask_set_cpu(cpu, &next->context.cpu_attach_mask); - cpumask_set_cpu(cpu, mm_cpumask(next)); /* Clear old ASCE by loading the kernel ASCE. */ __ctl_load(S390_lowcore.kernel_asce, 1, 1); __ctl_load(S390_lowcore.kernel_asce, 7, 7); @@ -120,9 +121,8 @@ preempt_disable(); while (atomic_read(&mm->context.flush_count)) cpu_relax(); - - if (mm->context.flush_mm) - __tlb_flush_mm(mm); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(mm)); + __tlb_flush_mm_lazy(mm); preempt_enable(); } set_fs(current->thread.mm_segment); @@ -135,33 +135,8 @@ struct mm_struct *next) { switch_mm(prev, next, current); + cpumask_set_cpu(smp_processor_id(), mm_cpumask(next)); set_user_asce(next); } -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) -{ -} - -static inline void arch_exit_mmap(struct mm_struct *mm) -{ -} - -static inline void arch_unmap(struct mm_struct *mm, - struct vm_area_struct *vma, - unsigned long start, unsigned long end) -{ -} - -static inline void arch_bprm_mm_init(struct mm_struct *mm, - struct vm_area_struct *vma) -{ -} - -static inline bool arch_vma_access_permitted(struct vm_area_struct *vma, - bool write, bool execute, bool foreign) -{ - /* by default, allow everything */ - return true; -} #endif /* __S390_MMU_CONTEXT_H */ --- linux-azure-4.13.0.orig/arch/s390/include/asm/pgtable.h +++ linux-azure-4.13.0/arch/s390/include/asm/pgtable.h @@ -505,7 +505,7 @@ * In the case that a guest uses storage keys * faults should no longer be backed by zero pages */ -#define mm_forbids_zeropage mm_use_skey +#define mm_forbids_zeropage mm_has_pgste static inline int mm_use_skey(struct mm_struct *mm) { #ifdef CONFIG_PGSTE @@ -1462,7 +1462,9 @@ static inline void pmdp_invalidate(struct vm_area_struct *vma, unsigned long addr, pmd_t *pmdp) { - pmdp_xchg_direct(vma->vm_mm, addr, pmdp, __pmd(_SEGMENT_ENTRY_EMPTY)); + pmd_t pmd = __pmd(pmd_val(*pmdp) | _SEGMENT_ENTRY_INVALID); + + pmdp_xchg_direct(vma->vm_mm, addr, pmdp, pmd); } #define __HAVE_ARCH_PMDP_SET_WRPROTECT --- linux-azure-4.13.0.orig/arch/s390/include/asm/processor.h +++ linux-azure-4.13.0/arch/s390/include/asm/processor.h @@ -88,6 +88,7 @@ extern const struct seq_operations cpuinfo_op; extern int sysctl_ieee_emulation_warnings; extern void execve_tail(void); +extern void __bpon(void); /* * User space process size: 2GB for 31 bit, 4TB or 8PT for 64 bit. --- linux-azure-4.13.0.orig/arch/s390/include/asm/tlbflush.h +++ linux-azure-4.13.0/arch/s390/include/asm/tlbflush.h @@ -43,23 +43,6 @@ * Flush TLB entries for a specific mm on all CPUs (in case gmap is used * this implicates multiple ASCEs!). 
*/ -static inline void __tlb_flush_full(struct mm_struct *mm) -{ - preempt_disable(); - atomic_inc(&mm->context.flush_count); - if (cpumask_equal(mm_cpumask(mm), cpumask_of(smp_processor_id()))) { - /* Local TLB flush */ - __tlb_flush_local(); - } else { - /* Global TLB flush */ - __tlb_flush_global(); - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); - } - atomic_dec(&mm->context.flush_count); - preempt_enable(); -} - static inline void __tlb_flush_mm(struct mm_struct *mm) { unsigned long gmap_asce; @@ -71,16 +54,18 @@ */ preempt_disable(); atomic_inc(&mm->context.flush_count); + /* Reset TLB flush mask */ + cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); + barrier(); gmap_asce = READ_ONCE(mm->context.gmap_asce); if (MACHINE_HAS_IDTE && gmap_asce != -1UL) { if (gmap_asce) __tlb_flush_idte(gmap_asce); __tlb_flush_idte(mm->context.asce); } else { - __tlb_flush_full(mm); + /* Global TLB flush */ + __tlb_flush_global(); } - /* Reset TLB flush mask */ - cpumask_copy(mm_cpumask(mm), &mm->context.cpu_attach_mask); atomic_dec(&mm->context.flush_count); preempt_enable(); } @@ -94,7 +79,6 @@ } #else #define __tlb_flush_global() __tlb_flush_local() -#define __tlb_flush_full(mm) __tlb_flush_local() /* * Flush TLB entries for a specific ASCE on all CPUs. @@ -112,10 +96,12 @@ static inline void __tlb_flush_mm_lazy(struct mm_struct * mm) { + spin_lock(&mm->context.lock); if (mm->context.flush_mm) { - __tlb_flush_mm(mm); mm->context.flush_mm = 0; + __tlb_flush_mm(mm); } + spin_unlock(&mm->context.lock); } /* --- linux-azure-4.13.0.orig/arch/s390/include/uapi/asm/kvm.h +++ linux-azure-4.13.0/arch/s390/include/uapi/asm/kvm.h @@ -221,6 +221,7 @@ #define KVM_SYNC_RICCB (1UL << 7) #define KVM_SYNC_FPRS (1UL << 8) #define KVM_SYNC_GSCB (1UL << 9) +#define KVM_SYNC_BPBC (1UL << 10) /* length and alignment of the sdnx as a power of two */ #define SDNXC 8 #define SDNXL (1UL << SDNXC) @@ -244,7 +245,9 @@ }; __u8 reserved[512]; /* for future vector expansion */ __u32 fpc; /* valid on KVM_SYNC_VRS or KVM_SYNC_FPRS */ - __u8 padding1[52]; /* riccb needs to be 64byte aligned */ + __u8 bpbc : 1; /* bp mode */ + __u8 reserved2 : 7; + __u8 padding1[51]; /* riccb needs to be 64byte aligned */ __u8 riccb[64]; /* runtime instrumentation controls block */ __u8 padding2[192]; /* sdnx needs to be 256byte aligned */ union { --- linux-azure-4.13.0.orig/arch/s390/kernel/Makefile +++ linux-azure-4.13.0/arch/s390/kernel/Makefile @@ -56,7 +56,7 @@ obj-y += debug.o irq.o ipl.o dis.o diag.o vdso.o als.o obj-y += sysinfo.o jump_label.o lgr.o os_info.o machine_kexec.o pgm_check.o obj-y += runtime_instr.o cache.o fpu.o dumpstack.o guarded_storage.o -obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o +obj-y += entry.o reipl.o relocate_kernel.o kdebugfs.o alternative.o extra-y += head.o head64.o vmlinux.lds @@ -83,3 +83,6 @@ # vdso obj-y += vdso64/ obj-$(CONFIG_COMPAT) += vdso32/ + +# kernel message catalog +obj-$(CONFIG_KMSG_IDS) += kmsg.o --- linux-azure-4.13.0.orig/arch/s390/kernel/alternative.c +++ linux-azure-4.13.0/arch/s390/kernel/alternative.c @@ -0,0 +1,147 @@ +#include +#include +#include + +#define MAX_PATCH_LEN (255 - 1) + +static int __initdata_or_module alt_instr_disabled; + +static int __init disable_alternative_instructions(char *str) +{ + alt_instr_disabled = 1; + return 0; +} + +early_param("noaltinstr", disable_alternative_instructions); + +static int __init nobp_setup_early(char *str) +{ + bool enabled; + int rc; + + rc = kstrtobool(str, &enabled); + if (rc) + 
return rc; + if (enabled && test_facility(82)) + __set_facility(82, S390_lowcore.alt_stfle_fac_list); + else + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + return 0; +} +early_param("nobp", nobp_setup_early); + +static int __init nospec_setup_early(char *str) +{ + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); + return 0; +} +early_param("nospec", nospec_setup_early); + +static int __init nogmb_setup_early(char *str) +{ + __clear_facility(81, S390_lowcore.alt_stfle_fac_list); + return 0; +} +early_param("nogmb", nogmb_setup_early); + +struct brcl_insn { + u16 opc; + s32 disp; +} __packed; + +static u16 __initdata_or_module nop16 = 0x0700; +static u32 __initdata_or_module nop32 = 0x47000000; +static struct brcl_insn __initdata_or_module nop48 = { + 0xc004, 0 +}; + +static const void * __initdata_or_module nops[] = { + &nop16, + &nop32, + &nop48 +}; + +static void __init_or_module add_jump_padding(void *insns, unsigned int len) +{ + struct brcl_insn brcl = { + 0xc0f4, + len / 2 + }; + + memcpy(insns, &brcl, sizeof(brcl)); + insns += sizeof(brcl); + len -= sizeof(brcl); + + while (len > 0) { + memcpy(insns, &nop16, 2); + insns += 2; + len -= 2; + } +} + +static void __init_or_module add_padding(void *insns, unsigned int len) +{ + if (len > 6) + add_jump_padding(insns, len); + else if (len >= 2) + memcpy(insns, nops[len / 2 - 1], len); +} + +static void __init_or_module __apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + struct alt_instr *a; + u8 *instr, *replacement; + u8 insnbuf[MAX_PATCH_LEN]; + + /* + * The scan order should be from start to end. A later scanned + * alternative code can overwrite previously scanned alternative code. + */ + for (a = start; a < end; a++) { + int insnbuf_sz = 0; + + instr = (u8 *)&a->instr_offset + a->instr_offset; + replacement = (u8 *)&a->repl_offset + a->repl_offset; + + if (!__test_facility(a->facility, + S390_lowcore.alt_stfle_fac_list)) + continue; + + if (unlikely(a->instrlen % 2 || a->replacementlen % 2)) { + WARN_ONCE(1, "cpu alternatives instructions length is " + "odd, skipping patching\n"); + continue; + } + + memcpy(insnbuf, replacement, a->replacementlen); + insnbuf_sz = a->replacementlen; + + if (a->instrlen > a->replacementlen) { + add_padding(insnbuf + a->replacementlen, + a->instrlen - a->replacementlen); + insnbuf_sz += a->instrlen - a->replacementlen; + } + + s390_kernel_write(instr, insnbuf, insnbuf_sz); + } +} + +void __init_or_module apply_alternatives(struct alt_instr *start, + struct alt_instr *end) +{ + if (!alt_instr_disabled) + __apply_alternatives(start, end); +} + +extern struct alt_instr __alt_instructions[], __alt_instructions_end[]; +void __init apply_alternative_instructions(void) +{ + pr_info("gmb %s", + (__test_facility(81, S390_lowcore.alt_stfle_fac_list)) ? + "enabled" : "disabled"); + pr_info("nobp %s", + (__test_facility(82, S390_lowcore.alt_stfle_fac_list)) ? 
+ "enabled" : "disabled"); + apply_alternatives(__alt_instructions, __alt_instructions_end); +} --- linux-azure-4.13.0.orig/arch/s390/kernel/early.c +++ linux-azure-4.13.0/arch/s390/kernel/early.c @@ -327,6 +327,11 @@ { stfle(S390_lowcore.stfle_fac_list, ARRAY_SIZE(S390_lowcore.stfle_fac_list)); + memcpy(S390_lowcore.alt_stfle_fac_list, + S390_lowcore.stfle_fac_list, + sizeof(S390_lowcore.alt_stfle_fac_list)); + if (!IS_ENABLED(CONFIG_KERNEL_NOBP)) + __clear_facility(82, S390_lowcore.alt_stfle_fac_list); } static __init void detect_diag9c(void) --- linux-azure-4.13.0.orig/arch/s390/kernel/entry.S +++ linux-azure-4.13.0/arch/s390/kernel/entry.S @@ -157,6 +157,34 @@ tm off+\addr, \mask .endm + .macro BPOFF + .pushsection .altinstr_replacement, "ax" +660: .long 0xb2e8c000 + .popsection +661: .long 0x47000000 + .pushsection .altinstructions, "a" + .long 661b - . + .long 660b - . + .word 82 + .byte 4 + .byte 4 + .popsection + .endm + + .macro BPON + .pushsection .altinstr_replacement, "ax" +662: .long 0xb2e8d000 + .popsection +663: .long 0x47000000 + .pushsection .altinstructions, "a" + .long 663b - . + .long 662b - . + .word 82 + .byte 4 + .byte 4 + .popsection + .endm + .section .kprobes.text, "ax" .Ldummy: /* @@ -169,6 +197,11 @@ */ nop 0 +ENTRY(__bpon) + .globl __bpon + BPON + br %r14 + /* * Scheduler resume function, called by switch_to * gpr2 = (task_struct *) prev @@ -225,8 +258,11 @@ jnz .Lsie_skip TSTMSK __LC_CPU_FLAGS,_CIF_FPU jo .Lsie_skip # exit if fp/vx regs changed + BPON .Lsie_entry: sie 0(%r14) +.Lsie_exit: + BPOFF .Lsie_skip: ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE lctlg %c1,%c1,__LC_USER_ASCE # load primary asce @@ -272,6 +308,7 @@ stpt __LC_SYNC_ENTER_TIMER .Lsysc_stmg: stmg %r8,%r15,__LC_SAVE_AREA_SYNC + BPOFF lg %r12,__LC_CURRENT lghi %r13,__TASK_thread lghi %r14,_PIF_SYSCALL @@ -320,6 +357,7 @@ lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + BPON .Lsysc_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER @@ -513,6 +551,7 @@ ENTRY(pgm_check_handler) stpt __LC_SYNC_ENTER_TIMER + BPOFF stmg %r8,%r15,__LC_SAVE_AREA_SYNC lg %r10,__LC_LAST_BREAK lg %r12,__LC_CURRENT @@ -521,12 +560,15 @@ tmhh %r8,0x0001 # test problem state bit jnz 2f # -> fault in user space #if IS_ENABLED(CONFIG_KVM) - # cleanup critical section for sie64a + # cleanup critical section for program checks in sie64a lgr %r14,%r9 slg %r14,BASED(.Lsie_critical_start) clg %r14,BASED(.Lsie_critical_length) jhe 0f - brasl %r14,.Lcleanup_sie + lg %r14,__SF_EMPTY(%r15) # get control block pointer + ni __SIE_PROG0C+3(%r14),0xfe # no longer in SIE + lctlg %c1,%c1,__LC_USER_ASCE # load primary asce + larl %r9,sie_exit # skip forward to sie_exit #endif 0: tmhh %r8,0x4000 # PER bit set in old PSW ? jnz 1f # -> enabled, can't be a double fault @@ -605,6 +647,7 @@ ENTRY(io_int_handler) STCK __LC_INT_CLOCK stpt __LC_ASYNC_ENTER_TIMER + BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT larl %r13,cleanup_critical @@ -645,9 +688,13 @@ lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) + tm __PT_PSW+1(%r11),0x01 # returning to user ? 
+ jno .Lio_exit_kernel + BPON .Lio_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER +.Lio_exit_kernel: lmg %r11,%r15,__PT_R11(%r11) lpswe __LC_RETURN_PSW .Lio_done: @@ -810,6 +857,7 @@ ENTRY(ext_int_handler) STCK __LC_INT_CLOCK stpt __LC_ASYNC_ENTER_TIMER + BPOFF stmg %r8,%r15,__LC_SAVE_AREA_ASYNC lg %r12,__LC_CURRENT larl %r13,cleanup_critical @@ -848,6 +896,7 @@ .Lpsw_idle_stcctm: #endif oi __LC_CPU_FLAGS+7,_CIF_ENABLED_WAIT + BPON STCK __CLOCK_IDLE_ENTER(%r2) stpt __TIMER_IDLE_ENTER(%r2) .Lpsw_idle_lpsw: @@ -948,6 +997,7 @@ */ ENTRY(mcck_int_handler) STCK __LC_MCCK_CLOCK + BPOFF la %r1,4095 # revalidate r1 spt __LC_CPU_TIMER_SAVE_AREA-4095(%r1) # revalidate cpu timer lmg %r0,%r15,__LC_GPREGS_SAVE_AREA-4095(%r1)# revalidate gprs @@ -1003,6 +1053,7 @@ mvc __LC_RETURN_MCCK_PSW(16),__PT_PSW(%r11) # move return PSW tm __LC_RETURN_MCCK_PSW+1,0x01 # returning to user ? jno 0f + BPON stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER 0: lmg %r11,%r15,__PT_R11(%r11) @@ -1317,7 +1368,7 @@ .Lsie_crit_mcck_start: .quad .Lsie_entry .Lsie_crit_mcck_length: - .quad .Lsie_skip - .Lsie_entry + .quad .Lsie_exit - .Lsie_entry #endif .section .rodata, "a" --- linux-azure-4.13.0.orig/arch/s390/kernel/ipl.c +++ linux-azure-4.13.0/arch/s390/kernel/ipl.c @@ -564,6 +564,7 @@ static void __ipl_run(void *unused) { + __bpon(); diag308(DIAG308_LOAD_CLEAR, NULL); if (MACHINE_IS_VM) __cpcmd("IPL", NULL, 0, NULL); --- linux-azure-4.13.0.orig/arch/s390/kernel/kmsg.c +++ linux-azure-4.13.0/arch/s390/kernel/kmsg.c @@ -0,0 +1,114 @@ +/* + * Message printing with message catalog prefixes. + * + * Copyright IBM Corp. 2012 + */ + +#include +#include +#include +#include +#include + +static inline u32 __printk_jhash(const void *key, u32 length) +{ + u32 a, b, c, len; + const u8 *k; + u8 zk[12]; + + a = b = 0x9e3779b9; + c = 0; + for (len = length + 12, k = key; len >= 12; len -= 12, k += 12) { + if (len >= 24) { + a += k[0] | k[1] << 8 | k[2] << 16 | k[3] << 24; + b += k[4] | k[5] << 8 | k[6] << 16 | k[7] << 24; + c += k[8] | k[9] << 8 | k[10] << 16 | k[11] << 24; + } else { + memset(zk, 0, 12); + memcpy(zk, k, len - 12); + a += zk[0] | zk[1] << 8 | zk[2] << 16 | zk[3] << 24; + b += zk[4] | zk[5] << 8 | zk[6] << 16 | zk[7] << 24; + c += (u32) zk[8] << 8; + c += (u32) zk[9] << 16; + c += (u32) zk[10] << 24; + c += length; + } + a -= b + c; a ^= (c>>13); + b -= a + c; b ^= (a<<8); + c -= a + b; c ^= (b>>13); + a -= b + c; a ^= (c>>12); + b -= a + c; b ^= (a<<16); + c -= a + b; c ^= (b>>5); + a -= b + c; a ^= (c>>3); + b -= a + c; b ^= (a<<10); + c -= a + b; c ^= (b>>15); + } + return c; +} + +/** + * __jhash_string - calculate the six digit jhash of a string + * @str: string to calculate the jhash + */ +unsigned long long __jhash_string(const char *str) +{ + return __printk_jhash(str, strlen(str)) & 0xffffff; +} +EXPORT_SYMBOL(__jhash_string); + +static int __dev_printk_hash(const char *level, const struct device *dev, + struct va_format *vaf) +{ + if (!dev) + return printk("%s(NULL device *): %pV", level, vaf); + + return printk("%s%s.%06x: %pV", level, dev_driver_string(dev), + __printk_jhash(vaf->fmt, strlen(vaf->fmt)) & 0xffffff, + vaf); +} + +int dev_printk_hash(const char *level, const struct device *dev, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + r = __dev_printk_hash(level, dev, &vaf); + va_end(args); + + return r; +} +EXPORT_SYMBOL(dev_printk_hash); + +#define define_dev_printk_hash_level(func, kern_level) \ +int func(const struct device *dev, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + int r; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __dev_printk_hash(kern_level, dev, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ +EXPORT_SYMBOL(func); + +define_dev_printk_hash_level(dev_emerg_hash, KERN_EMERG); +define_dev_printk_hash_level(dev_alert_hash, KERN_ALERT); +define_dev_printk_hash_level(dev_crit_hash, KERN_CRIT); +define_dev_printk_hash_level(dev_err_hash, KERN_ERR); +define_dev_printk_hash_level(dev_warn_hash, KERN_WARNING); +define_dev_printk_hash_level(dev_notice_hash, KERN_NOTICE); +define_dev_printk_hash_level(_dev_info_hash, KERN_INFO); --- linux-azure-4.13.0.orig/arch/s390/kernel/module.c +++ linux-azure-4.13.0/arch/s390/kernel/module.c @@ -31,6 +31,7 @@ #include #include #include +#include #if 0 #define DEBUGP printk @@ -429,6 +430,18 @@ const Elf_Shdr *sechdrs, struct module *me) { + const Elf_Shdr *s; + char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; + + for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { + if (!strcmp(".altinstructions", secstrings + s->sh_name)) { + /* patch .altinstructions */ + void *aseg = (void *)s->sh_addr; + + apply_alternatives(aseg, aseg + s->sh_size); + } + } + jump_label_apply_nops(me); return 0; } --- linux-azure-4.13.0.orig/arch/s390/kernel/perf_cpum_sf.c +++ linux-azure-4.13.0/arch/s390/kernel/perf_cpum_sf.c @@ -823,9 +823,12 @@ } /* Check online status of the CPU to which the event is pinned */ - if ((unsigned int)event->cpu >= nr_cpumask_bits || - (event->cpu >= 0 && !cpu_online(event->cpu))) - return -ENODEV; + if (event->cpu >= 0) { + if ((unsigned int)event->cpu >= nr_cpumask_bits) + return -ENODEV; + if (!cpu_online(event->cpu)) + return -ENODEV; + } /* Force reset of idle/hv excludes regardless of what the * user requested. 
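[Editor's example] For readers who want to verify the six-digit message IDs that the kmsg catalog code above prints, the suffix is simply the Jenkins-style hash of the printk format string truncated to 24 bits. Below is a minimal userspace sketch of the same computation (standard C only; the main() driver and the sample format string are illustrative additions, not part of the patch):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef uint8_t u8;
typedef uint32_t u32;

/* Same mixing loop as __printk_jhash() in arch/s390/kernel/kmsg.c above */
static u32 printk_jhash(const void *key, u32 length)
{
	u32 a, b, c, len;
	const u8 *k;
	u8 zk[12];

	a = b = 0x9e3779b9;
	c = 0;
	for (len = length + 12, k = key; len >= 12; len -= 12, k += 12) {
		if (len >= 24) {
			a += k[0] | k[1] << 8 | k[2] << 16 | k[3] << 24;
			b += k[4] | k[5] << 8 | k[6] << 16 | k[7] << 24;
			c += k[8] | k[9] << 8 | k[10] << 16 | k[11] << 24;
		} else {
			/* final partial block: zero-pad, fold in the length */
			memset(zk, 0, 12);
			memcpy(zk, k, len - 12);
			a += zk[0] | zk[1] << 8 | zk[2] << 16 | zk[3] << 24;
			b += zk[4] | zk[5] << 8 | zk[6] << 16 | zk[7] << 24;
			c += (u32) zk[8] << 8;
			c += (u32) zk[9] << 16;
			c += (u32) zk[10] << 24;
			c += length;
		}
		a -= b + c; a ^= (c >> 13);
		b -= a + c; b ^= (a << 8);
		c -= a + b; c ^= (b >> 13);
		a -= b + c; a ^= (c >> 12);
		b -= a + c; b ^= (a << 16);
		c -= a + b; c ^= (b >> 5);
		a -= b + c; a ^= (c >> 3);
		b -= a + c; b ^= (a << 10);
		c -= a + b; c ^= (b >> 15);
	}
	return c;
}

int main(void)
{
	const char *fmt = "example message %d\n";	/* sample format string */

	/* dev_printk_hash() emits this value as "<driver>.%06x: ..." */
	printf("%06x\n", printk_jhash(fmt, strlen(fmt)) & 0xffffff);
	return 0;
}

Compiled with any C compiler, this should reproduce the number printed between the driver name and the colon on the s390 console, since __jhash_string() and __dev_printk_hash() both mask the hash with 0xffffff.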
--- linux-azure-4.13.0.orig/arch/s390/kernel/setup.c +++ linux-azure-4.13.0/arch/s390/kernel/setup.c @@ -66,6 +66,7 @@ #include #include #include +#include #include "entry.h" /* @@ -338,7 +339,9 @@ lc->preempt_count = S390_lowcore.preempt_count; lc->stfl_fac_list = S390_lowcore.stfl_fac_list; memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, - MAX_FACILITY_BIT/8); + sizeof(lc->stfle_fac_list)); + memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, + sizeof(lc->alt_stfle_fac_list)); if (MACHINE_HAS_VX || MACHINE_HAS_GS) { unsigned long bits, size; @@ -951,6 +954,8 @@ conmode_default(); set_preferred_console(); + apply_alternative_instructions(); + /* Setup zfcpdump support */ setup_zfcpdump(); --- linux-azure-4.13.0.orig/arch/s390/kernel/smp.c +++ linux-azure-4.13.0/arch/s390/kernel/smp.c @@ -280,7 +280,9 @@ __ctl_store(lc->cregs_save_area, 0, 15); save_access_regs((unsigned int *) lc->access_regs_save_area); memcpy(lc->stfle_fac_list, S390_lowcore.stfle_fac_list, - MAX_FACILITY_BIT/8); + sizeof(lc->stfle_fac_list)); + memcpy(lc->alt_stfle_fac_list, S390_lowcore.alt_stfle_fac_list, + sizeof(lc->alt_stfle_fac_list)); } static void pcpu_attach_task(struct pcpu *pcpu, struct task_struct *tsk) @@ -293,7 +295,10 @@ lc->lpp = LPP_MAGIC; lc->current_pid = tsk->pid; lc->user_timer = tsk->thread.user_timer; + lc->guest_timer = tsk->thread.guest_timer; lc->system_timer = tsk->thread.system_timer; + lc->hardirq_timer = tsk->thread.hardirq_timer; + lc->softirq_timer = tsk->thread.softirq_timer; lc->steal_timer = 0; } @@ -327,6 +332,7 @@ mem_assign_absolute(lc->restart_fn, (unsigned long) func); mem_assign_absolute(lc->restart_data, (unsigned long) data); mem_assign_absolute(lc->restart_source, source_cpu); + __bpon(); asm volatile( "0: sigp 0,%0,%2 # sigp restart to target cpu\n" " brc 2,0b # busy, try again\n" @@ -902,6 +908,7 @@ void __noreturn cpu_die(void) { idle_task_exit(); + __bpon(); pcpu_sigp_retry(pcpu_devices + smp_processor_id(), SIGP_STOP, 0); for (;;) ; } --- linux-azure-4.13.0.orig/arch/s390/kernel/vmlinux.lds.S +++ linux-azure-4.13.0/arch/s390/kernel/vmlinux.lds.S @@ -104,6 +104,29 @@ EXIT_DATA } + /* + * struct alt_inst entries. From the header (alternative.h): + * "Alternative instructions for different CPU types or capabilities" + * Think locking instructions on spinlocks. + * Note, that it is a part of __init region. + */ + . = ALIGN(8); + .altinstructions : { + __alt_instructions = .; + *(.altinstructions) + __alt_instructions_end = .; + } + + /* + * And here are the replacement instructions. The linker sticks + * them as binary blobs. The .altinstructions has enough data to + * get the address and the length of them to patch the kernel safely. + * Note, that it is a part of __init region. + */ + .altinstr_replacement : { + *(.altinstr_replacement) + } + /* early.c uses stsi, which requires page aligned data. */ . 
= ALIGN(PAGE_SIZE); INIT_DATA_SECTION(0x100) --- linux-azure-4.13.0.orig/arch/s390/kvm/kvm-s390.c +++ linux-azure-4.13.0/arch/s390/kvm/kvm-s390.c @@ -417,6 +417,9 @@ case KVM_CAP_S390_GS: r = test_facility(133); break; + case KVM_CAP_S390_BPB: + r = test_facility(82); + break; default: r = 0; } @@ -2124,6 +2127,8 @@ kvm_s390_set_prefix(vcpu, 0); if (test_kvm_facility(vcpu->kvm, 64)) vcpu->run->kvm_valid_regs |= KVM_SYNC_RICCB; + if (test_kvm_facility(vcpu->kvm, 82)) + vcpu->run->kvm_valid_regs |= KVM_SYNC_BPBC; if (test_kvm_facility(vcpu->kvm, 133)) vcpu->run->kvm_valid_regs |= KVM_SYNC_GSCB; /* fprs can be synchronized via vrs, even if the guest has no vx. With @@ -2265,6 +2270,7 @@ current->thread.fpu.fpc = 0; vcpu->arch.sie_block->gbea = 1; vcpu->arch.sie_block->pp = 0; + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; vcpu->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; kvm_clear_async_pf_completion_queue(vcpu); if (!kvm_s390_user_cpu_state_ctrl(vcpu->kvm)) @@ -3187,6 +3193,11 @@ vcpu->arch.sie_block->ecd |= ECD_HOSTREGMGMT; vcpu->arch.gs_enabled = 1; } + if ((kvm_run->kvm_dirty_regs & KVM_SYNC_BPBC) && + test_kvm_facility(vcpu->kvm, 82)) { + vcpu->arch.sie_block->fpf &= ~FPF_BPBC; + vcpu->arch.sie_block->fpf |= kvm_run->s.regs.bpbc ? FPF_BPBC : 0; + } save_access_regs(vcpu->arch.host_acrs); restore_access_regs(vcpu->run->s.regs.acrs); /* save host (userspace) fprs/vrs */ @@ -3233,6 +3244,7 @@ kvm_run->s.regs.pft = vcpu->arch.pfault_token; kvm_run->s.regs.pfs = vcpu->arch.pfault_select; kvm_run->s.regs.pfc = vcpu->arch.pfault_compare; + kvm_run->s.regs.bpbc = (vcpu->arch.sie_block->fpf & FPF_BPBC) == FPF_BPBC; save_access_regs(vcpu->run->s.regs.acrs); restore_access_regs(vcpu->arch.host_acrs); /* Save guest register state */ --- linux-azure-4.13.0.orig/arch/s390/kvm/sigp.c +++ linux-azure-4.13.0/arch/s390/kvm/sigp.c @@ -155,29 +155,26 @@ return rc; } -static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter) +static int __sigp_set_arch(struct kvm_vcpu *vcpu, u32 parameter, + u64 *status_reg) { - int rc; unsigned int i; struct kvm_vcpu *v; + bool all_stopped = true; - switch (parameter & 0xff) { - case 0: - rc = SIGP_CC_NOT_OPERATIONAL; - break; - case 1: - case 2: - kvm_for_each_vcpu(i, v, vcpu->kvm) { - v->arch.pfault_token = KVM_S390_PFAULT_TOKEN_INVALID; - kvm_clear_async_pf_completion_queue(v); - } - - rc = SIGP_CC_ORDER_CODE_ACCEPTED; - break; - default: - rc = -EOPNOTSUPP; + kvm_for_each_vcpu(i, v, vcpu->kvm) { + if (v == vcpu) + continue; + if (!is_vcpu_stopped(v)) + all_stopped = false; } - return rc; + + *status_reg &= 0xffffffff00000000UL; + + /* Reject set arch order, with czam we're always in z/Arch mode. */ + *status_reg |= (all_stopped ? 
SIGP_STATUS_INVALID_PARAMETER : + SIGP_STATUS_INCORRECT_STATE); + return SIGP_CC_STATUS_STORED; } static int __sigp_set_prefix(struct kvm_vcpu *vcpu, struct kvm_vcpu *dst_vcpu, @@ -446,7 +443,8 @@ switch (order_code) { case SIGP_SET_ARCHITECTURE: vcpu->stat.instruction_sigp_arch++; - rc = __sigp_set_arch(vcpu, parameter); + rc = __sigp_set_arch(vcpu, parameter, + &vcpu->run->s.regs.gprs[r1]); break; default: rc = handle_sigp_dst(vcpu, order_code, cpu_addr, --- linux-azure-4.13.0.orig/arch/s390/kvm/vsie.c +++ linux-azure-4.13.0/arch/s390/kvm/vsie.c @@ -226,6 +226,12 @@ memcpy(scb_o->gcr, scb_s->gcr, 128); scb_o->pp = scb_s->pp; + /* branch prediction */ + if (test_kvm_facility(vcpu->kvm, 82)) { + scb_o->fpf &= ~FPF_BPBC; + scb_o->fpf |= scb_s->fpf & FPF_BPBC; + } + /* interrupt intercept */ switch (scb_s->icptcode) { case ICPT_PROGI: @@ -268,6 +274,7 @@ scb_s->ecb3 = 0; scb_s->ecd = 0; scb_s->fac = 0; + scb_s->fpf = 0; rc = prepare_cpuflags(vcpu, vsie_page); if (rc) @@ -327,6 +334,9 @@ prefix_unmapped(vsie_page); scb_s->ecb |= scb_o->ecb & ECB_TE; } + /* branch prediction */ + if (test_kvm_facility(vcpu->kvm, 82)) + scb_s->fpf |= scb_o->fpf & FPF_BPBC; /* SIMD */ if (test_kvm_facility(vcpu->kvm, 129)) { scb_s->eca |= scb_o->eca & ECA_VX; --- linux-azure-4.13.0.orig/arch/s390/mm/gmap.c +++ linux-azure-4.13.0/arch/s390/mm/gmap.c @@ -2121,6 +2121,37 @@ } /* + * Remove all empty zero pages from the mapping for lazy refaulting + * - This must be called after mm->context.has_pgste is set, to avoid + * future creation of zero pages + * - This must be called after THP was enabled + */ +static int __zap_zero_pages(pmd_t *pmd, unsigned long start, + unsigned long end, struct mm_walk *walk) +{ + unsigned long addr; + + for (addr = start; addr != end; addr += PAGE_SIZE) { + pte_t *ptep; + spinlock_t *ptl; + + ptep = pte_offset_map_lock(walk->mm, pmd, addr, &ptl); + if (is_zero_pfn(pte_pfn(*ptep))) + ptep_xchg_direct(walk->mm, addr, ptep, __pte(_PAGE_INVALID)); + pte_unmap_unlock(ptep, ptl); + } + return 0; +} + +static inline void zap_zero_pages(struct mm_struct *mm) +{ + struct mm_walk walk = { .pmd_entry = __zap_zero_pages }; + + walk.mm = mm; + walk_page_range(0, TASK_SIZE, &walk); +} + +/* * switch on pgstes for its userspace process (for kvm) */ int s390_enable_sie(void) @@ -2137,6 +2168,7 @@ mm->context.has_pgste = 1; /* split thp mappings and disable thp for future mappings */ thp_split_mm(mm); + zap_zero_pages(mm); up_write(&mm->mmap_sem); return 0; } @@ -2149,13 +2181,6 @@ static int __s390_enable_skey(pte_t *pte, unsigned long addr, unsigned long next, struct mm_walk *walk) { - /* - * Remove all zero page mappings, - * after establishing a policy to forbid zero page mappings - * following faults for that page will get fresh anonymous pages - */ - if (is_zero_pfn(pte_pfn(*pte))) - ptep_xchg_direct(walk->mm, addr, pte, __pte(_PAGE_INVALID)); /* Clear storage key */ ptep_zap_key(walk->mm, addr, pte); return 0; --- linux-azure-4.13.0.orig/arch/s390/mm/gup.c +++ linux-azure-4.13.0/arch/s390/mm/gup.c @@ -56,13 +56,12 @@ static inline int gup_huge_pmd(pmd_t *pmdp, pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { - unsigned long mask, result; struct page *head, *page; + unsigned long mask; int refs; - result = write ? 0 : _SEGMENT_ENTRY_PROTECT; - mask = result | _SEGMENT_ENTRY_INVALID; - if ((pmd_val(pmd) & mask) != result) + mask = (write ? 
_SEGMENT_ENTRY_PROTECT : 0) | _SEGMENT_ENTRY_INVALID; + if ((pmd_val(pmd) & mask) != 0) return 0; VM_BUG_ON(!pfn_valid(pmd_val(pmd) >> PAGE_SHIFT)); --- linux-azure-4.13.0.orig/arch/s390/mm/init.c +++ linux-azure-4.13.0/arch/s390/mm/init.c @@ -166,17 +166,43 @@ } #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + unsigned long zone_start_pfn, zone_end_pfn, nr_pages; unsigned long start_pfn = PFN_DOWN(start); unsigned long size_pages = PFN_DOWN(size); - int rc; + pg_data_t *pgdat = NODE_DATA(nid); + struct zone *zone; + int rc, i; rc = vmem_add_mapping(start, size); if (rc) return rc; - rc = __add_pages(nid, start_pfn, size_pages, want_memblock); + for (i = 0; i < MAX_NR_ZONES; i++) { + zone = pgdat->node_zones + i; + if (zone_idx(zone) != ZONE_MOVABLE) { + /* Add range within existing zone limits, if possible */ + zone_start_pfn = zone->zone_start_pfn; + zone_end_pfn = zone->zone_start_pfn + + zone->spanned_pages; + } else { + /* Add remaining range to ZONE_MOVABLE */ + zone_start_pfn = start_pfn; + zone_end_pfn = start_pfn + size_pages; + } + if (start_pfn < zone_start_pfn || start_pfn >= zone_end_pfn) + continue; + nr_pages = (start_pfn + size_pages > zone_end_pfn) ? + zone_end_pfn - start_pfn : size_pages; + rc = __add_pages(nid, zone, start_pfn, nr_pages, !for_device); + if (rc) + break; + start_pfn += nr_pages; + size_pages -= nr_pages; + if (!size_pages) + break; + } if (rc) vmem_remove_mapping(start, size); return rc; --- linux-azure-4.13.0.orig/arch/s390/tools/gen_facilities.c +++ linux-azure-4.13.0/arch/s390/tools/gen_facilities.c @@ -80,6 +80,7 @@ 78, /* enhanced-DAT 2 */ 130, /* instruction-execution-protection */ 131, /* enhanced-SOP 2 and side-effect */ + 138, /* configuration z/architecture mode (czam) */ 146, /* msa extension 8 */ -1 /* END */ } --- linux-azure-4.13.0.orig/arch/sh/include/asm/futex.h +++ linux-azure-4.13.0/arch/sh/include/asm/futex.h @@ -27,21 +27,12 @@ return atomic_futex_op_cmpxchg_inatomic(uval, uaddr, oldval, newval); } -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - u32 oparg = (encoded_op << 8) >> 20; - u32 cmparg = (encoded_op << 20) >> 20; u32 oldval, newval, prev; int ret; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); do { @@ -80,17 +71,8 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = ((int)oldval < (int)cmparg); break; - case FUTEX_OP_CMP_GE: ret = ((int)oldval >= (int)cmparg); break; - case FUTEX_OP_CMP_LE: ret = ((int)oldval <= (int)cmparg); break; - case FUTEX_OP_CMP_GT: ret = ((int)oldval > (int)cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; return ret; } --- linux-azure-4.13.0.orig/arch/sh/mm/init.c +++ linux-azure-4.13.0/arch/sh/mm/init.c @@ -485,14 +485,20 @@ #endif #ifdef CONFIG_MEMORY_HOTPLUG -int arch_add_memory(int nid, u64 start, u64 size, bool want_memblock) +int arch_add_memory(int nid, u64 start, u64 size, bool for_device) { + pg_data_t *pgdat; unsigned long start_pfn = PFN_DOWN(start); unsigned long nr_pages = size 
>> PAGE_SHIFT; int ret; + pgdat = NODE_DATA(nid); + /* We only have ZONE_NORMAL, so this is easy.. */ - ret = __add_pages(nid, start_pfn, nr_pages, want_memblock); + ret = __add_pages(nid, pgdat->node_zones + + zone_for_memory(nid, start, size, ZONE_NORMAL, + for_device), + start_pfn, nr_pages, !for_device); if (unlikely(ret)) printk("%s: Failed, __add_pages() == %d\n", __func__, ret); --- linux-azure-4.13.0.orig/arch/sparc/Kconfig +++ linux-azure-4.13.0/arch/sparc/Kconfig @@ -44,7 +44,6 @@ select ARCH_HAS_SG_CHAIN select CPU_NO_EFFICIENT_FFS select LOCKDEP_SMALL if LOCKDEP - select ARCH_WANT_RELAX_ORDER config SPARC32 def_bool !64BIT --- linux-azure-4.13.0.orig/arch/sparc/include/asm/futex_64.h +++ linux-azure-4.13.0/arch/sparc/include/asm/futex_64.h @@ -29,22 +29,14 @@ : "r" (uaddr), "r" (oparg), "i" (-EFAULT) \ : "memory") -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (unlikely(!access_ok(VERIFY_WRITE, uaddr, sizeof(u32)))) - return -EFAULT; if (unlikely((((unsigned long) uaddr) & 0x3UL))) return -EINVAL; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - pagefault_disable(); switch (op) { @@ -69,17 +61,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: ret = (oldval == cmparg); break; - case FUTEX_OP_CMP_NE: ret = (oldval != cmparg); break; - case FUTEX_OP_CMP_LT: ret = (oldval < cmparg); break; - case FUTEX_OP_CMP_GE: ret = (oldval >= cmparg); break; - case FUTEX_OP_CMP_LE: ret = (oldval <= cmparg); break; - case FUTEX_OP_CMP_GT: ret = (oldval > cmparg); break; - default: ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/tile/include/asm/futex.h +++ linux-azure-4.13.0/arch/tile/include/asm/futex.h @@ -106,12 +106,9 @@ lock = __atomic_hashed_lock((int __force *)uaddr) #endif -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, u32 oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int uninitialized_var(val), ret; __futex_prolog(); @@ -119,12 +116,6 @@ /* The 32-bit futex code makes this assumption, so validate it here. 
*/ BUILD_BUG_ON(sizeof(atomic_t) != sizeof(int)); - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { case FUTEX_OP_SET: @@ -148,30 +139,9 @@ } pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (val == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (val != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (val < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (val >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (val <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (val > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = val; + return ret; } --- linux-azure-4.13.0.orig/arch/um/include/asm/Kbuild +++ linux-azure-4.13.0/arch/um/include/asm/Kbuild @@ -1,4 +1,5 @@ generic-y += barrier.h +generic-y += bpf_perf_event.h generic-y += bug.h generic-y += clkdev.h generic-y += current.h --- linux-azure-4.13.0.orig/arch/um/include/asm/mmu_context.h +++ linux-azure-4.13.0/arch/um/include/asm/mmu_context.h @@ -15,9 +15,10 @@ /* * Needed since we do not use the asm-generic/mm_hooks.h: */ -static inline void arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { uml_setup_stubs(mm); + return 0; } extern void arch_exit_mmap(struct mm_struct *mm); static inline void arch_unmap(struct mm_struct *mm, --- linux-azure-4.13.0.orig/arch/um/include/asm/unwind.h +++ linux-azure-4.13.0/arch/um/include/asm/unwind.h @@ -0,0 +1,8 @@ +#ifndef _ASM_UML_UNWIND_H +#define _ASM_UML_UNWIND_H + +static inline void +unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} + +#endif /* _ASM_UML_UNWIND_H */ --- linux-azure-4.13.0.orig/arch/unicore32/include/asm/mmu_context.h +++ linux-azure-4.13.0/arch/unicore32/include/asm/mmu_context.h @@ -81,9 +81,10 @@ } \ } while (0) -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, + struct mm_struct *mm) { + return 0; } static inline void arch_unmap(struct mm_struct *mm, --- linux-azure-4.13.0.orig/arch/x86/Kconfig +++ linux-azure-4.13.0/arch/x86/Kconfig @@ -73,7 +73,6 @@ select ARCH_USE_QUEUED_RWLOCKS select ARCH_USE_QUEUED_SPINLOCKS select ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH - select ARCH_WANT_FRAME_POINTERS select ARCH_WANTS_DYNAMIC_TASK_STRUCT select ARCH_WANTS_THP_SWAP if X86_64 select BUILDTIME_EXTABLE_SORT @@ -88,6 +87,7 @@ select GENERIC_CLOCKEVENTS_MIN_ADJUST select GENERIC_CMOS_UPDATE select GENERIC_CPU_AUTOPROBE + select GENERIC_CPU_VULNERABILITIES select GENERIC_EARLY_IOREMAP select GENERIC_FIND_FIRST_BIT select GENERIC_IOMAP @@ -107,7 +107,7 @@ select HAVE_ARCH_AUDITSYSCALL select HAVE_ARCH_HUGE_VMAP if X86_64 || X86_PAE select HAVE_ARCH_JUMP_LABEL - select HAVE_ARCH_KASAN if X86_64 && SPARSEMEM_VMEMMAP + select HAVE_ARCH_KASAN if X86_64 select HAVE_ARCH_KGDB select HAVE_ARCH_KMEMCHECK select HAVE_ARCH_MMAP_RND_BITS if MMU @@ -158,6 +158,7 @@ select HAVE_MEMBLOCK select HAVE_MEMBLOCK_NODE_MAP select HAVE_MIXED_BREAKPOINTS_REGS + select HAVE_MOD_ARCH_SPECIFIC select HAVE_NMI select HAVE_OPROFILE select HAVE_OPTPROBES @@ -168,7 +169,7 @@ select HAVE_PERF_REGS select HAVE_PERF_USER_STACK_DUMP select HAVE_REGS_AND_STACK_ACCESS_API - select HAVE_RELIABLE_STACKTRACE if X86_64 && FRAME_POINTER && STACK_VALIDATION + select HAVE_RELIABLE_STACKTRACE if X86_64 && 
UNWINDER_FRAME_POINTER && STACK_VALIDATION select HAVE_STACK_VALIDATION if X86_64 select HAVE_SYSCALL_TRACEPOINTS select HAVE_UNSTABLE_SCHED_CLOCK @@ -300,7 +301,6 @@ config KASAN_SHADOW_OFFSET hex depends on KASAN - default 0xdff8000000000000 if X86_5LEVEL default 0xdffffc0000000000 config HAVE_INTEL_TXT @@ -425,16 +425,29 @@ def_bool y depends on X86_GOLDFISH -config INTEL_RDT_A - bool "Intel Resource Director Technology Allocation support" +config RETPOLINE + bool "Avoid speculative indirect branches in kernel" + default y + help + Compile kernel with the retpoline compiler options to guard against + kernel-to-user data leaks by avoiding speculative indirect + branches. Requires a compiler with -mindirect-branch=thunk-extern + support for full protection. The kernel may run slower. + + Without compiler support, at least indirect branches in assembler + code are eliminated. Since this includes the syscall entry path, + it is not entirely pointless. + +config INTEL_RDT + bool "Intel Resource Director Technology support" default n depends on X86 && CPU_SUP_INTEL select KERNFS help - Select to enable resource allocation which is a sub-feature of - Intel Resource Director Technology(RDT). More information about - RDT can be found in the Intel x86 Architecture Software - Developer Manual. + Select to enable resource allocation and monitoring which are + sub-features of Intel Resource Director Technology(RDT). More + information about RDT can be found in the Intel x86 + Architecture Software Developer Manual. Say N if unsure. @@ -924,7 +937,8 @@ config NR_CPUS int "Maximum number of CPUs" if SMP && !MAXSMP range 2 8 if SMP && X86_32 && !X86_BIGSMP - range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK + range 2 64 if SMP && X86_32 && X86_BIGSMP + range 2 512 if SMP && !MAXSMP && !CPUMASK_OFFSTACK && X86_64 range 2 8192 if SMP && !MAXSMP && CPUMASK_OFFSTACK && X86_64 default "1" if !SMP default "8192" if MAXSMP @@ -1836,6 +1850,28 @@ If unsure, say N. +config EFI_SECURE_BOOT_LOCK_DOWN + def_bool n + depends on EFI + prompt "Lock down the kernel when UEFI Secure Boot is enabled" + ---help--- + UEFI Secure Boot provides a mechanism for ensuring that the firmware + will only load signed bootloaders and kernels. Certain use cases may + also require that all kernel modules also be signed and that + userspace is prevented from directly changing the running kernel + image. Say Y here to automatically lock down the kernel when a + system boots with UEFI Secure Boot enabled. + +config EFI_ALLOW_SECURE_BOOT_EXIT + def_bool n + depends on EFI_SECURE_BOOT_LOCK_DOWN && MAGIC_SYSRQ + select ALLOW_LOCKDOWN_LIFT + prompt "Allow secure boot mode to be exited with SysRq+x on a keyboard" + ---help--- + Allow secure boot mode to be exited and the kernel lockdown lifted by + typing SysRq+x on a keyboard attached to the system (not permitted + through procfs). + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" @@ -2809,6 +2845,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "drivers/firmware/Kconfig" source "fs/Kconfig" --- linux-azure-4.13.0.orig/arch/x86/Kconfig.debug +++ linux-azure-4.13.0/arch/x86/Kconfig.debug @@ -305,8 +305,6 @@ Some of these sanity checks may slow down kernel entries and exits or otherwise impact performance. - This is currently used to help test NMI code. - If unsure, say N. 
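
The RETPOLINE help text above compresses a lot of mechanics into two sentences. A hedged illustration of what -mindirect-branch=thunk-extern does to an ordinary indirect call -- this is the textbook retpoline sequence, not a verbatim copy of this tree's asm/nospec-branch.h:

    typedef int (*handler_t)(int);

    int dispatch(handler_t h, int arg)
    {
            /*
             * Without retpoline the compiler emits:   call *%rax
             * With thunk-extern it emits instead:     call __x86_indirect_thunk_rax
             *
             * where the out-of-line thunk behaves like:
             *
             *   __x86_indirect_thunk_rax:
             *           call  1f            # push a safe return address
             *   2:      pause               # speculation is trapped here...
             *           lfence              # ...and serialized dead
             *           jmp   2b
             *   1:      mov   %rax, (%rsp)  # overwrite the return address
             *           ret                 # architectural jump to *%rax
             *
             * The return-stack predictor aims speculation at the
             * pause/lfence loop, so a poisoned indirect-branch predictor
             * can never steer transient execution.
             */
            return h(arg);
    }
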
config DEBUG_NMI_SELFTEST @@ -358,4 +356,63 @@ The current power state can be read from /sys/kernel/debug/punit_atom/dev_power_state +choice + prompt "Choose kernel unwinder" + default UNWINDER_ORC if X86_64 + default UNWINDER_FRAME_POINTER if X86_32 + ---help--- + This determines which method will be used for unwinding kernel stack + traces for panics, oopses, bugs, warnings, perf, /proc//stack, + livepatch, lockdep, and more. + +config UNWINDER_ORC + bool "ORC unwinder" + depends on X86_64 + select STACK_VALIDATION + ---help--- + This option enables the ORC (Oops Rewind Capability) unwinder for + unwinding kernel stack traces. It uses a custom data format which is + a simplified version of the DWARF Call Frame Information standard. + + This unwinder is more accurate across interrupt entry frames than the + frame pointer unwinder. It can also enable a 5-10% performance + improvement across the entire kernel if CONFIG_FRAME_POINTER is + disabled. + + Enabling this option will increase the kernel's runtime memory usage + by roughly 2-4MB, depending on your kernel config. + +config UNWINDER_FRAME_POINTER + bool "Frame pointer unwinder" + select FRAME_POINTER + ---help--- + This option enables the frame pointer unwinder for unwinding kernel + stack traces. + + The unwinder itself is fast and it uses less RAM than the ORC + unwinder, but the kernel text size will grow by ~3% and the kernel's + overall performance will degrade by roughly 5-10%. + + This option is recommended if you want to use the livepatch + consistency model, as this is currently the only way to get a + reliable stack trace (CONFIG_HAVE_RELIABLE_STACKTRACE). + +config UNWINDER_GUESS + bool "Guess unwinder" + depends on EXPERT + ---help--- + This option enables the "guess" unwinder for unwinding kernel stack + traces. It scans the stack and reports every kernel text address it + finds. Some of the addresses it reports may be incorrect. + + While this option often produces false positives, it can still be + useful in many cases. Unlike the other unwinders, it has no runtime + overhead. 
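
The trade-offs in the choice above follow directly from how each unwinder works. The frame-pointer variant is a linked-list walk of saved %rbp values; a minimal sketch, assuming only the layout that frame-pointer prologues (push %rbp; mov %rsp, %rbp) guarantee, plus the kernel_text_address() helper:

    struct stack_frame {
            struct stack_frame *next_frame;   /* caller's saved %rbp */
            unsigned long return_address;     /* saved %rip above it */
    };

    static void fp_unwind(unsigned long bp, void (*emit)(unsigned long))
    {
            struct stack_frame *frame = (struct stack_frame *)bp;

            /* Follow the %rbp chain until it leaves kernel text. */
            while (frame && kernel_text_address(frame->return_address)) {
                    emit(frame->return_address);
                    frame = frame->next_frame;
            }
    }

ORC removes the need for the saved %rbp (hence the text-size and performance win) by looking up, for any instruction address, where the previous frame lives in out-of-band tables generated by objtool.
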
+ +endchoice + +config FRAME_POINTER + depends on !UNWINDER_ORC && !UNWINDER_GUESS + bool + endmenu --- linux-azure-4.13.0.orig/arch/x86/Makefile +++ linux-azure-4.13.0/arch/x86/Makefile @@ -235,6 +235,14 @@ KBUILD_CFLAGS += $(mflags-y) KBUILD_AFLAGS += $(mflags-y) +# Avoid indirect branches in kernel to deal with Spectre +ifdef CONFIG_RETPOLINE + RETPOLINE_CFLAGS += $(call cc-option,-mindirect-branch=thunk-extern -mindirect-branch-register) + ifneq ($(RETPOLINE_CFLAGS),) + KBUILD_CFLAGS += $(RETPOLINE_CFLAGS) -DRETPOLINE + endif +endif + archscripts: scripts_basic $(Q)$(MAKE) $(build)=arch/x86/tools relocs --- linux-azure-4.13.0.orig/arch/x86/boot/compressed/eboot.c +++ linux-azure-4.13.0/arch/x86/boot/compressed/eboot.c @@ -14,6 +14,7 @@ #include #include #include +#include #include "../string.h" #include "eboot.h" @@ -606,6 +607,31 @@ } } +#define MEMORY_ONLY_RESET_CONTROL_GUID \ + EFI_GUID (0xe20939be, 0x32d4, 0x41be, 0xa1, 0x50, 0x89, 0x7f, 0x85, 0xd4, 0x98, 0x29) + +static void enable_reset_attack_mitigation(void) +{ + static const efi_guid_t var_guid = MEMORY_ONLY_RESET_CONTROL_GUID; + static const efi_char16_t MemoryOverwriteRequestControl_name[] = { + 'M', 'e', 'm', 'o', 'r', 'y', + 'O', 'v', 'e', 'r', 'w', 'r', 'i', 't', 'e', + 'R', 'e', 'q', 'u', 'e', 's', 't', + 'C', 'o', 'n', 't', 'r', 'o', 'l', + 0 + }; + u8 val = 1; + + /* Ignore the return value here - there's not really a lot we can do */ + efi_call_runtime(set_variable, + (efi_char16_t *)MemoryOverwriteRequestControl_name, + (efi_guid_t *)&var_guid, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + sizeof(val), val); +} + /* * Because the x86 boot code expects to be passed a boot_params we * need to create one ourselves (usually the bootloader would create @@ -990,6 +1016,11 @@ else setup_boot_services32(efi_early); + /* Ask the firmware to clear memory if we don't have a clean shutdown */ + enable_reset_attack_mitigation(); + + sanitize_boot_params(boot_params); + /* * If the boot loader gave us a value for secure_boot then we use that, * otherwise we ask the BIOS. --- linux-azure-4.13.0.orig/arch/x86/boot/compressed/pagetable.c +++ linux-azure-4.13.0/arch/x86/boot/compressed/pagetable.c @@ -15,6 +15,9 @@ #define __pa(x) ((unsigned long)(x)) #define __va(x) ((void *)((unsigned long)(x))) +/* No PAGE_TABLE_ISOLATION support needed either: */ +#undef CONFIG_PAGE_TABLE_ISOLATION + #include "misc.h" /* These actually do the work of building the kernel identity maps. 
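
For comparison with the boot-stub call in the eboot.c hunk above, the same MemoryOverwriteRequestControl request can be made from userspace through efivarfs, where the payload must be prefixed with a 4-byte attributes word. A hedged sketch -- the GUID is restated from MEMORY_ONLY_RESET_CONTROL_GUID above, and on many systems the variable's immutable flag must first be cleared with chattr -i:

    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            const char *path = "/sys/firmware/efi/efivars/"
                    "MemoryOverwriteRequestControl-e20939be-32d4-41be-a150-897f85d49829";
            uint32_t attrs = 0x1 | 0x2 | 0x4;  /* NON_VOLATILE | BOOTSERVICE | RUNTIME */
            uint8_t buf[5];
            FILE *f = fopen(path, "wb");

            if (!f)
                    return 1;
            memcpy(buf, &attrs, sizeof(attrs));
            buf[4] = 1;                        /* ask firmware to wipe RAM on reset */
            if (fwrite(buf, sizeof(buf), 1, f) != 1)
                    return 1;
            return fclose(f) ? 1 : 0;
    }
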
*/ --- linux-azure-4.13.0.orig/arch/x86/boot/video-vga.c +++ linux-azure-4.13.0/arch/x86/boot/video-vga.c @@ -190,7 +190,7 @@ vga_set_vertical_end(60*8); } -static int vga_set_mode(struct mode_info *mode) +static int __attribute__((optimize("no-jump-tables"))) vga_set_mode(struct mode_info *mode) { /* Set the basic mode */ vga_set_basic_mode(); --- linux-azure-4.13.0.orig/arch/x86/configs/tiny.config +++ linux-azure-4.13.0/arch/x86/configs/tiny.config @@ -1,3 +1,5 @@ CONFIG_NOHIGHMEM=y # CONFIG_HIGHMEM4G is not set # CONFIG_HIGHMEM64G is not set +CONFIG_UNWINDER_GUESS=y +# CONFIG_UNWINDER_FRAME_POINTER is not set --- linux-azure-4.13.0.orig/arch/x86/configs/x86_64_defconfig +++ linux-azure-4.13.0/arch/x86/configs/x86_64_defconfig @@ -299,6 +299,7 @@ # CONFIG_DEBUG_RODATA_TEST is not set CONFIG_DEBUG_BOOT_PARAMS=y CONFIG_OPTIMIZE_INLINING=y +CONFIG_UNWINDER_ORC=y CONFIG_SECURITY=y CONFIG_SECURITY_NETWORK=y CONFIG_SECURITY_SELINUX=y --- linux-azure-4.13.0.orig/arch/x86/crypto/aesni-intel_asm.S +++ linux-azure-4.13.0/arch/x86/crypto/aesni-intel_asm.S @@ -32,6 +32,7 @@ #include #include #include +#include /* * The following macros are used to move an (un)aligned 16 byte value to/from @@ -2884,7 +2885,7 @@ pxor INC, STATE4 movdqu IV, 0x30(OUTP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x00(OUTP), INC pxor INC, STATE1 @@ -2929,7 +2930,7 @@ _aesni_gf128mul_x_ble() movups IV, (IVP) - call *%r11 + CALL_NOSPEC %r11 movdqu 0x40(OUTP), INC pxor INC, STATE1 --- linux-azure-4.13.0.orig/arch/x86/crypto/camellia-aesni-avx-asm_64.S +++ linux-azure-4.13.0/arch/x86/crypto/camellia-aesni-avx-asm_64.S @@ -17,6 +17,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1227,7 +1228,7 @@ vpxor 14 * 16(%rax), %xmm15, %xmm14; vpxor 15 * 16(%rax), %xmm15, %xmm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 16), %rsp; --- linux-azure-4.13.0.orig/arch/x86/crypto/camellia-aesni-avx2-asm_64.S +++ linux-azure-4.13.0/arch/x86/crypto/camellia-aesni-avx2-asm_64.S @@ -12,6 +12,7 @@ #include #include +#include #define CAMELLIA_TABLE_BYTE_LEN 272 @@ -1343,7 +1344,7 @@ vpxor 14 * 32(%rax), %ymm15, %ymm14; vpxor 15 * 32(%rax), %ymm15, %ymm15; - call *%r9; + CALL_NOSPEC %r9; addq $(16 * 32), %rsp; --- linux-azure-4.13.0.orig/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +++ linux-azure-4.13.0/arch/x86/crypto/crc32c-pcl-intel-asm_64.S @@ -45,6 +45,7 @@ #include #include +#include ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction @@ -172,7 +173,7 @@ movzxw (bufp, %rax, 2), len lea crc_array(%rip), bufp lea (bufp, len, 1), bufp - jmp *bufp + JMP_NOSPEC bufp ################################################################ ## 2a) PROCESS FULL BLOCKS: --- linux-azure-4.13.0.orig/arch/x86/crypto/salsa20_glue.c +++ linux-azure-4.13.0/arch/x86/crypto/salsa20_glue.c @@ -59,13 +59,6 @@ salsa20_ivsetup(ctx, walk.iv); - if (likely(walk.nbytes == nbytes)) - { - salsa20_encrypt_bytes(ctx, walk.src.virt.addr, - walk.dst.virt.addr, nbytes); - return blkcipher_walk_done(desc, &walk, 0); - } - while (walk.nbytes >= 64) { salsa20_encrypt_bytes(ctx, walk.src.virt.addr, walk.dst.virt.addr, --- linux-azure-4.13.0.orig/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S +++ linux-azure-4.13.0/arch/x86/crypto/sha1-mb/sha1_mb_mgr_flush_avx2.S @@ -157,8 +157,8 @@ .endr # Find min length - vmovdqa _lens+0*16(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} @@ 
-178,8 +178,8 @@ vpsubd %xmm2, %xmm0, %xmm0 vpsubd %xmm2, %xmm1, %xmm1 - vmovdqa %xmm0, _lens+0*16(state) - vmovdqa %xmm1, _lens+1*16(state) + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) # "state" and "args" are the same address, arg1 # len is arg2 @@ -235,8 +235,8 @@ jc .return_null # Find min length - vmovdqa _lens(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} --- linux-azure-4.13.0.orig/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S +++ linux-azure-4.13.0/arch/x86/crypto/sha256-mb/sha256_mb_mgr_flush_avx2.S @@ -155,8 +155,8 @@ .endr # Find min length - vmovdqa _lens+0*16(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens+0*16(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} @@ -176,8 +176,8 @@ vpsubd %xmm2, %xmm0, %xmm0 vpsubd %xmm2, %xmm1, %xmm1 - vmovdqa %xmm0, _lens+0*16(state) - vmovdqa %xmm1, _lens+1*16(state) + vmovdqu %xmm0, _lens+0*16(state) + vmovdqu %xmm1, _lens+1*16(state) # "state" and "args" are the same address, arg1 # len is arg2 @@ -234,8 +234,8 @@ jc .return_null # Find min length - vmovdqa _lens(state), %xmm0 - vmovdqa _lens+1*16(state), %xmm1 + vmovdqu _lens(state), %xmm0 + vmovdqu _lens+1*16(state), %xmm1 vpminud %xmm1, %xmm0, %xmm2 # xmm2 has {D,C,B,A} vpalignr $8, %xmm2, %xmm3, %xmm3 # xmm3 has {x,x,D,C} --- linux-azure-4.13.0.orig/arch/x86/entry/Makefile +++ linux-azure-4.13.0/arch/x86/entry/Makefile @@ -2,7 +2,6 @@ # Makefile for the x86 low level entry code # -OBJECT_FILES_NON_STANDARD_entry_$(BITS).o := y OBJECT_FILES_NON_STANDARD_entry_64_compat.o := y CFLAGS_syscall_64.o += $(call cc-option,-Wno-override-init,) --- linux-azure-4.13.0.orig/arch/x86/entry/calling.h +++ linux-azure-4.13.0/arch/x86/entry/calling.h @@ -1,4 +1,10 @@ #include +#include +#include +#include +#include +#include +#include /* @@ -112,6 +118,7 @@ movq %rdx, 12*8+\offset(%rsp) movq %rsi, 13*8+\offset(%rsp) movq %rdi, 14*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset extra=0 .endm .macro SAVE_C_REGS offset=0 SAVE_C_REGS_HELPER \offset, 1, 1, 1, 1 @@ -136,56 +143,28 @@ movq %r12, 3*8+\offset(%rsp) movq %rbp, 4*8+\offset(%rsp) movq %rbx, 5*8+\offset(%rsp) + UNWIND_HINT_REGS offset=\offset .endm - .macro RESTORE_EXTRA_REGS offset=0 - movq 0*8+\offset(%rsp), %r15 - movq 1*8+\offset(%rsp), %r14 - movq 2*8+\offset(%rsp), %r13 - movq 3*8+\offset(%rsp), %r12 - movq 4*8+\offset(%rsp), %rbp - movq 5*8+\offset(%rsp), %rbx - .endm - - .macro RESTORE_C_REGS_HELPER rstor_rax=1, rstor_rcx=1, rstor_r11=1, rstor_r8910=1, rstor_rdx=1 - .if \rstor_r11 - movq 6*8(%rsp), %r11 - .endif - .if \rstor_r8910 - movq 7*8(%rsp), %r10 - movq 8*8(%rsp), %r9 - movq 9*8(%rsp), %r8 - .endif - .if \rstor_rax - movq 10*8(%rsp), %rax - .endif - .if \rstor_rcx - movq 11*8(%rsp), %rcx - .endif - .if \rstor_rdx - movq 12*8(%rsp), %rdx - .endif - movq 13*8(%rsp), %rsi - movq 14*8(%rsp), %rdi - .endm - .macro RESTORE_C_REGS - RESTORE_C_REGS_HELPER 1,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RAX - RESTORE_C_REGS_HELPER 0,1,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX - RESTORE_C_REGS_HELPER 1,0,1,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_R11 - RESTORE_C_REGS_HELPER 1,1,0,1,1 - .endm - .macro RESTORE_C_REGS_EXCEPT_RCX_R11 - RESTORE_C_REGS_HELPER 1,0,0,1,1 - .endm - - .macro REMOVE_PT_GPREGS_FROM_STACK 
addskip=0 - subq $-(15*8+\addskip), %rsp + .macro POP_EXTRA_REGS + popq %r15 + popq %r14 + popq %r13 + popq %r12 + popq %rbp + popq %rbx + .endm + + .macro POP_C_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + popq %rdi .endm .macro icebp @@ -212,6 +191,148 @@ #endif .endm +#ifdef CONFIG_PAGE_TABLE_ISOLATION + +/* + * PAGE_TABLE_ISOLATION PGDs are 8k. Flip bit 12 to switch between the two + * halves: + */ +#define PTI_USER_PGTABLE_BIT PAGE_SHIFT +#define PTI_USER_PGTABLE_MASK (1 << PTI_USER_PGTABLE_BIT) +#define PTI_USER_PCID_BIT X86_CR3_PTI_PCID_USER_BIT +#define PTI_USER_PCID_MASK (1 << PTI_USER_PCID_BIT) +#define PTI_USER_PGTABLE_AND_PCID_MASK (PTI_USER_PCID_MASK | PTI_USER_PGTABLE_MASK) + +.macro SET_NOFLUSH_BIT reg:req + bts $X86_CR3_PCID_NOFLUSH_BIT, \reg +.endm + +.macro ADJUST_KERNEL_CR3 reg:req + ALTERNATIVE "", "SET_NOFLUSH_BIT \reg", X86_FEATURE_PCID + /* Clear PCID and "PAGE_TABLE_ISOLATION bit", point CR3 at kernel pagetables: */ + andq $(~PTI_USER_PGTABLE_AND_PCID_MASK), \reg +.endm + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + ADJUST_KERNEL_CR3 \scratch_reg + mov \scratch_reg, %cr3 +.Lend_\@: +.endm + +#define THIS_CPU_user_pcid_flush_mask \ + PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask + +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + mov %cr3, \scratch_reg + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * Test if the ASID needs a flush. + */ + movq \scratch_reg, \scratch_reg2 + andq $(0x7FF), \scratch_reg /* mask ASID */ + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + /* Flush needed, clear the bit */ + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + movq \scratch_reg2, \scratch_reg + jmp .Lwrcr3_pcid_\@ + +.Lnoflush_\@: + movq \scratch_reg2, \scratch_reg + SET_NOFLUSH_BIT \scratch_reg + +.Lwrcr3_pcid_\@: + /* Flip the ASID to the user version */ + orq $(PTI_USER_PCID_MASK), \scratch_reg + +.Lwrcr3_\@: + /* Flip the PGD to the user version */ + orq $(PTI_USER_PGTABLE_MASK), \scratch_reg + mov \scratch_reg, %cr3 +.Lend_\@: +.endm + +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req + pushq %rax + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax + popq %rax +.endm + +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Ldone_\@", "", X86_FEATURE_PTI + movq %cr3, \scratch_reg + movq \scratch_reg, \save_reg + /* + * Test the user pagetable bit. If set, then the user page tables + * are active. If clear CR3 already has the kernel page table + * active. + */ + bt $PTI_USER_PGTABLE_BIT, \scratch_reg + jnc .Ldone_\@ + + ADJUST_KERNEL_CR3 \scratch_reg + movq \scratch_reg, %cr3 + +.Ldone_\@: +.endm + +.macro RESTORE_CR3 scratch_reg:req save_reg:req + ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI + + ALTERNATIVE "jmp .Lwrcr3_\@", "", X86_FEATURE_PCID + + /* + * KERNEL pages can always resume with NOFLUSH as we do + * explicit flushes. + */ + bt $PTI_USER_PGTABLE_BIT, \save_reg + jnc .Lnoflush_\@ + + /* + * Check if there's a pending flush for the user ASID we're + * about to set. 
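
Stripped of the alternatives and scratch-register juggling, the CR3 manipulation in these macros is three OR operations. A sketch in C, restating the constants defined above (bit 63 is the PCID no-flush bit):

    #define PTI_USER_PGTABLE_MASK  (1UL << 12)  /* PAGE_SHIFT: user half of the 8k PGD */
    #define PTI_USER_PCID_MASK     (1UL << 11)  /* X86_CR3_PTI_PCID_USER_BIT */
    #define X86_CR3_PCID_NOFLUSH   (1UL << 63)

    static unsigned long kernel_to_user_cr3(unsigned long cr3, int need_flush)
    {
            cr3 |= PTI_USER_PGTABLE_MASK;        /* select the user page tables */
            cr3 |= PTI_USER_PCID_MASK;           /* and the user ASID */
            if (!need_flush)
                    cr3 |= X86_CR3_PCID_NOFLUSH; /* suppress the implicit TLB flush */
            return cr3;
    }

ADJUST_KERNEL_CR3 is the inverse: both mask bits are cleared in a single andq, which is why the two PGD halves and the two PCIDs are deliberately laid out as single-bit flips.
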
+ */ + movq \save_reg, \scratch_reg + andq $(0x7FF), \scratch_reg + bt \scratch_reg, THIS_CPU_user_pcid_flush_mask + jnc .Lnoflush_\@ + + btr \scratch_reg, THIS_CPU_user_pcid_flush_mask + jmp .Lwrcr3_\@ + +.Lnoflush_\@: + SET_NOFLUSH_BIT \save_reg + +.Lwrcr3_\@: + /* + * The CR3 write could be avoided when not changing its value, + * but would require a CR3 read *and* a scratch register. + */ + movq \save_reg, %cr3 +.Lend_\@: +.endm + +#else /* CONFIG_PAGE_TABLE_ISOLATION=n: */ + +.macro SWITCH_TO_KERNEL_CR3 scratch_reg:req +.endm +.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req +.endm +.macro SWITCH_TO_USER_CR3_STACK scratch_reg:req +.endm +.macro SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg:req save_reg:req +.endm +.macro RESTORE_CR3 scratch_reg:req save_reg:req +.endm + +#endif + #endif /* CONFIG_X86_64 */ /* --- linux-azure-4.13.0.orig/arch/x86/entry/common.c +++ linux-azure-4.13.0/arch/x86/entry/common.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -183,6 +184,8 @@ struct thread_info *ti = current_thread_info(); u32 cached_flags; + addr_limit_user_check(); + if (IS_ENABLED(CONFIG_PROVE_LOCKING) && WARN_ON(!irqs_disabled())) local_irq_disable(); --- linux-azure-4.13.0.orig/arch/x86/entry/entry_32.S +++ linux-azure-4.13.0/arch/x86/entry/entry_32.S @@ -43,6 +43,7 @@ #include #include #include +#include .section .entry.text, "ax" @@ -242,6 +243,18 @@ movl %ebx, PER_CPU_VAR(stack_canary)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + /* Clobbers %ebx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popl %esi popl %edi @@ -289,7 +302,7 @@ /* kernel thread */ 1: movl %edi, %eax - call *%ebx + CALL_NOSPEC %ebx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -891,14 +904,6 @@ #endif /* CONFIG_HYPERV */ -#ifdef CONFIG_TRACING -ENTRY(trace_page_fault) - ASM_CLAC - pushl $trace_do_page_fault - jmp common_exception -END(trace_page_fault) -#endif - ENTRY(page_fault) ASM_CLAC pushl $do_page_fault @@ -934,7 +939,7 @@ movl %ecx, %es TRACE_IRQS_OFF movl %esp, %eax # pt_regs pointer - call *%edi + CALL_NOSPEC %edi jmp ret_from_exception END(common_exception) @@ -956,9 +961,10 @@ movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? */ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Ldebug_from_sysenter_stack TRACE_IRQS_OFF @@ -999,9 +1005,10 @@ movl %esp, %eax # pt_regs pointer /* Are we currently on the SYSENTER stack? 
*/ - PER_CPU(cpu_tss + CPU_TSS_SYSENTER_stack + SIZEOF_SYSENTER_stack, %ecx) - subl %eax, %ecx /* ecx = (end of SYSENTER_stack) - esp */ - cmpl $SIZEOF_SYSENTER_stack, %ecx + movl PER_CPU_VAR(cpu_entry_area), %ecx + addl $CPU_ENTRY_AREA_entry_stack + SIZEOF_entry_stack, %ecx + subl %eax, %ecx /* ecx = (end of entry_stack) - esp */ + cmpl $SIZEOF_entry_stack, %ecx jb .Lnmi_from_sysenter_stack /* Not on SYSENTER stack. */ --- linux-azure-4.13.0.orig/arch/x86/entry/entry_64.S +++ linux-azure-4.13.0/arch/x86/entry/entry_64.S @@ -22,7 +22,6 @@ #include #include #include -#include "calling.h" #include #include #include @@ -36,16 +35,22 @@ #include #include #include +#include +#include +#include #include +#include "calling.h" + .code64 .section .entry.text, "ax" #ifdef CONFIG_PARAVIRT ENTRY(native_usergs_sysret64) + UNWIND_HINT_EMPTY swapgs sysretq -ENDPROC(native_usergs_sysret64) +END(native_usergs_sysret64) #endif /* CONFIG_PARAVIRT */ .macro TRACE_IRQS_IRETQ @@ -133,31 +138,90 @@ * with them due to bugs in both AMD and Intel CPUs. */ + .pushsection .entry_trampoline, "ax" + +/* + * The code in here gets remapped into cpu_entry_area's trampoline. This means + * that the assembler and linker have the wrong idea as to where this code + * lives (and, in fact, it's mapped more than once, so it's not even at a + * fixed address). So we can't reference any symbols outside the entry + * trampoline and expect it to work. + * + * Instead, we carefully abuse %rip-relative addressing. + * _entry_trampoline(%rip) refers to the start of the remapped entry + * trampoline. We can thus find cpu_entry_area with this macro: + */ + +#define CPU_ENTRY_AREA \ + _entry_trampoline - CPU_ENTRY_AREA_entry_trampoline(%rip) + +/* The top word of the SYSENTER stack is hot and is usable as scratch space. */ +#define RSP_SCRATCH CPU_ENTRY_AREA_entry_stack + \ + SIZEOF_entry_stack - 8 + CPU_ENTRY_AREA + +ENTRY(entry_SYSCALL_64_trampoline) + UNWIND_HINT_EMPTY + swapgs + + /* Stash the user RSP. */ + movq %rsp, RSP_SCRATCH + + /* Note: using %rsp as a scratch reg. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + + /* Load the top of the task stack into RSP */ + movq CPU_ENTRY_AREA_tss + TSS_sp1 + CPU_ENTRY_AREA, %rsp + + /* Start building the simulated IRET frame. */ + pushq $__USER_DS /* pt_regs->ss */ + pushq RSP_SCRATCH /* pt_regs->sp */ + pushq %r11 /* pt_regs->flags */ + pushq $__USER_CS /* pt_regs->cs */ + pushq %rcx /* pt_regs->ip */ + + /* + * x86 lacks a near absolute jump, and we can't jump to the real + * entry text with a relative jump. We could push the target + * address and then use retq, but this destroys the pipeline on + * many CPUs (wasting over 20 cycles on Sandy Bridge). Instead, + * spill RDI and restore it in a second-stage trampoline. + */ + pushq %rdi + movq $entry_SYSCALL_64_stage2, %rdi + JMP_NOSPEC %rdi +END(entry_SYSCALL_64_trampoline) + + .popsection + +ENTRY(entry_SYSCALL_64_stage2) + UNWIND_HINT_EMPTY + popq %rdi + jmp entry_SYSCALL_64_after_hwframe +END(entry_SYSCALL_64_stage2) + ENTRY(entry_SYSCALL_64) + UNWIND_HINT_EMPTY /* * Interrupts are off on entry. * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON, * it is too small to ever cause noticeable irq latency. */ - SWAPGS_UNSAFE_STACK + + swapgs /* - * A hypervisor implementation might want to use a label - * after the swapgs, so that it can do the swapgs - * for the guest and jump here on syscall. + * This path is only taken when PAGE_TABLE_ISOLATION is disabled so it + * is not required to switch CR3.
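
SYSCALL saves user RIP in %rcx and user RFLAGS in %r11 and pushes nothing, so the trampoline above fabricates the five-word frame a hardware interrupt would have produced. Laid out as a struct (lowest address first, matching the reverse order of the pushq sequence; this merely mirrors the tail of pt_regs, it is not a new kernel type):

    struct simulated_iret_frame {
            unsigned long ip;     /* user %rcx: return RIP saved by SYSCALL */
            unsigned long cs;     /* $__USER_CS */
            unsigned long flags;  /* user %r11: RFLAGS saved by SYSCALL */
            unsigned long sp;     /* user RSP, parked in RSP_SCRATCH */
            unsigned long ss;     /* $__USER_DS */
    };

With this frame in place, the rest of the entry code cannot tell whether it was reached through the trampoline or through a hardware-pushed frame.
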
*/ -GLOBAL(entry_SYSCALL_64_after_swapgs) - movq %rsp, PER_CPU_VAR(rsp_scratch) movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - TRACE_IRQS_OFF - /* Construct struct pt_regs on stack */ pushq $__USER_DS /* pt_regs->ss */ pushq PER_CPU_VAR(rsp_scratch) /* pt_regs->sp */ pushq %r11 /* pt_regs->flags */ pushq $__USER_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_64_after_hwframe) pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -169,6 +233,13 @@ pushq %r10 /* pt_regs->r10 */ pushq %r11 /* pt_regs->r11 */ sub $(6*8), %rsp /* pt_regs->bp, bx, r12-15 not saved */ + UNWIND_HINT_REGS extra=0 + + ENABLE_IBRS + + STUFF_RSB + + TRACE_IRQS_OFF /* * If we need to do entry work or if we guess we'll need to do @@ -200,7 +271,12 @@ * It might end up jumping to the slow path. If it jumps, RAX * and all argument registers are clobbered. */ +#ifdef CONFIG_RETPOLINE + movq sys_call_table(, %rax, 8), %rax + call __x86_indirect_thunk_rax +#else call *sys_call_table(, %rax, 8) +#endif .Lentry_SYSCALL_64_after_fastpath_call: movq %rax, RAX(%rsp) @@ -221,9 +297,10 @@ TRACE_IRQS_ON /* user mode is traced as IRQs on */ movq RIP(%rsp), %rcx movq EFLAGS(%rsp), %r11 - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 + DISABLE_IBRS + addq $6*8, %rsp /* skip extra regs -- they were preserved */ + UNWIND_HINT_EMPTY + jmp .Lpop_c_regs_except_rcx_r11_and_sysret 1: /* @@ -245,17 +322,18 @@ call do_syscall_64 /* returns with IRQs disabled */ return_from_SYSCALL_64: - RESTORE_EXTRA_REGS TRACE_IRQS_IRETQ /* we're about to change IF */ /* * Try to use SYSRET instead of IRET if we're returning to - * a completely clean 64-bit userspace context. + * a completely clean 64-bit userspace context. If we're not, + * go to the slow exit path. */ movq RCX(%rsp), %rcx movq RIP(%rsp), %r11 - cmpq %rcx, %r11 /* RCX == RIP */ - jne opportunistic_sysret_failed + + cmpq %rcx, %r11 /* SYSRET requires RCX == RIP */ + jne swapgs_restore_regs_and_return_to_usermode /* * On Intel CPUs, SYSRET with non-canonical RCX/RIP will #GP @@ -273,14 +351,14 @@ /* If this changed %rcx, it was not canonical */ cmpq %rcx, %r11 - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode cmpq $__USER_CS, CS(%rsp) /* CS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode movq R11(%rsp), %r11 cmpq %r11, EFLAGS(%rsp) /* R11 == RFLAGS */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * SYSCALL clears RF when it saves RFLAGS in R11 and SYSRET cannot @@ -301,26 +379,52 @@ * would never get past 'stuck_here'. */ testq $(X86_EFLAGS_RF|X86_EFLAGS_TF), %r11 - jnz opportunistic_sysret_failed + jnz swapgs_restore_regs_and_return_to_usermode /* nothing to check for RSP */ cmpq $__USER_DS, SS(%rsp) /* SS must match SYSRET */ - jne opportunistic_sysret_failed + jne swapgs_restore_regs_and_return_to_usermode /* * We win! This label is here just for ease of understanding * perf profiles. Nothing jumps here. 
*/ syscall_return_via_sysret: + DISABLE_IBRS + /* rcx and r11 are already restored (see code above) */ - RESTORE_C_REGS_EXCEPT_RCX_R11 - movq RSP(%rsp), %rsp - USERGS_SYSRET64 + UNWIND_HINT_EMPTY + POP_EXTRA_REGS +.Lpop_c_regs_except_rcx_r11_and_sysret: + popq %rsi /* skip r11 */ + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rsi /* skip rcx */ + popq %rdx + popq %rsi -opportunistic_sysret_failed: - SWAPGS - jmp restore_c_regs_and_iret + /* + * Now all regs are restored except RSP and RDI. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + pushq RSP-RDI(%rdi) /* RSP */ + pushq (%rdi) /* RDI */ + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + popq %rdi + popq %rsp + USERGS_SYSRET64 END(entry_SYSCALL_64) ENTRY(stub_ptregs_64) @@ -343,14 +447,16 @@ DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF popq %rax + UNWIND_HINT_REGS extra=0 jmp entry_SYSCALL64_slow_path 1: - jmp *%rax /* Called from C */ + JMP_NOSPEC %rax /* Called from C */ END(stub_ptregs_64) .macro ptregs_stub func ENTRY(ptregs_\func) + UNWIND_HINT_FUNC leaq \func(%rip), %rax jmp stub_ptregs_64 END(ptregs_\func) @@ -367,6 +473,7 @@ * %rsi: next task */ ENTRY(__switch_to_asm) + UNWIND_HINT_FUNC /* * Save callee-saved registers * This must match the order in inactive_task_frame @@ -387,6 +494,18 @@ movq %rbx, PER_CPU_VAR(irq_stack_union)+stack_canary_offset #endif +#ifdef CONFIG_RETPOLINE + /* + * When switching from a shallower to a deeper call stack + * the RSB may either underflow or use entries populated + * with userspace addresses. On CPUs where those concerns + * exist, overwrite the RSB with entries which capture + * speculative execution to prevent attack. + */ + /* Clobbers %rbx */ + FILL_RETURN_BUFFER RSB_CLEAR_LOOPS, X86_FEATURE_RSB_CTXSW +#endif + /* restore callee-saved registers */ popq %r15 popq %r14 @@ -406,6 +525,7 @@ * r12: kernel thread arg */ ENTRY(ret_from_fork) + UNWIND_HINT_EMPTY movq %rax, %rdi call schedule_tail /* rdi: 'prev' task parameter */ @@ -413,16 +533,16 @@ jnz 1f /* kernel threads are uncommon */ 2: + UNWIND_HINT_REGS movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode 1: /* kernel thread */ movq %r12, %rdi - call *%rbx + CALL_NOSPEC %rbx /* * A kernel thread is allowed to return here after successfully * calling do_execve(). Exit to userspace to complete the execve() @@ -440,13 +560,103 @@ ENTRY(irq_entries_start) vector=FIRST_EXTERNAL_VECTOR .rept (FIRST_SYSTEM_VECTOR - FIRST_EXTERNAL_VECTOR) + UNWIND_HINT_IRET_REGS pushq $(~vector+0x80) /* Note: always in signed byte range */ - vector=vector+1 jmp common_interrupt .align 8 + vector=vector+1 .endr END(irq_entries_start) +.macro DEBUG_ENTRY_ASSERT_IRQS_OFF +#ifdef CONFIG_DEBUG_ENTRY + pushq %rax + SAVE_FLAGS(CLBR_RAX) + testl $X86_EFLAGS_IF, %eax + jz .Lokay_\@ + ud2 +.Lokay_\@: + popq %rax +#endif +.endm + +/* + * Enters the IRQ stack if we're not already using it. NMI-safe. Clobbers + * flags and puts old RSP into old_rsp, and leaves all other GPRs alone. + * Requires kernel GSBASE. + * + * The invariant is that, if irq_count != -1, then the IRQ stack is in use. 
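
The irq_count invariant above is easier to see in C. A loose model -- irq_count and the stack symbols are per-CPU in the real kernel, and IRQ_STACK_WORDS and the helper names here are purely illustrative:

    #define IRQ_STACK_WORDS 2048                  /* illustrative size */
    static long irq_count = -1;                   /* -1: IRQ stack free */
    static unsigned long irq_stack[IRQ_STACK_WORDS];

    static unsigned long *enter_irq_stack(unsigned long *old_rsp)
    {
            if (++irq_count == 0) {
                    /* Outermost entry: write the unwinder's back-link
                     * before RSP actually moves onto the IRQ stack. */
                    irq_stack[IRQ_STACK_WORDS - 1] = (unsigned long)old_rsp;
                    return &irq_stack[IRQ_STACK_WORDS - 1];
            }
            return old_rsp;                       /* nested: stay put */
    }

    static void leave_irq_stack(void)
    {
            --irq_count;                          /* outermost exit restores -1 */
    }
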
+ */ +.macro ENTER_IRQ_STACK regs=1 old_rsp + DEBUG_ENTRY_ASSERT_IRQS_OFF + movq %rsp, \old_rsp + + .if \regs + UNWIND_HINT_REGS base=\old_rsp + .endif + + incl PER_CPU_VAR(irq_count) + jnz .Lirq_stack_push_old_rsp_\@ + + /* + * Right now, if we just incremented irq_count to zero, we've + * claimed the IRQ stack but we haven't switched to it yet. + * + * If anything is added that can interrupt us here without using IST, + * it must be *extremely* careful to limit its stack usage. This + * could include kprobes and a hypothetical future IST-less #DB + * handler. + * + * The OOPS unwinder relies on the word at the top of the IRQ + * stack linking back to the previous RSP for the entire time we're + * on the IRQ stack. For this to work reliably, we need to write + * it before we actually move ourselves to the IRQ stack. + */ + + movq \old_rsp, PER_CPU_VAR(irq_stack_union + IRQ_STACK_SIZE - 8) + movq PER_CPU_VAR(irq_stack_ptr), %rsp + +#ifdef CONFIG_DEBUG_ENTRY + /* + * If the first movq above becomes wrong due to IRQ stack layout + * changes, the only way we'll notice is if we try to unwind right + * here. Assert that we set up the stack right to catch this type + * of bug quickly. + */ + cmpq -8(%rsp), \old_rsp + je .Lirq_stack_okay\@ + ud2 + .Lirq_stack_okay\@: +#endif + +.Lirq_stack_push_old_rsp_\@: + pushq \old_rsp + + .if \regs + UNWIND_HINT_REGS indirect=1 + .endif +.endm + +/* + * Undoes ENTER_IRQ_STACK. + */ +.macro LEAVE_IRQ_STACK regs=1 + DEBUG_ENTRY_ASSERT_IRQS_OFF + /* We need to be off the IRQ stack before decrementing irq_count. */ + popq %rsp + + .if \regs + UNWIND_HINT_REGS + .endif + + /* + * As in ENTER_IRQ_STACK, irq_count == 0, we are still claiming + * the irq stack but we're not on it. + */ + + decl PER_CPU_VAR(irq_count) +.endm + /* * Interrupt entry/exit. * @@ -458,19 +668,33 @@ /* 0(%rsp): ~(interrupt number) */ .macro interrupt func cld + + testb $3, CS-ORIG_RAX(%rsp) + jz 1f + SWAPGS + call switch_to_thread_stack +1: + ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS SAVE_EXTRA_REGS + + /* + * Have to do stuffing before encoding frame pointer. + * Could add some unnecessary RSB clearing if coming + * from kernel for non-SMEP platform. + */ + STUFF_RSB ENCODE_FRAME_POINTER testb $3, CS(%rsp) jz 1f /* - * IRQ from user mode. Switch to kernel gsbase and inform context - * tracking that we're in kernel mode. + * IRQ from user mode. + * */ - SWAPGS + ENABLE_IBRS /* * We need to tell lockdep that IRQs are off. We can't do this until @@ -485,17 +709,7 @@ CALL_enter_from_user_mode 1: - /* - * Save previous stack pointer, optionally switch to interrupt stack. - * irq_count is used to check if a CPU is already on an interrupt stack - * or not. While this is essentially redundant with preempt_count it is - * a little cheaper to use a separate counter in the PDA (short of - * moving irq_enter into assembly, which would be too much work) - */ - movq %rsp, %rdi - incl PER_CPU_VAR(irq_count) - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rdi + ENTER_IRQ_STACK old_rsp=%rdi /* We entered an interrupt context - irqs are off: */ TRACE_IRQS_OFF @@ -515,10 +729,8 @@ ret_from_intr: DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF - decl PER_CPU_VAR(irq_count) - /* Restore saved previous stack */ - popq %rsp + LEAVE_IRQ_STACK testb $3, CS(%rsp) jz retint_kernel @@ -528,8 +740,54 @@ mov %rsp,%rdi call prepare_exit_to_usermode TRACE_IRQS_IRETQ + +GLOBAL(swapgs_restore_regs_and_return_to_usermode) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates user mode. 
*/ + testb $3, CS(%rsp) + jnz 1f + ud2 +1: +#endif + POP_EXTRA_REGS + popq %r11 + popq %r10 + popq %r9 + popq %r8 + popq %rax + popq %rcx + popq %rdx + popq %rsi + + /* + * The stack is now user RDI, orig_ax, RIP, CS, EFLAGS, RSP, SS. + * Save old stack pointer and switch to trampoline stack. + */ + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_tss_rw + TSS_sp0), %rsp + + /* Copy the IRET frame to the trampoline stack. */ + pushq 6*8(%rdi) /* SS */ + pushq 5*8(%rdi) /* RSP */ + pushq 4*8(%rdi) /* EFLAGS */ + pushq 3*8(%rdi) /* CS */ + pushq 2*8(%rdi) /* RIP */ + + /* Push user RDI on the trampoline stack. */ + pushq (%rdi) + + /* + * We are on the trampoline stack. All regs except RDI are live. + * We can do future final exit work right here. + */ + DISABLE_IBRS + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + + /* Restore RDI. */ + popq %rdi SWAPGS - jmp restore_regs_and_iret + INTERRUPT_RETURN + /* Returning to kernel space */ retint_kernel: @@ -549,18 +807,21 @@ */ TRACE_IRQS_IRETQ -/* - * At this label, code paths which return to kernel and to user, - * which come from interrupts/exception and from syscalls, merge. - */ -GLOBAL(restore_regs_and_iret) - RESTORE_EXTRA_REGS -restore_c_regs_and_iret: - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 +GLOBAL(restore_regs_and_return_to_kernel) +#ifdef CONFIG_DEBUG_ENTRY + /* Assert that pt_regs indicates kernel mode. */ + testb $3, CS(%rsp) + jz 1f + ud2 +1: +#endif + POP_EXTRA_REGS + POP_C_REGS + addq $8, %rsp /* skip regs->orig_ax */ INTERRUPT_RETURN ENTRY(native_iret) + UNWIND_HINT_IRET_REGS /* * Are we returning to a stack segment from the LDT? Note: in * 64-bit mode SS:RSP on the exception stack is always valid. @@ -604,7 +865,9 @@ */ pushq %rdi /* Stash user RDI */ - SWAPGS + SWAPGS /* to kernel GS */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi /* to kernel CR3 */ + movq PER_CPU_VAR(espfix_waddr), %rdi movq %rax, (0*8)(%rdi) /* user RAX */ movq (1*8)(%rsp), %rax /* user RIP */ @@ -620,7 +883,6 @@ /* Now RAX == RSP. */ andl $0xffff0000, %eax /* RAX = (RSP & 0xffff0000) */ - popq %rdi /* Restore user RDI */ /* * espfix_stack[31:16] == 0. The page tables are set up such that @@ -631,8 +893,13 @@ * still points to an RO alias of the ESPFIX stack. */ orq PER_CPU_VAR(espfix_stack), %rax - SWAPGS + + SWITCH_TO_USER_CR3_STACK scratch_reg=%rdi + SWAPGS /* to user GS */ + popq %rdi /* Restore user RDI */ + movq %rax, %rsp + UNWIND_HINT_IRET_REGS offset=8 /* * At this point, we cannot write to the stack any more, but we can @@ -654,6 +921,7 @@ */ .macro apicinterrupt3 num sym do_sym ENTRY(\sym) + UNWIND_HINT_IRET_REGS ASM_CLAC pushq $~(\num) .Lcommon_\sym: @@ -675,13 +943,8 @@ #endif /* Make sure APIC interrupt handlers end up in the irqentry section: */ -#if defined(CONFIG_FUNCTION_GRAPH_TRACER) || defined(CONFIG_KASAN) -# define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" -# define POP_SECTION_IRQENTRY .popsection -#else -# define PUSH_SECTION_IRQENTRY -# define POP_SECTION_IRQENTRY -#endif +#define PUSH_SECTION_IRQENTRY .pushsection .irqentry.text, "ax" +#define POP_SECTION_IRQENTRY .popsection .macro apicinterrupt num sym do_sym PUSH_SECTION_IRQENTRY @@ -736,33 +999,64 @@ /* * Exception entry points. */ -#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss) + (TSS_ist + ((x) - 1) * 8) +#define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + ((x) - 1) * 8) + +/* + * Switch to the thread stack. This is called with the IRET frame and + * orig_ax on the stack. (That is, RDI..R12 are not on the stack and + * space has not been allocated for them.) 
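
A hedged C model of the copy that switch_to_thread_stack (below) performs: seven words -- the return address, orig_ax, and the five-word IRET frame -- are replayed onto the per-task stack. The helper name and loop are illustrative only:

    static unsigned long *move_to_thread_stack(const unsigned long *old_sp)
    {
            /* old_sp[0] holds the caller's stashed %rdi; old_sp[1] is the
             * return address, old_sp[2] orig_ax, old_sp[3..7] the IRET
             * frame: ip, cs, eflags, rsp, ss. */
            unsigned long *new_sp = this_thread_stack_top();  /* cpu_current_top_of_stack */
            int i;

            for (i = 7; i >= 1; i--)
                    *--new_sp = old_sp[i];

            return new_sp;  /* new RSP points at the copied return address */
    }

Only after this copy is it safe to build the rest of pt_regs: the entry stack is tiny, but unlike the task stack it is mapped in both page-table halves, so the IRET frame written by hardware is reachable before and after the CR3 switch.
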
+ */ +ENTRY(switch_to_thread_stack) + UNWIND_HINT_FUNC + + pushq %rdi + /* Need to switch before accessing the thread stack. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdi + movq %rsp, %rdi + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT sp_offset=16 sp_reg=ORC_REG_DI + + pushq 7*8(%rdi) /* regs->ss */ + pushq 6*8(%rdi) /* regs->rsp */ + pushq 5*8(%rdi) /* regs->eflags */ + pushq 4*8(%rdi) /* regs->cs */ + pushq 3*8(%rdi) /* regs->ip */ + pushq 2*8(%rdi) /* regs->orig_ax */ + pushq 8(%rdi) /* return address */ + UNWIND_HINT_FUNC + + movq (%rdi), %rdi + ret +END(switch_to_thread_stack) .macro idtentry sym do_sym has_error_code:req paranoid=0 shift_ist=-1 ENTRY(\sym) + UNWIND_HINT_IRET_REGS offset=\has_error_code*8 + /* Sanity check */ .if \shift_ist != -1 && \paranoid == 0 .error "using shift_ist requires paranoid=1" .endif ASM_CLAC - PARAVIRT_ADJUST_EXCEPTION_FRAME - .ifeq \has_error_code + .if \has_error_code == 0 pushq $-1 /* ORIG_RAX: no syscall to restart */ .endif ALLOC_PT_GPREGS_ON_STACK - .if \paranoid - .if \paranoid == 1 + .if \paranoid < 2 testb $3, CS(%rsp) /* If coming from userspace, switch stacks */ - jnz 1f + jnz .Lfrom_usermode_switch_stack_\@ .endif + + .if \paranoid call paranoid_entry .else call error_entry .endif + UNWIND_HINT_REGS /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ .if \paranoid @@ -799,20 +1093,15 @@ jmp error_exit .endif - .if \paranoid == 1 + .if \paranoid < 2 /* - * Paranoid entry from userspace. Switch stacks and treat it + * Entry from userspace. Switch stacks and treat it * as a normal entry. This means that paranoid handlers * run in real process context if user_mode(regs). */ -1: +.Lfrom_usermode_switch_stack_\@: call error_entry - - movq %rsp, %rdi /* pt_regs pointer */ - call sync_regs - movq %rax, %rsp /* switch stack */ - movq %rsp, %rdi /* pt_regs pointer */ .if \has_error_code @@ -829,17 +1118,6 @@ END(\sym) .endm -#ifdef CONFIG_TRACING -.macro trace_idtentry sym do_sym has_error_code:req -idtentry trace(\sym) trace(\do_sym) has_error_code=\has_error_code -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#else -.macro trace_idtentry sym do_sym has_error_code:req -idtentry \sym \do_sym has_error_code=\has_error_code -.endm -#endif - idtentry divide_error do_divide_error has_error_code=0 idtentry overflow do_overflow has_error_code=0 idtentry bounds do_bounds has_error_code=0 @@ -860,6 +1138,7 @@ * edi: new selector */ ENTRY(native_load_gs_index) + FRAME_BEGIN pushfq DISABLE_INTERRUPTS(CLBR_ANY & ~CLBR_RDI) SWAPGS @@ -868,8 +1147,9 @@ 2: ALTERNATIVE "", "mfence", X86_BUG_SWAPGS_FENCE SWAPGS popfq + FRAME_END ret -END(native_load_gs_index) +ENDPROC(native_load_gs_index) EXPORT_SYMBOL(native_load_gs_index) _ASM_EXTABLE(.Lgs_change, bad_gs) @@ -892,17 +1172,15 @@ ENTRY(do_softirq_own_stack) pushq %rbp mov %rsp, %rbp - incl PER_CPU_VAR(irq_count) - cmove PER_CPU_VAR(irq_stack_ptr), %rsp - push %rbp /* frame pointer backlink */ + ENTER_IRQ_STACK regs=0 old_rsp=%r11 call __do_softirq + LEAVE_IRQ_STACK regs=0 leaveq - decl PER_CPU_VAR(irq_count) ret -END(do_softirq_own_stack) +ENDPROC(do_softirq_own_stack) #ifdef CONFIG_XEN -idtentry xen_hypervisor_callback xen_do_hypervisor_callback has_error_code=0 +idtentry hypervisor_callback xen_do_hypervisor_callback has_error_code=0 /* * A note on the "critical region" in our callback handler. 
@@ -923,14 +1201,14 @@ * Since we don't modify %rdi, evtchn_do_upall(struct *pt_regs) will * see the correct pointer to the pt_regs */ + UNWIND_HINT_FUNC movq %rdi, %rsp /* we don't return, adjust the stack frame */ -11: incl PER_CPU_VAR(irq_count) - movq %rsp, %rbp - cmovzq PER_CPU_VAR(irq_stack_ptr), %rsp - pushq %rbp /* frame pointer backlink */ + UNWIND_HINT_REGS + + ENTER_IRQ_STACK old_rsp=%r10 call xen_evtchn_do_upcall - popq %rsp - decl PER_CPU_VAR(irq_count) + LEAVE_IRQ_STACK + #ifndef CONFIG_PREEMPT call xen_maybe_preempt_hcall #endif @@ -951,6 +1229,7 @@ * with its current contents: any discrepancy means we in category 1. */ ENTRY(xen_failsafe_callback) + UNWIND_HINT_EMPTY movl %ds, %ecx cmpw %cx, 0x10(%rsp) jne 1f @@ -968,13 +1247,13 @@ movq 8(%rsp), %r11 addq $0x30, %rsp pushq $0 /* RIP */ - pushq %r11 - pushq %rcx + UNWIND_HINT_IRET_REGS offset=8 jmp general_protection 1: /* Segment mismatch => Category 1 (Bad segment). Retry the IRET. */ movq (%rsp), %rcx movq 8(%rsp), %r11 addq $0x30, %rsp + UNWIND_HINT_IRET_REGS pushq $-1 /* orig_ax = -1 => not a system call */ ALLOC_PT_GPREGS_ON_STACK SAVE_C_REGS @@ -998,13 +1277,13 @@ idtentry stack_segment do_stack_segment has_error_code=1 #ifdef CONFIG_XEN -idtentry xen_debug do_debug has_error_code=0 -idtentry xen_int3 do_int3 has_error_code=0 -idtentry xen_stack_segment do_stack_segment has_error_code=1 +idtentry xennmi do_nmi has_error_code=0 +idtentry xendebug do_debug has_error_code=0 +idtentry xenint3 do_int3 has_error_code=0 #endif idtentry general_protection do_general_protection has_error_code=1 -trace_idtentry page_fault do_page_fault has_error_code=1 +idtentry page_fault do_page_fault has_error_code=1 #ifdef CONFIG_KVM_GUEST idtentry async_page_fault do_async_page_fault has_error_code=1 @@ -1020,9 +1299,14 @@ * Return: ebx=0: need swapgs on exit, ebx=1: otherwise */ ENTRY(paranoid_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + /* + * Do the stuffing unconditionally from user/kernel to be safe + */ + STUFF_RSB ENCODE_FRAME_POINTER 8 movl $1, %ebx movl $MSR_GS_BASE, %ecx @@ -1031,7 +1315,12 @@ js 1f /* negative -> in kernel */ SWAPGS xorl %ebx, %ebx -1: ret + +1: + SAVE_AND_SWITCH_TO_KERNEL_CR3 scratch_reg=%rax save_reg=%r14 + ENABLE_IBRS_CLOBBER + + ret END(paranoid_entry) /* @@ -1047,20 +1336,19 @@ * On entry, ebx is "no swapgs" flag (1: don't need swapgs, 0: need it) */ ENTRY(paranoid_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF_DEBUG testl %ebx, %ebx /* swapgs needed? */ - jnz paranoid_exit_no_swapgs + jnz .Lparanoid_exit_no_swapgs TRACE_IRQS_IRETQ + RESTORE_CR3 scratch_reg=%rbx save_reg=%r14 SWAPGS_UNSAFE_STACK - jmp paranoid_exit_restore -paranoid_exit_no_swapgs: + jmp .Lparanoid_exit_restore +.Lparanoid_exit_no_swapgs: TRACE_IRQS_IRETQ_DEBUG -paranoid_exit_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS - REMOVE_PT_GPREGS_FROM_STACK 8 - INTERRUPT_RETURN +.Lparanoid_exit_restore: + jmp restore_regs_and_return_to_kernel END(paranoid_exit) /* @@ -1068,9 +1356,11 @@ * Return: EBX=0: came from user mode; EBX=1: otherwise */ ENTRY(error_entry) + UNWIND_HINT_FUNC cld SAVE_C_REGS 8 SAVE_EXTRA_REGS 8 + STUFF_RSB ENCODE_FRAME_POINTER 8 xorl %ebx, %ebx testb $3, CS+8(%rsp) @@ -1081,8 +1371,20 @@ * from user mode due to an IRET fault. */ SWAPGS + /* We have user CR3. Change to kernel CR3. */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + + ENABLE_IBRS .Lerror_entry_from_usermode_after_swapgs: + /* Put us onto the real thread stack. 
*/ + popq %r12 /* save return addr in %12 */ + movq %rsp, %rdi /* arg0 = pt_regs pointer */ + call sync_regs + movq %rax, %rsp /* switch stack */ + ENCODE_FRAME_POINTER + pushq %r12 + /* * We need to tell lockdep that IRQs are off. We can't do this until * we fix gsbase, and we should do it before enter_from_user_mode @@ -1119,6 +1421,8 @@ * .Lgs_change's error handler with kernel gsbase. */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + ENABLE_IBRS_CLOBBER jmp .Lerror_entry_done .Lbstep_iret: @@ -1128,10 +1432,12 @@ .Lerror_bad_iret: /* - * We came from an IRET to user mode, so we have user gsbase. - * Switch to kernel gsbase: + * We came from an IRET to user mode, so we have user + * gsbase and CR3. Switch to kernel gsbase and CR3: */ SWAPGS + SWITCH_TO_KERNEL_CR3 scratch_reg=%rax + ENABLE_IBRS_CLOBBER /* * Pretend that the exception came from user mode: set up pt_regs @@ -1152,6 +1458,7 @@ * 0: user gsbase is loaded, we need SWAPGS and standard preparation for return to usermode */ ENTRY(error_exit) + UNWIND_HINT_REGS DISABLE_INTERRUPTS(CLBR_ANY) TRACE_IRQS_OFF testl %ebx, %ebx @@ -1159,19 +1466,16 @@ jmp retint_user END(error_exit) -/* Runs on exception stack */ +/* + * Runs on exception stack. Xen PV does not go through this path at all, + * so we can use real assembly here. + * + * Registers: + * %r14: Used to save/restore the CR3 of the interrupted context + * when PAGE_TABLE_ISOLATION is in use. Do not clobber. + */ ENTRY(nmi) - /* - * Fix up the exception frame if we're on Xen. - * PARAVIRT_ADJUST_EXCEPTION_FRAME is guaranteed to push at most - * one value to the stack on native, so it may clobber the rdx - * scratch slot, but it won't clobber any of the important - * slots past it. - * - * Xen is a different story, because the Xen frame itself overlaps - * the "NMI executing" variable. - */ - PARAVIRT_ADJUST_EXCEPTION_FRAME + UNWIND_HINT_IRET_REGS /* * We allow breakpoints in NMIs. If a breakpoint occurs, then @@ -1230,15 +1534,18 @@ * stacks lest we corrupt the "NMI executing" variable. */ - SWAPGS_UNSAFE_STACK + swapgs cld + SWITCH_TO_KERNEL_CR3 scratch_reg=%rdx movq %rsp, %rdx movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp + UNWIND_HINT_IRET_REGS base=%rdx offset=8 pushq 5*8(%rdx) /* pt_regs->ss */ pushq 4*8(%rdx) /* pt_regs->rsp */ pushq 3*8(%rdx) /* pt_regs->flags */ pushq 2*8(%rdx) /* pt_regs->cs */ pushq 1*8(%rdx) /* pt_regs->rip */ + UNWIND_HINT_IRET_REGS pushq $-1 /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -1255,8 +1562,10 @@ pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS ENCODE_FRAME_POINTER + ENABLE_IBRS /* * At this point we no longer need to worry about stack damage * due to nesting -- we're on the normal thread stack and we're @@ -1271,8 +1580,7 @@ * Return back to user mode. We must *not* do the normal exit * work, because we don't want to enable interrupts. */ - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode .Lnmi_from_kernel: /* @@ -1393,7 +1701,7 @@ popq %rdx /* We are returning to kernel mode, so this cannot result in a fault. */ - INTERRUPT_RETURN + iretq first_nmi: /* Restore rdx. 
*/ @@ -1409,6 +1717,7 @@ .rept 5 pushq 11*8(%rsp) .endr + UNWIND_HINT_IRET_REGS /* Everything up to here is safe from nested NMIs */ @@ -1423,7 +1732,8 @@ pushfq /* RFLAGS */ pushq $__KERNEL_CS /* CS */ pushq $1f /* RIP */ - INTERRUPT_RETURN /* continues at repeat_nmi below */ + iretq /* continues at repeat_nmi below */ + UNWIND_HINT_IRET_REGS 1: #endif @@ -1473,54 +1783,64 @@ * exceptions might do. */ call paranoid_entry + UNWIND_HINT_REGS /* paranoidentry do_nmi, 0; without TRACE_IRQS_OFF */ movq %rsp, %rdi movq $-1, %rsi call do_nmi + RESTORE_CR3 scratch_reg=%r15 save_reg=%r14 + testl %ebx, %ebx /* swapgs needed? */ jnz nmi_restore nmi_swapgs: SWAPGS_UNSAFE_STACK nmi_restore: - RESTORE_EXTRA_REGS - RESTORE_C_REGS + POP_EXTRA_REGS + POP_C_REGS - /* Point RSP at the "iret" frame. */ - REMOVE_PT_GPREGS_FROM_STACK 6*8 + /* + * Skip orig_ax and the "outermost" frame to point RSP at the "iret" + * frame. + */ + addq $6*8, %rsp /* * Clear "NMI executing". Set DF first so that we can easily * distinguish the remaining code between here and IRET from - * the SYSCALL entry and exit paths. On a native kernel, we - * could just inspect RIP, but, on paravirt kernels, - * INTERRUPT_RETURN can translate into a jump into a - * hypercall page. + * the SYSCALL entry and exit paths. + * + * We arguably should just inspect RIP instead, but I (Andy) wrote + * this code when I had the misapprehension that Xen PV supported + * NMIs, and Xen PV would break that approach. */ std movq $0, 5*8(%rsp) /* clear "NMI executing" */ /* - * INTERRUPT_RETURN reads the "iret" frame and exits the NMI - * stack in a single instruction. We are returning to kernel - * mode, so this cannot result in a fault. + * iretq reads the "iret" frame and exits the NMI stack in a + * single instruction. We are returning to kernel mode, so this + * cannot result in a fault. Similarly, we don't need to worry + * about espfix64 on the way back to kernel mode. */ - INTERRUPT_RETURN + iretq END(nmi) ENTRY(ignore_sysret) + UNWIND_HINT_EMPTY mov $-ENOSYS, %eax sysret END(ignore_sysret) ENTRY(rewind_stack_do_exit) + UNWIND_HINT_FUNC /* Prevent any naive code from trying to unwind to our caller. */ xorl %ebp, %ebp movq PER_CPU_VAR(cpu_current_top_of_stack), %rax - leaq -TOP_OF_KERNEL_STACK_PADDING-PTREGS_SIZE(%rax), %rsp + leaq -PTREGS_SIZE(%rax), %rsp + UNWIND_HINT_FUNC sp_offset=PTREGS_SIZE call do_exit -1: jmp 1b END(rewind_stack_do_exit) --- linux-azure-4.13.0.orig/arch/x86/entry/entry_64_compat.S +++ linux-azure-4.13.0/arch/x86/entry/entry_64_compat.S @@ -13,6 +13,7 @@ #include #include #include +#include #include #include @@ -47,7 +48,11 @@ */ ENTRY(entry_SYSENTER_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + SWAPGS + + /* We are about to clobber %rsp anyway, clobbering here is OK */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* @@ -91,6 +96,9 @@ pushq $0 /* pt_regs->r15 = 0 */ cld + ENABLE_IBRS + STUFF_RSB + /* * SYSENTER doesn't filter flags, so we need to clear NT and AC * ourselves. To save a few cycles, we can check whether @@ -183,14 +191,17 @@ */ ENTRY(entry_SYSCALL_compat) /* Interrupts are off on entry. */ - SWAPGS_UNSAFE_STACK + swapgs - /* Stash user ESP and switch to the kernel stack. */ + /* Stash user ESP. */ movl %esp, %r8d - movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp - /* Zero-extending 32-bit regs, do not remove */ - movl %eax, %eax + /* Use %rsp as scratch reg.
User ESP is stashed in r8 */ + SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp + ENABLE_IBRS + + /* Switch to the kernel stack */ + movq PER_CPU_VAR(cpu_current_top_of_stack), %rsp /* Construct struct pt_regs on stack */ pushq $__USER32_DS /* pt_regs->ss */ @@ -198,6 +209,8 @@ pushq %r11 /* pt_regs->flags */ pushq $__USER32_CS /* pt_regs->cs */ pushq %rcx /* pt_regs->ip */ +GLOBAL(entry_SYSCALL_compat_after_hwframe) + movl %eax, %eax /* discard orig_ax high bits */ pushq %rax /* pt_regs->orig_ax */ pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ @@ -215,6 +228,8 @@ pushq $0 /* pt_regs->r14 = 0 */ pushq $0 /* pt_regs->r15 = 0 */ + STUFF_RSB + /* * User mode is traced as though IRQs are on, and SYSENTER * turned them off. @@ -241,6 +256,7 @@ popq %rsi /* pt_regs->si */ popq %rdi /* pt_regs->di */ + DISABLE_IBRS /* * USERGS_SYSRET32 does: * GSBASE = user's GS base @@ -256,10 +272,22 @@ * when the system call started, which is already known to user * code. We zero R8-R10 to avoid info leaks. */ + movq RSP-ORIG_RAX(%rsp), %rsp + + /* + * The original userspace %rsp (RSP-ORIG_RAX(%rsp)) is stored + * on the process stack which is not mapped to userspace and + * not readable after we SWITCH_TO_USER_CR3. Delay the CR3 + * switch until after the last reference to the process + * stack. + * + * %r8/%r9 are zeroed before the sysret, thus safe to clobber. + */ + SWITCH_TO_USER_CR3_NOSTACK scratch_reg=%r8 scratch_reg2=%r9 + xorq %r8, %r8 xorq %r9, %r9 xorq %r10, %r10 - movq RSP-ORIG_RAX(%rsp), %rsp swapgs sysretl END(entry_SYSCALL_compat) @@ -294,7 +322,6 @@ /* * Interrupts are off on entry. */ - PARAVIRT_ADJUST_EXCEPTION_FRAME ASM_CLAC /* Do this early to minimize exposure */ SWAPGS @@ -307,8 +334,11 @@ */ movl %eax, %eax - /* Construct struct pt_regs on stack (iret frame is already on stack) */ pushq %rax /* pt_regs->orig_ax */ + + /* switch to thread stack expects orig_ax to be pushed */ + call switch_to_thread_stack + pushq %rdi /* pt_regs->di */ pushq %rsi /* pt_regs->si */ pushq %rdx /* pt_regs->dx */ @@ -326,6 +356,9 @@ pushq %r15 /* pt_regs->r15 */ cld + ENABLE_IBRS + STUFF_RSB + /* * User mode is traced as though IRQs are on, and the interrupt * gate turned them off. @@ -338,8 +371,7 @@ /* Go back to user mode. */ TRACE_IRQS_ON - SWAPGS - jmp restore_regs_and_iret + jmp swapgs_restore_regs_and_return_to_usermode END(entry_INT80_compat) ALIGN --- linux-azure-4.13.0.orig/arch/x86/entry/syscalls/Makefile +++ linux-azure-4.13.0/arch/x86/entry/syscalls/Makefile @@ -1,5 +1,5 @@ -out := $(obj)/../../include/generated/asm -uapi := $(obj)/../../include/generated/uapi/asm +out := arch/$(SRCARCH)/include/generated/asm +uapi := arch/$(SRCARCH)/include/generated/uapi/asm # Create output directory if not already present _dummy := $(shell [ -d '$(out)' ] || mkdir -p '$(out)') \ --- linux-azure-4.13.0.orig/arch/x86/entry/vsyscall/vsyscall_64.c +++ linux-azure-4.13.0/arch/x86/entry/vsyscall/vsyscall_64.c @@ -36,6 +36,7 @@ #include #include #include +#include #define CREATE_TRACE_POINTS #include "vsyscall_trace.h" @@ -137,6 +138,10 @@ WARN_ON_ONCE(address != regs->ip); + /* This should be unreachable in NATIVE mode. */ + if (WARN_ON(vsyscall_mode == NATIVE)) + return false; + if (vsyscall_mode == NONE) { warn_bad_vsyscall(KERN_INFO, regs, "vsyscall attempted with vsyscall=none"); @@ -328,16 +333,47 @@ return vsyscall_mode != NONE && (addr & PAGE_MASK) == VSYSCALL_ADDR; } +/* + * The VSYSCALL page is the only user-accessible page in the kernel address
Normally, the kernel page tables can have _PAGE_USER clear, but + * the tables covering VSYSCALL_ADDR need _PAGE_USER set if vsyscalls + * are enabled. + * + * Some day we may create a "minimal" vsyscall mode in which we emulate + * vsyscalls but leave the page not present. If so, we skip calling + * this. + */ +void __init set_vsyscall_pgtable_user_bits(pgd_t *root) +{ + pgd_t *pgd; + p4d_t *p4d; + pud_t *pud; + pmd_t *pmd; + + pgd = pgd_offset_pgd(root, VSYSCALL_ADDR); + set_pgd(pgd, __pgd(pgd_val(*pgd) | _PAGE_USER)); + p4d = p4d_offset(pgd, VSYSCALL_ADDR); +#if CONFIG_PGTABLE_LEVELS >= 5 + p4d->p4d |= _PAGE_USER; +#endif + pud = pud_offset(p4d, VSYSCALL_ADDR); + set_pud(pud, __pud(pud_val(*pud) | _PAGE_USER)); + pmd = pmd_offset(pud, VSYSCALL_ADDR); + set_pmd(pmd, __pmd(pmd_val(*pmd) | _PAGE_USER)); +} + void __init map_vsyscall(void) { extern char __vsyscall_page; unsigned long physaddr_vsyscall = __pa_symbol(&__vsyscall_page); - if (vsyscall_mode != NONE) + if (vsyscall_mode != NONE) { __set_fixmap(VSYSCALL_PAGE, physaddr_vsyscall, vsyscall_mode == NATIVE ? PAGE_KERNEL_VSYSCALL : PAGE_KERNEL_VVAR); + set_vsyscall_pgtable_user_bits(swapper_pg_dir); + } BUILD_BUG_ON((unsigned long)__fix_to_virt(VSYSCALL_PAGE) != (unsigned long)VSYSCALL_ADDR); --- linux-azure-4.13.0.orig/arch/x86/events/core.c +++ linux-azure-4.13.0/arch/x86/events/core.c @@ -2336,7 +2336,7 @@ struct ldt_struct *ldt; /* IRQs are off, so this synchronizes with smp_store_release */ - ldt = lockless_dereference(current->active_mm->context.ldt); + ldt = READ_ONCE(current->active_mm->context.ldt); if (!ldt || idx >= ldt->nr_entries) return 0; --- linux-azure-4.13.0.orig/arch/x86/events/intel/Makefile +++ linux-azure-4.13.0/arch/x86/events/intel/Makefile @@ -1,4 +1,4 @@ -obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o cqm.o +obj-$(CONFIG_CPU_SUP_INTEL) += core.o bts.o obj-$(CONFIG_CPU_SUP_INTEL) += ds.o knc.o obj-$(CONFIG_CPU_SUP_INTEL) += lbr.o p4.o p6.o pt.o obj-$(CONFIG_PERF_EVENTS_INTEL_RAPL) += intel-rapl-perf.o --- linux-azure-4.13.0.orig/arch/x86/events/intel/bts.c +++ linux-azure-4.13.0/arch/x86/events/intel/bts.c @@ -582,6 +582,24 @@ if (!boot_cpu_has(X86_FEATURE_DTES64) || !x86_pmu.bts) return -ENODEV; + if (boot_cpu_has(X86_FEATURE_PTI)) { + /* + * BTS hardware writes through a virtual memory map we must + * either use the kernel physical map, or the user mapping of + * the AUX buffer. + * + * However, since this driver supports per-CPU and per-task inherit + * we cannot use the user mapping since it will not be availble + * if we're not running the owning process. + * + * With PTI we can't use the kernal map either, because its not + * there when we run userspace. + * + * For now, disable this driver when using PTI. 
+ */ + return -ENODEV; + } + bts_pmu.capabilities = PERF_PMU_CAP_AUX_NO_SG | PERF_PMU_CAP_ITRACE | PERF_PMU_CAP_EXCLUSIVE; bts_pmu.task_ctx_nr = perf_sw_context; --- linux-azure-4.13.0.orig/arch/x86/events/intel/core.c +++ linux-azure-4.13.0/arch/x86/events/intel/core.c @@ -2958,6 +2958,10 @@ if (event->attr.use_clockid) flags &= ~PERF_SAMPLE_TIME; + if (!event->attr.exclude_kernel) + flags &= ~PERF_SAMPLE_REGS_USER; + if (event->attr.sample_regs_user & ~PEBS_REGS) + flags &= ~(PERF_SAMPLE_REGS_USER | PERF_SAMPLE_REGS_INTR); return flags; } @@ -3905,6 +3909,7 @@ intel_pmu_pebs_data_source_nhm(); x86_add_quirk(intel_nehalem_quirk); + x86_pmu.pebs_no_tlb = 1; pr_cont("Nehalem events, "); break; @@ -4207,6 +4212,8 @@ skl_format_attr); WARN_ON(!x86_pmu.format_attrs); x86_pmu.cpu_events = hsw_events_attrs; + intel_pmu_pebs_data_source_skl( + boot_cpu_data.x86_model == INTEL_FAM6_SKYLAKE_X); pr_cont("Skylake events, "); break; --- linux-azure-4.13.0.orig/arch/x86/events/intel/ds.c +++ linux-azure-4.13.0/arch/x86/events/intel/ds.c @@ -2,16 +2,19 @@ #include #include +#include #include +#include #include #include "../perf_event.h" +/* Waste a full page so it can be mapped into the cpu_entry_area */ +DEFINE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + /* The size of a BTS record in bytes: */ #define BTS_RECORD_SIZE 24 -#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) -#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) #define PEBS_FIXUP_SIZE PAGE_SIZE /* @@ -49,34 +52,47 @@ */ #define P(a, b) PERF_MEM_S(a, b) #define OP_LH (P(OP, LOAD) | P(LVL, HIT)) +#define LEVEL(x) P(LVLNUM, x) +#define REM P(REMOTE, REMOTE) #define SNOOP_NONE_MISS (P(SNOOP, NONE) | P(SNOOP, MISS)) /* Version for Sandy Bridge and later */ static u64 pebs_data_source[] = { - P(OP, LOAD) | P(LVL, MISS) | P(LVL, L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ - OP_LH | P(LVL, L1) | P(SNOOP, NONE), /* 0x01: L1 local */ - OP_LH | P(LVL, LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ - OP_LH | P(LVL, L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ - OP_LH | P(LVL, L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ - OP_LH | P(LVL, L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ - OP_LH | P(LVL, L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ - OP_LH | P(LVL, REM_CCE1) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ - OP_LH | P(LVL, LOC_RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ - OP_LH | P(LVL, REM_RAM1) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ - OP_LH | P(LVL, LOC_RAM) | SNOOP_NONE_MISS,/* 0x0c: L3 miss, excl */ - OP_LH | P(LVL, REM_RAM1) | SNOOP_NONE_MISS,/* 0x0d: L3 miss, excl */ - OP_LH | P(LVL, IO) | P(SNOOP, NONE), /* 0x0e: I/O */ - OP_LH | P(LVL, UNC) | P(SNOOP, NONE), /* 0x0f: uncached */ + P(OP, LOAD) | P(LVL, MISS) | LEVEL(L3) | P(SNOOP, NA),/* 0x00:ukn L3 */ + OP_LH | P(LVL, L1) | LEVEL(L1) | P(SNOOP, NONE), /* 0x01: L1 local */ + OP_LH | P(LVL, LFB) | LEVEL(LFB) | P(SNOOP, NONE), /* 0x02: LFB hit */ + OP_LH | P(LVL, L2) | LEVEL(L2) | P(SNOOP, NONE), /* 0x03: L2 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, NONE), /* 0x04: L3 hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, MISS), /* 0x05: L3 hit, snoop miss */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT), /* 0x06: L3 hit, snoop hit */ + OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM), /* 0x07: L3 hit, snoop hitm */ + OP_LH | P(LVL, REM_CCE1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x08: L3 miss snoop hit */ + OP_LH | P(LVL, REM_CCE1) | 
REM | LEVEL(L3) | P(SNOOP, HITM), /* 0x09: L3 miss snoop hitm*/ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | P(SNOOP, HIT), /* 0x0a: L3 miss, shared */ + OP_LH | P(LVL, REM_RAM1) | REM | LEVEL(L3) | P(SNOOP, HIT), /* 0x0b: L3 miss, shared */ + OP_LH | P(LVL, LOC_RAM) | LEVEL(RAM) | SNOOP_NONE_MISS, /* 0x0c: L3 miss, excl */ + OP_LH | P(LVL, REM_RAM1) | LEVEL(RAM) | REM | SNOOP_NONE_MISS, /* 0x0d: L3 miss, excl */ + OP_LH | P(LVL, IO) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0e: I/O */ + OP_LH | P(LVL, UNC) | LEVEL(NA) | P(SNOOP, NONE), /* 0x0f: uncached */ }; /* Patch up minor differences in the bits */ void __init intel_pmu_pebs_data_source_nhm(void) { - pebs_data_source[0x05] = OP_LH | P(LVL, L3) | P(SNOOP, HIT); - pebs_data_source[0x06] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); - pebs_data_source[0x07] = OP_LH | P(LVL, L3) | P(SNOOP, HITM); + pebs_data_source[0x05] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HIT); + pebs_data_source[0x06] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); + pebs_data_source[0x07] = OP_LH | P(LVL, L3) | LEVEL(L3) | P(SNOOP, HITM); +} + +void __init intel_pmu_pebs_data_source_skl(bool pmem) +{ + u64 pmem_or_l4 = pmem ? LEVEL(PMEM) : LEVEL(L4); + + pebs_data_source[0x08] = OP_LH | pmem_or_l4 | P(SNOOP, HIT); + pebs_data_source[0x09] = OP_LH | pmem_or_l4 | REM | P(SNOOP, HIT); + pebs_data_source[0x0b] = OP_LH | LEVEL(RAM) | REM | P(SNOOP, NONE); + pebs_data_source[0x0c] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOPX, FWD); + pebs_data_source[0x0d] = OP_LH | LEVEL(ANY_CACHE) | REM | P(SNOOP, HITM); } static u64 precise_store_data(u64 status) @@ -149,8 +165,6 @@ { union intel_x86_pebs_dse dse; u64 val; - int model = boot_cpu_data.x86_model; - int fam = boot_cpu_data.x86; dse.val = status; @@ -162,8 +176,7 @@ /* * Nehalem models do not support TLB, Lock infos */ - if (fam == 0x6 && (model == 26 || model == 30 - || model == 31 || model == 46)) { + if (x86_pmu.pebs_no_tlb) { val |= P(TLB, NA) | P(LOCK, NA); return val; } @@ -268,17 +281,67 @@ static DEFINE_PER_CPU(void *, insn_buffer); -static int alloc_pebs_buffer(int cpu) +static void ds_update_cea(void *cea, void *addr, size_t size, pgprot_t prot) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + unsigned long start = (unsigned long)cea; + phys_addr_t pa; + size_t msz = 0; + + pa = virt_to_phys(addr); + + preempt_disable(); + for (; msz < size; msz += PAGE_SIZE, pa += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, pa, prot); + + /* + * This is a cross-CPU update of the cpu_entry_area, we must shoot down + * all TLB entries for it. + */ + flush_tlb_kernel_range(start, start + size); + preempt_enable(); +} + +static void ds_clear_cea(void *cea, size_t size) +{ + unsigned long start = (unsigned long)cea; + size_t msz = 0; + + preempt_disable(); + for (; msz < size; msz += PAGE_SIZE, cea += PAGE_SIZE) + cea_set_pte(cea, 0, PAGE_NONE); + + flush_tlb_kernel_range(start, start + size); + preempt_enable(); +} + +static void *dsalloc_pages(size_t size, gfp_t flags, int cpu) +{ + unsigned int order = get_order(size); int node = cpu_to_node(cpu); - int max; - void *buffer, *ibuffer; + struct page *page; + + page = __alloc_pages_node(node, flags | __GFP_ZERO, order); + return page ? 
page_address(page) : NULL; +} + +static void dsfree_pages(const void *buffer, size_t size) +{ + if (buffer) + free_pages((unsigned long)buffer, get_order(size)); +} + +static int alloc_pebs_buffer(int cpu) +{ + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + size_t bsiz = x86_pmu.pebs_buffer_size; + int max, node = cpu_to_node(cpu); + void *buffer, *ibuffer, *cea; if (!x86_pmu.pebs) return 0; - buffer = kzalloc_node(x86_pmu.pebs_buffer_size, GFP_KERNEL, node); + buffer = dsalloc_pages(bsiz, GFP_KERNEL, cpu); if (unlikely(!buffer)) return -ENOMEM; @@ -289,25 +352,27 @@ if (x86_pmu.intel_cap.pebs_format < 2) { ibuffer = kzalloc_node(PEBS_FIXUP_SIZE, GFP_KERNEL, node); if (!ibuffer) { - kfree(buffer); + dsfree_pages(buffer, bsiz); return -ENOMEM; } per_cpu(insn_buffer, cpu) = ibuffer; } - - max = x86_pmu.pebs_buffer_size / x86_pmu.pebs_record_size; - - ds->pebs_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_pebs_vaddr = buffer; + /* Update the cpu entry area mapping */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds->pebs_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, bsiz, PAGE_KERNEL); ds->pebs_index = ds->pebs_buffer_base; - ds->pebs_absolute_maximum = ds->pebs_buffer_base + - max * x86_pmu.pebs_record_size; - + max = x86_pmu.pebs_record_size * (bsiz / x86_pmu.pebs_record_size); + ds->pebs_absolute_maximum = ds->pebs_buffer_base + max; return 0; } static void release_pebs_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.pebs) return; @@ -315,73 +380,70 @@ kfree(per_cpu(insn_buffer, cpu)); per_cpu(insn_buffer, cpu) = NULL; - kfree((void *)(unsigned long)ds->pebs_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.pebs_buffer; + ds_clear_cea(cea, x86_pmu.pebs_buffer_size); ds->pebs_buffer_base = 0; + dsfree_pages(hwev->ds_pebs_vaddr, x86_pmu.pebs_buffer_size); + hwev->ds_pebs_vaddr = NULL; } static int alloc_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - int node = cpu_to_node(cpu); - int max, thresh; - void *buffer; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *buffer, *cea; + int max; if (!x86_pmu.bts) return 0; - buffer = kzalloc_node(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, node); + buffer = dsalloc_pages(BTS_BUFFER_SIZE, GFP_KERNEL | __GFP_NOWARN, cpu); if (unlikely(!buffer)) { WARN_ONCE(1, "%s: BTS buffer allocation failure\n", __func__); return -ENOMEM; } - - max = BTS_BUFFER_SIZE / BTS_RECORD_SIZE; - thresh = max / 16; - - ds->bts_buffer_base = (u64)(unsigned long)buffer; + hwev->ds_bts_vaddr = buffer; + /* Update the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds->bts_buffer_base = (unsigned long) cea; + ds_update_cea(cea, buffer, BTS_BUFFER_SIZE, PAGE_KERNEL); ds->bts_index = ds->bts_buffer_base; - ds->bts_absolute_maximum = ds->bts_buffer_base + - max * BTS_RECORD_SIZE; - ds->bts_interrupt_threshold = ds->bts_absolute_maximum - - thresh * BTS_RECORD_SIZE; - + max = BTS_RECORD_SIZE * (BTS_BUFFER_SIZE / BTS_RECORD_SIZE); + ds->bts_absolute_maximum = ds->bts_buffer_base + max; + ds->bts_interrupt_threshold = ds->bts_absolute_maximum - (max / 16); return 0; } static void release_bts_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, 
cpu).ds; + struct cpu_hw_events *hwev = per_cpu_ptr(&cpu_hw_events, cpu); + struct debug_store *ds = hwev->ds; + void *cea; if (!ds || !x86_pmu.bts) return; - kfree((void *)(unsigned long)ds->bts_buffer_base); + /* Clear the fixmap */ + cea = &get_cpu_entry_area(cpu)->cpu_debug_buffers.bts_buffer; + ds_clear_cea(cea, BTS_BUFFER_SIZE); ds->bts_buffer_base = 0; + dsfree_pages(hwev->ds_bts_vaddr, BTS_BUFFER_SIZE); + hwev->ds_bts_vaddr = NULL; } static int alloc_ds_buffer(int cpu) { - int node = cpu_to_node(cpu); - struct debug_store *ds; - - ds = kzalloc_node(sizeof(*ds), GFP_KERNEL, node); - if (unlikely(!ds)) - return -ENOMEM; + struct debug_store *ds = &get_cpu_entry_area(cpu)->cpu_debug_store; + memset(ds, 0, sizeof(*ds)); per_cpu(cpu_hw_events, cpu).ds = ds; - return 0; } static void release_ds_buffer(int cpu) { - struct debug_store *ds = per_cpu(cpu_hw_events, cpu).ds; - - if (!ds) - return; - per_cpu(cpu_hw_events, cpu).ds = NULL; - kfree(ds); } void release_ds_buffers(void) --- linux-azure-4.13.0.orig/arch/x86/events/perf_event.h +++ linux-azure-4.13.0/arch/x86/events/perf_event.h @@ -14,6 +14,8 @@ #include +#include + /* To enable MSR tracing please use the generic trace points. */ /* @@ -77,38 +79,41 @@ struct event_constraint event_constraints[X86_PMC_IDX_MAX]; }; -/* The maximal number of PEBS events: */ -#define MAX_PEBS_EVENTS 8 #define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* * Flags PEBS can handle without an PMI. * * TID can only be handled by flushing at context switch. + * REGS_USER can be handled for events limited to ring 3. * */ #define PEBS_FREERUNNING_FLAGS \ (PERF_SAMPLE_IP | PERF_SAMPLE_TID | PERF_SAMPLE_ADDR | \ PERF_SAMPLE_ID | PERF_SAMPLE_CPU | PERF_SAMPLE_STREAM_ID | \ PERF_SAMPLE_DATA_SRC | PERF_SAMPLE_IDENTIFIER | \ - PERF_SAMPLE_TRANSACTION) + PERF_SAMPLE_TRANSACTION | \ + PERF_SAMPLE_REGS_INTR | PERF_SAMPLE_REGS_USER) -/* - * A debug store configuration. - * - * We only support architectures that use 64bit fields. - */ -struct debug_store { - u64 bts_buffer_base; - u64 bts_index; - u64 bts_absolute_maximum; - u64 bts_interrupt_threshold; - u64 pebs_buffer_base; - u64 pebs_index; - u64 pebs_absolute_maximum; - u64 pebs_interrupt_threshold; - u64 pebs_event_reset[MAX_PEBS_EVENTS]; -}; +#define PEBS_REGS \ + (PERF_REG_X86_AX | \ + PERF_REG_X86_BX | \ + PERF_REG_X86_CX | \ + PERF_REG_X86_DX | \ + PERF_REG_X86_DI | \ + PERF_REG_X86_SI | \ + PERF_REG_X86_SP | \ + PERF_REG_X86_BP | \ + PERF_REG_X86_IP | \ + PERF_REG_X86_FLAGS | \ + PERF_REG_X86_R8 | \ + PERF_REG_X86_R9 | \ + PERF_REG_X86_R10 | \ + PERF_REG_X86_R11 | \ + PERF_REG_X86_R12 | \ + PERF_REG_X86_R13 | \ + PERF_REG_X86_R14 | \ + PERF_REG_X86_R15) /* * Per register state. 
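Tying the two PEBS hunks together: PERF_SAMPLE_REGS_USER only stays in the free-running (multi-entry PEBS) set when the event is limited to ring 3, which is what the core.c hunk above enforces via exclude_kernel. A hypothetical user-space attribute setup that stays on this fast path, sketched under those constraints:

	/* Sketch (attribute choice is illustrative, not from this patch):
	 * a precise event whose sample_type is within PEBS_FREERUNNING_FLAGS
	 * and whose requested user regs fall inside PEBS_REGS. */
	#include <string.h>
	#include <linux/perf_event.h>
	#include <asm/perf_regs.h>

	static struct perf_event_attr pebs_attr(void)
	{
		struct perf_event_attr attr;

		memset(&attr, 0, sizeof(attr));
		attr.size = sizeof(attr);
		attr.type = PERF_TYPE_HARDWARE;
		attr.config = PERF_COUNT_HW_CPU_CYCLES;
		attr.precise_ip = 2;		/* request PEBS */
		attr.exclude_kernel = 1;	/* required for REGS_USER, see core.c hunk */
		attr.sample_type = PERF_SAMPLE_IP | PERF_SAMPLE_REGS_USER;
		attr.sample_regs_user = (1ULL << PERF_REG_X86_IP) |
					(1ULL << PERF_REG_X86_SP);
		return attr;
	}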
@@ -194,6 +199,8 @@ * Intel DebugStore bits */ struct debug_store *ds; + void *ds_pebs_vaddr; + void *ds_bts_vaddr; u64 pebs_enabled; int n_pebs; int n_large_pebs; @@ -591,7 +598,8 @@ pebs :1, pebs_active :1, pebs_broken :1, - pebs_prec_dist :1; + pebs_prec_dist :1, + pebs_no_tlb :1; int pebs_record_size; int pebs_buffer_size; void (*drain_pebs)(struct pt_regs *regs); @@ -947,6 +955,8 @@ void intel_pmu_pebs_data_source_nhm(void); +void intel_pmu_pebs_data_source_skl(bool pmem); + int intel_pmu_setup_lbr_filter(struct perf_event *event); void intel_pt_interrupt(void); --- linux-azure-4.13.0.orig/arch/x86/hyperv/Makefile +++ linux-azure-4.13.0/arch/x86/hyperv/Makefile @@ -1 +1 @@ -obj-y := hv_init.o +obj-y := hv_init.o mmu.o --- linux-azure-4.13.0.orig/arch/x86/hyperv/hv_init.c +++ linux-azure-4.13.0/arch/x86/hyperv/hv_init.c @@ -26,6 +26,18 @@ #include #include #include +#include +#include + +#include + +#ifndef PKG_ABI +/* + * Preserve the ability to 'make deb-pkg' since PKG_ABI is provided + * by the Ubuntu build rules. + */ +#define PKG_ABI 0 +#endif #ifdef CONFIG_HYPERV_TSCPAGE @@ -75,10 +87,30 @@ .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +u32 hv_max_vp_index; + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = (u32)msr_vp_index; + + if (msr_vp_index > hv_max_vp_index) + hv_max_vp_index = msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -91,28 +123,40 @@ u64 guest_id; union hv_x64_msr_hypercall_contents hypercall_msr; - if (x86_hyper != &x86_hyper_ms_hyperv) + if (x86_hyper_type != X86_HYPER_MS_HYPERV) return; + /* Allocate percpu VP index */ + hv_vp_index = kcalloc(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID * 2. Enable the hypercall and register the hypercall page */ - guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); + guest_id = generate_guest_id(0x80 /*Canonical*/, LINUX_VERSION_CODE, PKG_ABI); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hyper_alloc_mmu(); + /* * Register Hyper-V specific clocksource. 
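hv_cpu_init() records each CPU's VP index as it comes online; the hv_cpu_number_to_vp_number() helper used by the TLB-flush code below then presumably reduces to a table lookup (a sketch of the expected mshyperv.h definition, which is outside this hunk):

	/* Sketch: translate a Linux CPU number into the Hyper-V virtual
	 * processor number expected by the flush hypercalls in mmu.c. */
	static inline int hv_cpu_number_to_vp_number(int cpu_number)
	{
		return hv_vp_index[cpu_number];
	}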
*/ @@ -148,6 +192,12 @@ hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* @@ -170,51 +220,6 @@ } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; --- linux-azure-4.13.0.orig/arch/x86/hyperv/mmu.c +++ linux-azure-4.13.0/arch/x86/hyperv/mmu.c @@ -0,0 +1,309 @@ +#define pr_fmt(fmt) "Hyper-V: " fmt + +#include +#include +#include +#include + +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +#include + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_flush_pcpu { + u64 address_space; + u64 flags; + u64 processor_mask; + u64 gva_list[]; +}; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_flush_pcpu_ex { + u64 address_space; + u64 flags; + struct { + u64 format; + u64 valid_bank_mask; + u64 bank_contents[]; + } hv_vp_set; + u64 gva_list[]; +}; + +/* Each gva in gva_list encodes up to 4096 pages to flush */ +#define HV_TLB_FLUSH_UNIT (4096 * PAGE_SIZE) + +static struct hv_flush_pcpu __percpu **pcpu_flush; + +static struct hv_flush_pcpu_ex __percpu **pcpu_flush_ex; + +/* + * Fills in gva_list starting from offset. Returns the number of items added. + */ +static inline int fill_gva_list(u64 gva_list[], int offset, + unsigned long start, unsigned long end) +{ + int gva_n = offset; + unsigned long cur = start, diff; + + do { + diff = end > cur ? end - cur : 0; + + gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). 
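A worked example of this encoding (assuming 4 KiB pages, so HV_TLB_FLUSH_UNIT is 16 MiB):

	/* Flushing three pages yields one gva_list entry:
	 *   diff        = 3 * 4096 = 0x3000 (< HV_TLB_FLUSH_UNIT)
	 *   gva_list[0] = (start & PAGE_MASK) | ((0x3000 - 1) >> PAGE_SHIFT)
	 *               = start | 2		-> the 'cur' page plus 2 more
	 * A span of HV_TLB_FLUSH_UNIT or larger instead saturates the low
	 * 12 bits (gva |= ~PAGE_MASK), covering all 4096 pages of the unit,
	 * and the loop advances cur by HV_TLB_FLUSH_UNIT for the next entry.
	 */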
+ */ + if (diff >= HV_TLB_FLUSH_UNIT) + gva_list[gva_n] |= ~PAGE_MASK; + else if (diff) + gva_list[gva_n] |= (diff - 1) >> PAGE_SHIFT; + + cur += HV_TLB_FLUSH_UNIT; + gva_n++; + + } while (cur < end); + + return gva_n - offset; +} + +/* Return the number of banks in the resulting vp_set */ +static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, + const struct cpumask *cpus) +{ + int cpu, vcpu, vcpu_bank, vcpu_offset, nr_bank = 1; + + /* valid_bank_mask can represent up to 64 banks */ + if (hv_max_vp_index / 64 >= 64) + return 0; + + /* + * Clear all banks up to the maximum possible bank as hv_flush_pcpu_ex + * structs are not cleared between calls, we risk flushing unneeded + * vCPUs otherwise. + */ + for (vcpu_bank = 0; vcpu_bank <= hv_max_vp_index / 64; vcpu_bank++) + flush->hv_vp_set.bank_contents[vcpu_bank] = 0; + + /* + * Some banks may end up being empty but this is acceptable. + */ + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + vcpu_bank = vcpu / 64; + vcpu_offset = vcpu % 64; + __set_bit(vcpu_offset, (unsigned long *) + &flush->hv_vp_set.bank_contents[vcpu_bank]); + if (vcpu_bank >= nr_bank) + nr_bank = vcpu_bank + 1; + } + flush->hv_vp_set.valid_bank_mask = GENMASK_ULL(nr_bank - 1, 0); + + return nr_bank; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int cpu, vcpu, gva_n, max_gvas; + struct hv_flush_pcpu **flush_pcpu; + struct hv_flush_pcpu *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush_pcpu = this_cpu_ptr(pcpu_flush); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ + flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu >= 64) + goto do_native; + + __set_bit(vcpu, (unsigned long *) + &flush->processor_mask); + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
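To make the sparse-bank layout in cpumask_to_vp_set() concrete, consider a mask containing just vCPU 130 (the numbers are illustrative):

	/* vcpu = 130:
	 *   vcpu_bank   = 130 / 64 = 2
	 *   vcpu_offset = 130 % 64 = 2
	 * Bit 2 of hv_vp_set.bank_contents[2] is set, nr_bank becomes 3 and
	 * valid_bank_mask = GENMASK_ULL(2, 0) = 0x7: banks 0 and 1 are
	 * present but all-zero, which is why they are cleared up front.
	 */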
+ */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, 0, + info->start, info->end); + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + const struct flush_tlb_info *info) +{ + int nr_bank = 0, max_gvas, gva_n; + struct hv_flush_pcpu_ex **flush_pcpu; + struct hv_flush_pcpu_ex *flush; + u64 status = U64_MAX; + unsigned long flags; + + trace_hyperv_mmu_flush_tlb_others(cpus, info); + + if (!pcpu_flush_ex || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush_pcpu = this_cpu_ptr(pcpu_flush_ex); + + if (unlikely(!*flush_pcpu)) + *flush_pcpu = page_address(alloc_page(GFP_ATOMIC)); + + flush = *flush_pcpu; + + if (unlikely(!flush)) { + local_irq_restore(flags); + goto do_native; + } + + if (info->mm) { + /* + * AddressSpace argument must match the CR3 with PCID bits + * stripped out. + */ + flush->address_space = virt_to_phys(info->mm->pgd); + flush->address_space &= CR3_ADDR_MASK; + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + if (!cpumask_equal(cpus, cpu_present_mask)) { + flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K; + nr_bank = cpumask_to_vp_set(flush, cpus); + } + + if (!nr_bank) { + flush->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. 
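The max_gvas bound is plain arithmetic on the single input page handed to the hypercall (figures assume 4 KiB pages):

	/* Simple variant: sizeof(*flush) = 3 * 8 = 24 bytes of header, so
	 *   max_gvas = (4096 - 24) / 8 = 509 entries,
	 * each covering up to HV_TLB_FLUSH_UNIT = 16 MiB, i.e. one
	 * HvFlushVirtualAddressList call spans roughly 8 GiB before the
	 * code falls back to flushing the whole address space. The _ex
	 * variant additionally loses nr_bank * 8 bytes of the page to the
	 * sparse bank_contents[] array, as computed above.
	 */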
+ */ + max_gvas = + (PAGE_SIZE - sizeof(*flush) - nr_bank * + sizeof(flush->hv_vp_set.bank_contents[0])) / + sizeof(flush->gva_list[0]); + + if (info->end == TLB_FLUSH_ALL) { + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else if (info->end && + ((info->end - info->start)/HV_TLB_FLUSH_UNIT) > max_gvas) { + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank, flush, NULL); + } else { + gva_n = fill_gva_list(flush->gva_list, nr_bank, + info->start, info->end); + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & HV_HYPERCALL_RESULT_MASK)) + return; +do_native: + native_flush_tlb_others(cpus, info); +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { + pr_info("Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + } else { + pr_info("Using ext hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; + } +} + +void hyper_alloc_mmu(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + pcpu_flush = alloc_percpu(struct hv_flush_pcpu *); + else + pcpu_flush_ex = alloc_percpu(struct hv_flush_pcpu_ex *); +} --- linux-azure-4.13.0.orig/arch/x86/include/asm/alternative-asm.h +++ linux-azure-4.13.0/arch/x86/include/asm/alternative-asm.h @@ -62,8 +62,10 @@ #define new_len2 145f-144f /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax + * + * The additional "-" is needed because gas uses a "true" value of -1. */ #define alt_max_short(a, b) ((a) ^ (((a) ^ (b)) & -(-((a) < (b))))) --- linux-azure-4.13.0.orig/arch/x86/include/asm/alternative.h +++ linux-azure-4.13.0/arch/x86/include/asm/alternative.h @@ -103,12 +103,12 @@ alt_end_marker ":\n" /* - * max without conditionals. Idea adapted from: + * gas compatible max based on the idea from: * http://graphics.stanford.edu/~seander/bithacks.html#IntegerMinOrMax * - * The additional "-" is needed because gas works with s32s. + * The additional "-" is needed because gas uses a "true" value of -1. */ -#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") - (" b ")))))" +#define alt_max_short(a, b) "((" a ") ^ (((" a ") ^ (" b ")) & -(-((" a ") < (" b ")))))" /* * Pad the second replacement alternative with additional NOPs if it is @@ -139,7 +139,7 @@ ".popsection\n" \ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr, feature, 1) \ - ".popsection" + ".popsection\n" #define ALTERNATIVE_2(oldinstr, newinstr1, feature1, newinstr2, feature2)\ OLDINSTR_2(oldinstr, 1, 2) \ @@ -150,7 +150,7 @@ ".pushsection .altinstr_replacement, \"ax\"\n" \ ALTINSTR_REPLACEMENT(newinstr1, feature1, 1) \ ALTINSTR_REPLACEMENT(newinstr2, feature2, 2) \ - ".popsection" + ".popsection\n" /* * Alternative instructions for different CPU types or capabilities. @@ -218,10 +218,9 @@ #define alternative_call_2(oldfunc, newfunc1, feature1, newfunc2, feature2, \ output, input...) 
\ { \ - register void *__sp asm(_ASM_SP); \ asm volatile (ALTERNATIVE_2("call %P[old]", "call %P[new1]", feature1,\ "call %P[new2]", feature2) \ - : output, "+r" (__sp) \ + : output, ASM_CALL_CONSTRAINT \ : [old] "i" (oldfunc), [new1] "i" (newfunc1), \ [new2] "i" (newfunc2), ## input); \ } --- linux-azure-4.13.0.orig/arch/x86/include/asm/archrandom.h +++ linux-azure-4.13.0/arch/x86/include/asm/archrandom.h @@ -45,7 +45,7 @@ bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_LONG "\n\t" + asm volatile(RDRAND_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -59,7 +59,7 @@ bool ok; unsigned int retry = RDRAND_RETRY_LOOPS; do { - asm volatile(RDRAND_INT "\n\t" + asm volatile(RDRAND_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); if (ok) @@ -71,7 +71,7 @@ static inline bool rdseed_long(unsigned long *v) { bool ok; - asm volatile(RDSEED_LONG "\n\t" + asm volatile(RDSEED_LONG CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; @@ -80,7 +80,7 @@ static inline bool rdseed_int(unsigned int *v) { bool ok; - asm volatile(RDSEED_INT "\n\t" + asm volatile(RDSEED_INT CC_SET(c) : CC_OUT(c) (ok), "=a" (*v)); return ok; --- linux-azure-4.13.0.orig/arch/x86/include/asm/asm-prototypes.h +++ linux-azure-4.13.0/arch/x86/include/asm/asm-prototypes.h @@ -10,7 +10,34 @@ #include #include #include +#include #ifndef CONFIG_X86_CMPXCHG64 extern void cmpxchg8b_emu(void); #endif + +#ifdef CONFIG_RETPOLINE +#ifdef CONFIG_X86_32 +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_e ## reg(void); +#else +#define INDIRECT_THUNK(reg) extern asmlinkage void __x86_indirect_thunk_r ## reg(void); +INDIRECT_THUNK(8) +INDIRECT_THUNK(9) +INDIRECT_THUNK(10) +INDIRECT_THUNK(11) +INDIRECT_THUNK(12) +INDIRECT_THUNK(13) +INDIRECT_THUNK(14) +INDIRECT_THUNK(15) +#endif +INDIRECT_THUNK(ax) +INDIRECT_THUNK(bx) +INDIRECT_THUNK(cx) +INDIRECT_THUNK(dx) +INDIRECT_THUNK(si) +INDIRECT_THUNK(di) +INDIRECT_THUNK(bp) +asmlinkage void __fill_rsb(void); +asmlinkage void __clear_rsb(void); + +#endif /* CONFIG_RETPOLINE */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/asm.h +++ linux-azure-4.13.0/arch/x86/include/asm/asm.h @@ -126,4 +126,15 @@ /* For C file, we already have NOKPROBE_SYMBOL macro */ #endif +#ifndef __ASSEMBLY__ +/* + * This output constraint should be used for any inline asm which has a "call" + * instruction. Otherwise the asm may be inserted before the frame pointer + * gets set up by the containing function. If you forget to do this, objtool + * may print a "call without frame pointer save/setup" warning. 
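A usage sketch for the constraint defined just below (the called helper is hypothetical; only the constraint usage is the point):

	/* Hypothetical example: any inline asm that contains a CALL lists
	 * ASM_CALL_CONSTRAINT as an output so the compiler cannot schedule
	 * the asm before the frame pointer is set up. */
	extern void my_asm_helper(void);

	static inline void call_my_asm_helper(void)
	{
		asm volatile("call my_asm_helper"
			     : ASM_CALL_CONSTRAINT
			     : : "memory");
	}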
+ */ +register unsigned int __asm_call_sp asm("esp"); +#define ASM_CALL_CONSTRAINT "+r" (__asm_call_sp) +#endif + #endif /* _ASM_X86_ASM_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/barrier.h +++ linux-azure-4.13.0/arch/x86/include/asm/barrier.h @@ -76,6 +76,8 @@ #endif +#define osb() alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC) + /* Atomic operations are already serializing on x86 */ #define __smp_mb__before_atomic() barrier() #define __smp_mb__after_atomic() barrier() --- linux-azure-4.13.0.orig/arch/x86/include/asm/bitops.h +++ linux-azure-4.13.0/arch/x86/include/asm/bitops.h @@ -142,7 +142,7 @@ static __always_inline bool clear_bit_unlock_is_negative_byte(long nr, volatile unsigned long *addr) { bool negative; - asm volatile(LOCK_PREFIX "andb %2,%1\n\t" + asm volatile(LOCK_PREFIX "andb %2,%1" CC_SET(s) : CC_OUT(s) (negative), ADDR : "ir" ((char) ~(1 << nr)) : "memory"); @@ -245,7 +245,7 @@ { bool oldbit; - asm("bts %2,%1\n\t" + asm("bts %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -285,7 +285,7 @@ { bool oldbit; - asm volatile("btr %2,%1\n\t" + asm volatile("btr %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr)); @@ -297,7 +297,7 @@ { bool oldbit; - asm volatile("btc %2,%1\n\t" + asm volatile("btc %2,%1" CC_SET(c) : CC_OUT(c) (oldbit), ADDR : "Ir" (nr) : "memory"); @@ -328,7 +328,7 @@ { bool oldbit; - asm volatile("bt %2,%1\n\t" + asm volatile("bt %2,%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long *)addr), "Ir" (nr)); --- linux-azure-4.13.0.orig/arch/x86/include/asm/cmdline.h +++ linux-azure-4.13.0/arch/x86/include/asm/cmdline.h @@ -2,5 +2,7 @@ #define _ASM_X86_CMDLINE_H int cmdline_find_option_bool(const char *cmdline_ptr, const char *option); +int cmdline_find_option(const char *cmdline_ptr, const char *option, + char *buffer, int bufsize); #endif /* _ASM_X86_CMDLINE_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/compat.h +++ linux-azure-4.13.0/arch/x86/include/asm/compat.h @@ -6,6 +6,7 @@ */ #include #include +#include #include #include #include --- linux-azure-4.13.0.orig/arch/x86/include/asm/cpu_entry_area.h +++ linux-azure-4.13.0/arch/x86/include/asm/cpu_entry_area.h @@ -0,0 +1,81 @@ +// SPDX-License-Identifier: GPL-2.0 + +#ifndef _ASM_X86_CPU_ENTRY_AREA_H +#define _ASM_X86_CPU_ENTRY_AREA_H + +#include +#include +#include + +/* + * cpu_entry_area is a percpu region that contains things needed by the CPU + * and early entry/exit code. Real types aren't used for all fields here + * to avoid circular header dependencies. + * + * Every field is a virtual alias of some other allocated backing store. + * There is no direct allocation of a struct cpu_entry_area. + */ +struct cpu_entry_area { + char gdt[PAGE_SIZE]; + + /* + * The GDT is just below entry_stack and thus serves (on x86_64) as + * a read-only guard page. + */ + struct entry_stack_page entry_stack_page; + + /* + * On x86_64, the TSS is mapped RO. On x86_32, it's mapped RW because + * we need task switches to work, and task switches write to the TSS. + */ + struct tss_struct tss; + + char entry_trampoline[PAGE_SIZE]; + +#ifdef CONFIG_X86_64 + /* + * Exception stacks used for IST entries. + * + * In the future, this should have a separate slot for each stack + * with guard pages between them. + */ + char exception_stacks[(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]; +#endif +#ifdef CONFIG_CPU_SUP_INTEL + /* + * Per CPU debug store for Intel performance monitoring. Wastes a + * full page at the moment.
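get_cpu_entry_area(), declared further down, presumably reduces to fixed-stride arithmetic inside the CPU_ENTRY_AREA_PER_CPU window; a sketch of the expected out-of-line definition (the real one lives outside this hunk):

	/* Sketch: per-CPU entry areas are laid out back to back, so the
	 * lookup is pure address arithmetic. */
	struct cpu_entry_area *get_cpu_entry_area(int cpu)
	{
		unsigned long va = CPU_ENTRY_AREA_PER_CPU +
				   cpu * CPU_ENTRY_AREA_SIZE;

		BUILD_BUG_ON(sizeof(struct cpu_entry_area) % PAGE_SIZE != 0);
		return (struct cpu_entry_area *)va;
	}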
+ */ + struct debug_store cpu_debug_store; + /* + * The actual PEBS/BTS buffers must be mapped to user space. + * Reserve enough fixmap PTEs. + */ + struct debug_store_buffers cpu_debug_buffers; +#endif +}; + +#define CPU_ENTRY_AREA_SIZE (sizeof(struct cpu_entry_area)) +#define CPU_ENTRY_AREA_TOT_SIZE (CPU_ENTRY_AREA_SIZE * NR_CPUS) + +DECLARE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); + +extern void setup_cpu_entry_areas(void); +extern void cea_set_pte(void *cea_vaddr, phys_addr_t pa, pgprot_t flags); + +#define CPU_ENTRY_AREA_RO_IDT CPU_ENTRY_AREA_BASE +#define CPU_ENTRY_AREA_PER_CPU (CPU_ENTRY_AREA_RO_IDT + PAGE_SIZE) + +#define CPU_ENTRY_AREA_RO_IDT_VADDR ((void *)CPU_ENTRY_AREA_RO_IDT) + +#define CPU_ENTRY_AREA_MAP_SIZE \ + (CPU_ENTRY_AREA_PER_CPU + CPU_ENTRY_AREA_TOT_SIZE - CPU_ENTRY_AREA_BASE) + +extern struct cpu_entry_area *get_cpu_entry_area(int cpu); + +static inline struct entry_stack *cpu_entry_stack(int cpu) +{ + return &get_cpu_entry_area(cpu)->entry_stack_page.stack; +} + +#endif --- linux-azure-4.13.0.orig/arch/x86/include/asm/cpufeature.h +++ linux-azure-4.13.0/arch/x86/include/asm/cpufeature.h @@ -125,16 +125,17 @@ #define boot_cpu_has(bit) cpu_has(&boot_cpu_data, bit) #define set_cpu_cap(c, bit) set_bit(bit, (unsigned long *)((c)->x86_capability)) -#define clear_cpu_cap(c, bit) clear_bit(bit, (unsigned long *)((c)->x86_capability)) -#define setup_clear_cpu_cap(bit) do { \ - clear_cpu_cap(&boot_cpu_data, bit); \ - set_bit(bit, (unsigned long *)cpu_caps_cleared); \ -} while (0) + +extern void setup_clear_cpu_cap(unsigned int bit); +extern void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int bit); + #define setup_force_cpu_cap(bit) do { \ set_cpu_cap(&boot_cpu_data, bit); \ set_bit(bit, (unsigned long *)cpu_caps_set); \ } while (0) +#define setup_force_cpu_bug(bit) setup_force_cpu_cap(bit) + #if defined(CC_HAVE_ASM_GOTO) && defined(CONFIG_X86_FAST_FEATURE_TESTS) /* * Static testing of CPU features. Used the same as boot_cpu_has(). --- linux-azure-4.13.0.orig/arch/x86/include/asm/cpufeatures.h +++ linux-azure-4.13.0/arch/x86/include/asm/cpufeatures.h @@ -12,173 +12,176 @@ /* * Defines x86 CPU feature bits */ -#define NCAPINTS 18 /* N 32-bit words worth of info */ -#define NBUGINTS 1 /* N 32-bit bug flags */ +#define NCAPINTS 18 /* N 32-bit words worth of info */ +#define NBUGINTS 1 /* N 32-bit bug flags */ /* * Note: If the comment begins with a quoted string, that string is used * in /proc/cpuinfo instead of the macro name. If the string is "", * this feature bit is not displayed in /proc/cpuinfo at all. + * + * When adding new features here that depend on other features, + * please update the table in kernel/cpu/cpuid-deps.c as well.
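The cpuid-deps.c table mentioned above pairs each feature with the feature it depends on, so clearing a base feature can transitively clear its dependents via the out-of-line clear_cpu_cap()/setup_clear_cpu_cap() introduced in cpufeature.h. A sketch of the table's likely shape (both the struct and the entries are illustrative assumptions):

	/* Sketch: dependency table consulted when a feature is cleared
	 * (shape assumed from the comment above; entries illustrative). */
	struct cpuid_dep {
		unsigned int feature;
		unsigned int depends;
	};

	static const struct cpuid_dep cpuid_deps[] = {
		{ X86_FEATURE_AVX2,	X86_FEATURE_AVX   },
		{ X86_FEATURE_XSAVEOPT,	X86_FEATURE_XSAVE },
		{}				/* terminator */
	};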
*/ -/* Intel-defined CPU features, CPUID level 0x00000001 (edx), word 0 */ -#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ -#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ -#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ -#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ -#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ -#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ -#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ -#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ -#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ -#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ -#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ -#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ -#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ -#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ -#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions */ - /* (plus FCMOVcc, FCOMI with FPU) */ -#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ -#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ -#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ -#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ -#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ -#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ -#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ -#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ -#define X86_FEATURE_XMM ( 0*32+25) /* "sse" */ -#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ -#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ -#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ -#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ -#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ -#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ +/* Intel-defined CPU features, CPUID level 0x00000001 (EDX), word 0 */ +#define X86_FEATURE_FPU ( 0*32+ 0) /* Onboard FPU */ +#define X86_FEATURE_VME ( 0*32+ 1) /* Virtual Mode Extensions */ +#define X86_FEATURE_DE ( 0*32+ 2) /* Debugging Extensions */ +#define X86_FEATURE_PSE ( 0*32+ 3) /* Page Size Extensions */ +#define X86_FEATURE_TSC ( 0*32+ 4) /* Time Stamp Counter */ +#define X86_FEATURE_MSR ( 0*32+ 5) /* Model-Specific Registers */ +#define X86_FEATURE_PAE ( 0*32+ 6) /* Physical Address Extensions */ +#define X86_FEATURE_MCE ( 0*32+ 7) /* Machine Check Exception */ +#define X86_FEATURE_CX8 ( 0*32+ 8) /* CMPXCHG8 instruction */ +#define X86_FEATURE_APIC ( 0*32+ 9) /* Onboard APIC */ +#define X86_FEATURE_SEP ( 0*32+11) /* SYSENTER/SYSEXIT */ +#define X86_FEATURE_MTRR ( 0*32+12) /* Memory Type Range Registers */ +#define X86_FEATURE_PGE ( 0*32+13) /* Page Global Enable */ +#define X86_FEATURE_MCA ( 0*32+14) /* Machine Check Architecture */ +#define X86_FEATURE_CMOV ( 0*32+15) /* CMOV instructions (plus FCMOVcc, FCOMI with FPU) */ +#define X86_FEATURE_PAT ( 0*32+16) /* Page Attribute Table */ +#define X86_FEATURE_PSE36 ( 0*32+17) /* 36-bit PSEs */ +#define X86_FEATURE_PN ( 0*32+18) /* Processor serial number */ +#define X86_FEATURE_CLFLUSH ( 0*32+19) /* CLFLUSH instruction */ +#define X86_FEATURE_DS ( 0*32+21) /* "dts" Debug Store */ +#define X86_FEATURE_ACPI ( 0*32+22) /* ACPI via MSR */ +#define X86_FEATURE_MMX ( 0*32+23) /* Multimedia Extensions */ +#define X86_FEATURE_FXSR ( 0*32+24) /* FXSAVE/FXRSTOR, CR4.OSFXSR */ +#define X86_FEATURE_XMM 
( 0*32+25) /* "sse" */ +#define X86_FEATURE_XMM2 ( 0*32+26) /* "sse2" */ +#define X86_FEATURE_SELFSNOOP ( 0*32+27) /* "ss" CPU self snoop */ +#define X86_FEATURE_HT ( 0*32+28) /* Hyper-Threading */ +#define X86_FEATURE_ACC ( 0*32+29) /* "tm" Automatic clock control */ +#define X86_FEATURE_IA64 ( 0*32+30) /* IA-64 processor */ +#define X86_FEATURE_PBE ( 0*32+31) /* Pending Break Enable */ /* AMD-defined CPU features, CPUID level 0x80000001, word 1 */ /* Don't duplicate feature flags which are redundant with Intel! */ -#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ -#define X86_FEATURE_MP ( 1*32+19) /* MP Capable. */ -#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ -#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ -#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ -#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ -#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ -#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64) */ -#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow! extensions */ -#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow! */ +#define X86_FEATURE_SYSCALL ( 1*32+11) /* SYSCALL/SYSRET */ +#define X86_FEATURE_MP ( 1*32+19) /* MP Capable */ +#define X86_FEATURE_NX ( 1*32+20) /* Execute Disable */ +#define X86_FEATURE_MMXEXT ( 1*32+22) /* AMD MMX extensions */ +#define X86_FEATURE_FXSR_OPT ( 1*32+25) /* FXSAVE/FXRSTOR optimizations */ +#define X86_FEATURE_GBPAGES ( 1*32+26) /* "pdpe1gb" GB pages */ +#define X86_FEATURE_RDTSCP ( 1*32+27) /* RDTSCP */ +#define X86_FEATURE_LM ( 1*32+29) /* Long Mode (x86-64, 64-bit support) */ +#define X86_FEATURE_3DNOWEXT ( 1*32+30) /* AMD 3DNow extensions */ +#define X86_FEATURE_3DNOW ( 1*32+31) /* 3DNow */ /* Transmeta-defined CPU features, CPUID level 0x80860001, word 2 */ -#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ -#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ -#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ +#define X86_FEATURE_RECOVERY ( 2*32+ 0) /* CPU in recovery mode */ +#define X86_FEATURE_LONGRUN ( 2*32+ 1) /* Longrun power control */ +#define X86_FEATURE_LRTI ( 2*32+ 3) /* LongRun table interface */ /* Other features, Linux-defined mapping, word 3 */ /* This range is used for feature bits which conflict or are synthesized */ -#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ -#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ -#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ -#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ -/* cpu types for specific tunings: */ -#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ -#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ -#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ -#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ -#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ -#define X86_FEATURE_UP ( 3*32+ 9) /* smp kernel running on up */ -#define X86_FEATURE_ART ( 3*32+10) /* Platform has always running timer (ART) */ -#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ -#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ -#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ -#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in ia32 userspace */ -#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in ia32 userspace */ -#define X86_FEATURE_REP_GOOD ( 3*32+16) /* rep microcode works well */ -#define 
X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" Mfence synchronizes RDTSC */ -#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" Lfence synchronizes RDTSC */ -#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ -#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ -#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ -#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* cpu topology enum extensions */ -#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ -#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ -#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ -#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* has extended APICID (8 bits) */ -#define X86_FEATURE_AMD_DCM ( 3*32+27) /* multi-node processor */ -#define X86_FEATURE_APERFMPERF ( 3*32+28) /* APERFMPERF */ -#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ -#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ - -/* Intel-defined CPU features, CPUID level 0x00000001 (ecx), word 4 */ -#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ -#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ -#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ -#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" Monitor/Mwait support */ -#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL Qual. Debug Store */ -#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ -#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer mode */ -#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ -#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ -#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ -#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ -#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ -#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ -#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B */ -#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ -#define X86_FEATURE_PDCM ( 4*32+15) /* Performance Capabilities */ -#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ -#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ -#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ -#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ -#define X86_FEATURE_X2APIC ( 4*32+21) /* x2APIC */ -#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ -#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ -#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* Tsc deadline timer */ -#define X86_FEATURE_AES ( 4*32+25) /* AES instructions */ -#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV */ -#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE enabled in the OS */ -#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ -#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit fp conversions */ -#define X86_FEATURE_RDRAND ( 4*32+30) /* The RDRAND instruction */ -#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ +#define X86_FEATURE_CXMMX ( 3*32+ 0) /* Cyrix MMX extensions */ +#define X86_FEATURE_K6_MTRR ( 3*32+ 1) /* AMD K6 nonstandard MTRRs */ +#define X86_FEATURE_CYRIX_ARR ( 3*32+ 2) /* Cyrix ARRs (= MTRRs) */ +#define X86_FEATURE_CENTAUR_MCR ( 3*32+ 3) /* Centaur MCRs (= MTRRs) */ + +/* CPU types for specific tunings: */ +#define X86_FEATURE_K8 ( 3*32+ 4) /* "" Opteron, Athlon64 */ +#define X86_FEATURE_K7 ( 3*32+ 5) /* "" Athlon */ 
+#define X86_FEATURE_P3 ( 3*32+ 6) /* "" P3 */ +#define X86_FEATURE_P4 ( 3*32+ 7) /* "" P4 */ +#define X86_FEATURE_CONSTANT_TSC ( 3*32+ 8) /* TSC ticks at a constant rate */ +#define X86_FEATURE_UP ( 3*32+ 9) /* SMP kernel running on UP */ +#define X86_FEATURE_ART ( 3*32+10) /* Always running timer (ART) */ +#define X86_FEATURE_ARCH_PERFMON ( 3*32+11) /* Intel Architectural PerfMon */ +#define X86_FEATURE_PEBS ( 3*32+12) /* Precise-Event Based Sampling */ +#define X86_FEATURE_BTS ( 3*32+13) /* Branch Trace Store */ +#define X86_FEATURE_SYSCALL32 ( 3*32+14) /* "" syscall in IA32 userspace */ +#define X86_FEATURE_SYSENTER32 ( 3*32+15) /* "" sysenter in IA32 userspace */ +#define X86_FEATURE_REP_GOOD ( 3*32+16) /* REP microcode works well */ +#define X86_FEATURE_MFENCE_RDTSC ( 3*32+17) /* "" MFENCE synchronizes RDTSC */ +#define X86_FEATURE_LFENCE_RDTSC ( 3*32+18) /* "" LFENCE synchronizes RDTSC */ +#define X86_FEATURE_ACC_POWER ( 3*32+19) /* AMD Accumulated Power Mechanism */ +#define X86_FEATURE_NOPL ( 3*32+20) /* The NOPL (0F 1F) instructions */ +#define X86_FEATURE_ALWAYS ( 3*32+21) /* "" Always-present feature */ +#define X86_FEATURE_XTOPOLOGY ( 3*32+22) /* CPU topology enum extensions */ +#define X86_FEATURE_TSC_RELIABLE ( 3*32+23) /* TSC is known to be reliable */ +#define X86_FEATURE_NONSTOP_TSC ( 3*32+24) /* TSC does not stop in C states */ +#define X86_FEATURE_CPUID ( 3*32+25) /* CPU has CPUID instruction itself */ +#define X86_FEATURE_EXTD_APICID ( 3*32+26) /* Extended APICID (8 bits) */ +#define X86_FEATURE_AMD_DCM ( 3*32+27) /* AMD multi-node processor */ +#define X86_FEATURE_APERFMPERF ( 3*32+28) /* P-State hardware coordination feedback capability (APERF/MPERF MSRs) */ +#define X86_FEATURE_NONSTOP_TSC_S3 ( 3*32+30) /* TSC doesn't stop in S3 state */ +#define X86_FEATURE_TSC_KNOWN_FREQ ( 3*32+31) /* TSC has known frequency */ + +/* Intel-defined CPU features, CPUID level 0x00000001 (ECX), word 4 */ +#define X86_FEATURE_XMM3 ( 4*32+ 0) /* "pni" SSE-3 */ +#define X86_FEATURE_PCLMULQDQ ( 4*32+ 1) /* PCLMULQDQ instruction */ +#define X86_FEATURE_DTES64 ( 4*32+ 2) /* 64-bit Debug Store */ +#define X86_FEATURE_MWAIT ( 4*32+ 3) /* "monitor" MONITOR/MWAIT support */ +#define X86_FEATURE_DSCPL ( 4*32+ 4) /* "ds_cpl" CPL-qualified (filtered) Debug Store */ +#define X86_FEATURE_VMX ( 4*32+ 5) /* Hardware virtualization */ +#define X86_FEATURE_SMX ( 4*32+ 6) /* Safer Mode eXtensions */ +#define X86_FEATURE_EST ( 4*32+ 7) /* Enhanced SpeedStep */ +#define X86_FEATURE_TM2 ( 4*32+ 8) /* Thermal Monitor 2 */ +#define X86_FEATURE_SSSE3 ( 4*32+ 9) /* Supplemental SSE-3 */ +#define X86_FEATURE_CID ( 4*32+10) /* Context ID */ +#define X86_FEATURE_SDBG ( 4*32+11) /* Silicon Debug */ +#define X86_FEATURE_FMA ( 4*32+12) /* Fused multiply-add */ +#define X86_FEATURE_CX16 ( 4*32+13) /* CMPXCHG16B instruction */ +#define X86_FEATURE_XTPR ( 4*32+14) /* Send Task Priority Messages */ +#define X86_FEATURE_PDCM ( 4*32+15) /* Perf/Debug Capabilities MSR */ +#define X86_FEATURE_PCID ( 4*32+17) /* Process Context Identifiers */ +#define X86_FEATURE_DCA ( 4*32+18) /* Direct Cache Access */ +#define X86_FEATURE_XMM4_1 ( 4*32+19) /* "sse4_1" SSE-4.1 */ +#define X86_FEATURE_XMM4_2 ( 4*32+20) /* "sse4_2" SSE-4.2 */ +#define X86_FEATURE_X2APIC ( 4*32+21) /* X2APIC */ +#define X86_FEATURE_MOVBE ( 4*32+22) /* MOVBE instruction */ +#define X86_FEATURE_POPCNT ( 4*32+23) /* POPCNT instruction */ +#define X86_FEATURE_TSC_DEADLINE_TIMER ( 4*32+24) /* TSC deadline timer */ +#define X86_FEATURE_AES ( 4*32+25) /* AES 
instructions */ +#define X86_FEATURE_XSAVE ( 4*32+26) /* XSAVE/XRSTOR/XSETBV/XGETBV instructions */ +#define X86_FEATURE_OSXSAVE ( 4*32+27) /* "" XSAVE instruction enabled in the OS */ +#define X86_FEATURE_AVX ( 4*32+28) /* Advanced Vector Extensions */ +#define X86_FEATURE_F16C ( 4*32+29) /* 16-bit FP conversions */ +#define X86_FEATURE_RDRAND ( 4*32+30) /* RDRAND instruction */ +#define X86_FEATURE_HYPERVISOR ( 4*32+31) /* Running on a hypervisor */ /* VIA/Cyrix/Centaur-defined CPU features, CPUID level 0xC0000001, word 5 */ -#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ -#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ -#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ -#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ -#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ -#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ -#define X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ -#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ -#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ -#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ - -/* More extended AMD flags: CPUID level 0x80000001, ecx, word 6 */ -#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ -#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ -#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure virtual machine */ -#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ -#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ -#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ -#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ -#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ -#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ -#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ -#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ -#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ -#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ -#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ -#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ -#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ -#define X86_FEATURE_TCE ( 6*32+17) /* translation cache extension */ -#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ -#define X86_FEATURE_TBM ( 6*32+21) /* trailing bit manipulations */ -#define X86_FEATURE_TOPOEXT ( 6*32+22) /* topology extensions CPUID leafs */ -#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* core performance counter extensions */ -#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ -#define X86_FEATURE_BPEXT (6*32+26) /* data breakpoint extension */ -#define X86_FEATURE_PTSC ( 6*32+27) /* performance time-stamp counter */ -#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* L2 performance counter extensions */ -#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX) */ +#define X86_FEATURE_XSTORE ( 5*32+ 2) /* "rng" RNG present (xstore) */ +#define X86_FEATURE_XSTORE_EN ( 5*32+ 3) /* "rng_en" RNG enabled */ +#define X86_FEATURE_XCRYPT ( 5*32+ 6) /* "ace" on-CPU crypto (xcrypt) */ +#define X86_FEATURE_XCRYPT_EN ( 5*32+ 7) /* "ace_en" on-CPU crypto enabled */ +#define X86_FEATURE_ACE2 ( 5*32+ 8) /* Advanced Cryptography Engine v2 */ +#define X86_FEATURE_ACE2_EN ( 5*32+ 9) /* ACE v2 enabled */ +#define 
X86_FEATURE_PHE ( 5*32+10) /* PadLock Hash Engine */ +#define X86_FEATURE_PHE_EN ( 5*32+11) /* PHE enabled */ +#define X86_FEATURE_PMM ( 5*32+12) /* PadLock Montgomery Multiplier */ +#define X86_FEATURE_PMM_EN ( 5*32+13) /* PMM enabled */ + +/* More extended AMD flags: CPUID level 0x80000001, ECX, word 6 */ +#define X86_FEATURE_LAHF_LM ( 6*32+ 0) /* LAHF/SAHF in long mode */ +#define X86_FEATURE_CMP_LEGACY ( 6*32+ 1) /* If yes HyperThreading not valid */ +#define X86_FEATURE_SVM ( 6*32+ 2) /* Secure Virtual Machine */ +#define X86_FEATURE_EXTAPIC ( 6*32+ 3) /* Extended APIC space */ +#define X86_FEATURE_CR8_LEGACY ( 6*32+ 4) /* CR8 in 32-bit mode */ +#define X86_FEATURE_ABM ( 6*32+ 5) /* Advanced bit manipulation */ +#define X86_FEATURE_SSE4A ( 6*32+ 6) /* SSE-4A */ +#define X86_FEATURE_MISALIGNSSE ( 6*32+ 7) /* Misaligned SSE mode */ +#define X86_FEATURE_3DNOWPREFETCH ( 6*32+ 8) /* 3DNow prefetch instructions */ +#define X86_FEATURE_OSVW ( 6*32+ 9) /* OS Visible Workaround */ +#define X86_FEATURE_IBS ( 6*32+10) /* Instruction Based Sampling */ +#define X86_FEATURE_XOP ( 6*32+11) /* extended AVX instructions */ +#define X86_FEATURE_SKINIT ( 6*32+12) /* SKINIT/STGI instructions */ +#define X86_FEATURE_WDT ( 6*32+13) /* Watchdog timer */ +#define X86_FEATURE_LWP ( 6*32+15) /* Light Weight Profiling */ +#define X86_FEATURE_FMA4 ( 6*32+16) /* 4 operands MAC instructions */ +#define X86_FEATURE_TCE ( 6*32+17) /* Translation Cache Extension */ +#define X86_FEATURE_NODEID_MSR ( 6*32+19) /* NodeId MSR */ +#define X86_FEATURE_TBM ( 6*32+21) /* Trailing Bit Manipulations */ +#define X86_FEATURE_TOPOEXT ( 6*32+22) /* Topology extensions CPUID leafs */ +#define X86_FEATURE_PERFCTR_CORE ( 6*32+23) /* Core performance counter extensions */ +#define X86_FEATURE_PERFCTR_NB ( 6*32+24) /* NB performance counter extensions */ +#define X86_FEATURE_BPEXT ( 6*32+26) /* Data breakpoint extension */ +#define X86_FEATURE_PTSC ( 6*32+27) /* Performance time-stamp counter */ +#define X86_FEATURE_PERFCTR_L2 ( 6*32+28) /* Last Level Cache performance counter extensions */ +#define X86_FEATURE_MWAITX ( 6*32+29) /* MWAIT extension (MONITORX/MWAITX instructions) */ /* * Auxiliary flags: Linux defined - For features scattered in various @@ -186,144 +189,164 @@ * * Reuse free bits when adding new feature flags! 
*/ -#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT */ -#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ -#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ -#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ -#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ -#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ -#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ - -#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ -#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ - -#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ -#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ -#define X86_FEATURE_AVX512_4VNNIW (7*32+16) /* AVX-512 Neural Network Instructions */ -#define X86_FEATURE_AVX512_4FMAPS (7*32+17) /* AVX-512 Multiply Accumulation Single precision */ - -#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_RING3MWAIT ( 7*32+ 0) /* Ring 3 MONITOR/MWAIT instructions */ +#define X86_FEATURE_CPUID_FAULT ( 7*32+ 1) /* Intel CPUID faulting */ +#define X86_FEATURE_CPB ( 7*32+ 2) /* AMD Core Performance Boost */ +#define X86_FEATURE_EPB ( 7*32+ 3) /* IA32_ENERGY_PERF_BIAS support */ +#define X86_FEATURE_CAT_L3 ( 7*32+ 4) /* Cache Allocation Technology L3 */ +#define X86_FEATURE_CAT_L2 ( 7*32+ 5) /* Cache Allocation Technology L2 */ +#define X86_FEATURE_CDP_L3 ( 7*32+ 6) /* Code and Data Prioritization L3 */ +#define X86_FEATURE_INVPCID_SINGLE ( 7*32+ 7) /* Effectively INVPCID && CR4.PCIDE=1 */ + +#define X86_FEATURE_HW_PSTATE ( 7*32+ 8) /* AMD HW-PState */ +#define X86_FEATURE_PROC_FEEDBACK ( 7*32+ 9) /* AMD ProcFeedbackInterface */ +#define X86_FEATURE_SME ( 7*32+10) /* AMD Secure Memory Encryption */ +#define X86_FEATURE_PTI ( 7*32+11) /* Kernel Page Table Isolation enabled */ +#define X86_FEATURE_RETPOLINE ( 7*32+12) /* Generic Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_RETPOLINE_AMD ( 7*32+13) /* AMD Retpoline mitigation for Spectre variant 2 */ +#define X86_FEATURE_INTEL_PPIN ( 7*32+14) /* Intel Processor Inventory Number */ +#define X86_FEATURE_INTEL_PT ( 7*32+15) /* Intel Processor Trace */ +#define X86_FEATURE_AVX512_4VNNIW ( 7*32+16) /* AVX-512 Neural Network Instructions */ +#define X86_FEATURE_AVX512_4FMAPS ( 7*32+17) /* AVX-512 Multiply Accumulation Single precision */ + +#define X86_FEATURE_MBA ( 7*32+18) /* Memory Bandwidth Allocation */ +#define X86_FEATURE_RSB_CTXSW ( 7*32+19) /* Fill RSB on context switches */ +#define X86_FEATURE_SPEC_CTRL ( 7*32+20) /* Control Speculation Control */ /* Virtualization flags: Linux defined, word 8 */ -#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */ -#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */ -#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */ -#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */ -#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */ - -#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer vmmcall to vmcall */ -#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */ - - -/* Intel-defined CPU features, CPUID level 0x00000007:0 (ebx), word 9 */ -#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* {RD/WR}{FS/GS}BASE instructions*/ -#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3b */ -#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation 
extensions */ -#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */ -#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */ -#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */ -#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */ -#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB */ -#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */ -#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */ -#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */ -#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */ -#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */ -#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */ -#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad granular) Instructions */ -#define X86_FEATURE_RDSEED ( 9*32+18) /* The RDSEED instruction */ -#define X86_FEATURE_ADX ( 9*32+19) /* The ADCX and ADOX instructions */ -#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ -#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ -#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ -#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ -#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ -#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ -#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ -#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ -#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ -#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ - -/* Extended state features, CPUID level 0x0000000d:1 (eax), word 10 */ -#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT */ -#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC */ -#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 */ -#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (edx), word 11 */ -#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ - -/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (edx), word 12 */ -#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring if 1 */ -#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ -#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ - -/* AMD-defined CPU features, CPUID level 0x80000008 (ebx), word 13 */ -#define X86_FEATURE_CLZERO (13*32+0) /* CLZERO instruction */ -#define X86_FEATURE_IRPERF (13*32+1) /* Instructions Retired Count */ - -/* Thermal and Power Management Leaf, CPUID level 0x00000006 (eax), word 14 */ -#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ -#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ -#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ -#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ -#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ -#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ -#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ -#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ -#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */
-#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
-
-/* AMD SVM Feature Identification, CPUID level 0x8000000a (edx), word 15 */
-#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
-#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
-#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
-#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
-#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
-#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
-#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
-#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
-#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
-#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
-#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
-#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
-
-/* Intel-defined CPU features, CPUID level 0x00000007:0 (ecx), word 16 */
-#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions*/
-#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
-#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
-#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
-#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
-#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
-
-/* AMD-defined CPU features, CPUID level 0x80000007 (ebx), word 17 */
-#define X86_FEATURE_OVERFLOW_RECOV (17*32+0) /* MCA overflow recovery support */
-#define X86_FEATURE_SUCCOR (17*32+1) /* Uncorrectable error containment and recovery */
-#define X86_FEATURE_SMCA (17*32+3) /* Scalable MCA */
+#define X86_FEATURE_TPR_SHADOW ( 8*32+ 0) /* Intel TPR Shadow */
+#define X86_FEATURE_VNMI ( 8*32+ 1) /* Intel Virtual NMI */
+#define X86_FEATURE_FLEXPRIORITY ( 8*32+ 2) /* Intel FlexPriority */
+#define X86_FEATURE_EPT ( 8*32+ 3) /* Intel Extended Page Table */
+#define X86_FEATURE_VPID ( 8*32+ 4) /* Intel Virtual Processor ID */
+
+#define X86_FEATURE_VMMCALL ( 8*32+15) /* Prefer VMMCALL to VMCALL */
+#define X86_FEATURE_XENPV ( 8*32+16) /* "" Xen paravirtual guest */
+
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (EBX), word 9 */
+#define X86_FEATURE_FSGSBASE ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions */
+#define X86_FEATURE_TSC_ADJUST ( 9*32+ 1) /* TSC adjustment MSR 0x3B */
+#define X86_FEATURE_BMI1 ( 9*32+ 3) /* 1st group bit manipulation extensions */
+#define X86_FEATURE_HLE ( 9*32+ 4) /* Hardware Lock Elision */
+#define X86_FEATURE_AVX2 ( 9*32+ 5) /* AVX2 instructions */
+#define X86_FEATURE_SMEP ( 9*32+ 7) /* Supervisor Mode Execution Protection */
+#define X86_FEATURE_BMI2 ( 9*32+ 8) /* 2nd group bit manipulation extensions */
+#define X86_FEATURE_ERMS ( 9*32+ 9) /* Enhanced REP MOVSB/STOSB instructions */
+#define X86_FEATURE_INVPCID ( 9*32+10) /* Invalidate Processor Context ID */
+#define X86_FEATURE_RTM ( 9*32+11) /* Restricted Transactional Memory */
+#define X86_FEATURE_CQM ( 9*32+12) /* Cache QoS Monitoring */
+#define X86_FEATURE_MPX ( 9*32+14) /* Memory Protection Extension */
+#define X86_FEATURE_RDT_A ( 9*32+15) /* Resource Director Technology Allocation */
+#define X86_FEATURE_AVX512F ( 9*32+16) /* AVX-512 Foundation */
+#define X86_FEATURE_AVX512DQ ( 9*32+17) /* AVX-512 DQ (Double/Quad
granular) Instructions */ +#define X86_FEATURE_RDSEED ( 9*32+18) /* RDSEED instruction */ +#define X86_FEATURE_ADX ( 9*32+19) /* ADCX and ADOX instructions */ +#define X86_FEATURE_SMAP ( 9*32+20) /* Supervisor Mode Access Prevention */ +#define X86_FEATURE_AVX512IFMA ( 9*32+21) /* AVX-512 Integer Fused Multiply-Add instructions */ +#define X86_FEATURE_CLFLUSHOPT ( 9*32+23) /* CLFLUSHOPT instruction */ +#define X86_FEATURE_CLWB ( 9*32+24) /* CLWB instruction */ +#define X86_FEATURE_AVX512PF ( 9*32+26) /* AVX-512 Prefetch */ +#define X86_FEATURE_AVX512ER ( 9*32+27) /* AVX-512 Exponential and Reciprocal */ +#define X86_FEATURE_AVX512CD ( 9*32+28) /* AVX-512 Conflict Detection */ +#define X86_FEATURE_SHA_NI ( 9*32+29) /* SHA1/SHA256 Instruction Extensions */ +#define X86_FEATURE_AVX512BW ( 9*32+30) /* AVX-512 BW (Byte/Word granular) Instructions */ +#define X86_FEATURE_AVX512VL ( 9*32+31) /* AVX-512 VL (128/256 Vector Length) Extensions */ + +/* Extended state features, CPUID level 0x0000000d:1 (EAX), word 10 */ +#define X86_FEATURE_XSAVEOPT (10*32+ 0) /* XSAVEOPT instruction */ +#define X86_FEATURE_XSAVEC (10*32+ 1) /* XSAVEC instruction */ +#define X86_FEATURE_XGETBV1 (10*32+ 2) /* XGETBV with ECX = 1 instruction */ +#define X86_FEATURE_XSAVES (10*32+ 3) /* XSAVES/XRSTORS instructions */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:0 (EDX), word 11 */ +#define X86_FEATURE_CQM_LLC (11*32+ 1) /* LLC QoS if 1 */ + +/* Intel-defined CPU QoS Sub-leaf, CPUID level 0x0000000F:1 (EDX), word 12 */ +#define X86_FEATURE_CQM_OCCUP_LLC (12*32+ 0) /* LLC occupancy monitoring */ +#define X86_FEATURE_CQM_MBM_TOTAL (12*32+ 1) /* LLC Total MBM monitoring */ +#define X86_FEATURE_CQM_MBM_LOCAL (12*32+ 2) /* LLC Local MBM monitoring */ + +/* AMD-defined CPU features, CPUID level 0x80000008 (EBX), word 13 */ +#define X86_FEATURE_CLZERO (13*32+ 0) /* CLZERO instruction */ +#define X86_FEATURE_IRPERF (13*32+ 1) /* Instructions Retired Count */ +#define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* Always save/restore FP error pointers */ +#define X86_FEATURE_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ + +/* Thermal and Power Management Leaf, CPUID level 0x00000006 (EAX), word 14 */ +#define X86_FEATURE_DTHERM (14*32+ 0) /* Digital Thermal Sensor */ +#define X86_FEATURE_IDA (14*32+ 1) /* Intel Dynamic Acceleration */ +#define X86_FEATURE_ARAT (14*32+ 2) /* Always Running APIC Timer */ +#define X86_FEATURE_PLN (14*32+ 4) /* Intel Power Limit Notification */ +#define X86_FEATURE_PTS (14*32+ 6) /* Intel Package Thermal Status */ +#define X86_FEATURE_HWP (14*32+ 7) /* Intel Hardware P-states */ +#define X86_FEATURE_HWP_NOTIFY (14*32+ 8) /* HWP Notification */ +#define X86_FEATURE_HWP_ACT_WINDOW (14*32+ 9) /* HWP Activity Window */ +#define X86_FEATURE_HWP_EPP (14*32+10) /* HWP Energy Perf. 
Preference */
+#define X86_FEATURE_HWP_PKG_REQ (14*32+11) /* HWP Package Level Request */
+
+/* AMD SVM Feature Identification, CPUID level 0x8000000a (EDX), word 15 */
+#define X86_FEATURE_NPT (15*32+ 0) /* Nested Page Table support */
+#define X86_FEATURE_LBRV (15*32+ 1) /* LBR Virtualization support */
+#define X86_FEATURE_SVML (15*32+ 2) /* "svm_lock" SVM locking MSR */
+#define X86_FEATURE_NRIPS (15*32+ 3) /* "nrip_save" SVM next_rip save */
+#define X86_FEATURE_TSCRATEMSR (15*32+ 4) /* "tsc_scale" TSC scaling support */
+#define X86_FEATURE_VMCBCLEAN (15*32+ 5) /* "vmcb_clean" VMCB clean bits support */
+#define X86_FEATURE_FLUSHBYASID (15*32+ 6) /* flush-by-ASID support */
+#define X86_FEATURE_DECODEASSISTS (15*32+ 7) /* Decode Assists support */
+#define X86_FEATURE_PAUSEFILTER (15*32+10) /* filtered pause intercept */
+#define X86_FEATURE_PFTHRESHOLD (15*32+12) /* pause filter threshold */
+#define X86_FEATURE_AVIC (15*32+13) /* Virtual Interrupt Controller */
+#define X86_FEATURE_V_VMSAVE_VMLOAD (15*32+15) /* Virtual VMSAVE VMLOAD */
+#define X86_FEATURE_VGIF (15*32+16) /* Virtual GIF */
+
+/* Intel-defined CPU features, CPUID level 0x00000007:0 (ECX), word 16 */
+#define X86_FEATURE_AVX512VBMI (16*32+ 1) /* AVX512 Vector Bit Manipulation instructions */
+#define X86_FEATURE_UMIP (16*32+ 2) /* User Mode Instruction Protection */
+#define X86_FEATURE_PKU (16*32+ 3) /* Protection Keys for Userspace */
+#define X86_FEATURE_OSPKE (16*32+ 4) /* OS Protection Keys Enable */
+#define X86_FEATURE_AVX512_VBMI2 (16*32+ 6) /* Additional AVX512 Vector Bit Manipulation Instructions */
+#define X86_FEATURE_GFNI (16*32+ 8) /* Galois Field New Instructions */
+#define X86_FEATURE_VAES (16*32+ 9) /* Vector AES */
+#define X86_FEATURE_VPCLMULQDQ (16*32+10) /* Carry-Less Multiplication Double Quadword */
+#define X86_FEATURE_AVX512_VNNI (16*32+11) /* Vector Neural Network Instructions */
+#define X86_FEATURE_AVX512_BITALG (16*32+12) /* Support for VPOPCNT[B,W] and VPSHUFBITQMB instructions */
+#define X86_FEATURE_AVX512_VPOPCNTDQ (16*32+14) /* POPCNT for vectors of DW/QW */
+#define X86_FEATURE_LA57 (16*32+16) /* 5-level page tables */
+#define X86_FEATURE_RDPID (16*32+22) /* RDPID instruction */
+
+/* AMD-defined CPU features, CPUID level 0x80000007 (EBX), word 17 */
+#define X86_FEATURE_OVERFLOW_RECOV (17*32+ 0) /* MCA overflow recovery support */
+#define X86_FEATURE_SUCCOR (17*32+ 1) /* Uncorrectable error containment and recovery */
+#define X86_FEATURE_SMCA (17*32+ 3) /* Scalable MCA */
 /*
  * BUG word(s)
  */
-#define X86_BUG(x) (NCAPINTS*32 + (x))
+#define X86_BUG(x) (NCAPINTS*32 + (x))
-#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
-#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
-#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
-#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
-#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
-#define X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
-#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FOP */
-#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
-#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
+#define X86_BUG_F00F X86_BUG(0) /* Intel F00F */
+#define X86_BUG_FDIV X86_BUG(1) /* FPU FDIV */
+#define X86_BUG_COMA X86_BUG(2) /* Cyrix 6x86 coma */
+#define X86_BUG_AMD_TLB_MMATCH X86_BUG(3) /* "tlb_mmatch" AMD Erratum 383 */
+#define X86_BUG_AMD_APIC_C1E X86_BUG(4) /* "apic_c1e" AMD Erratum 400 */
+#define
X86_BUG_11AP X86_BUG(5) /* Bad local APIC aka 11AP */
+#define X86_BUG_FXSAVE_LEAK X86_BUG(6) /* FXSAVE leaks FOP/FIP/FDP */
+#define X86_BUG_CLFLUSH_MONITOR X86_BUG(7) /* AAI65, CLFLUSH required before MONITOR */
+#define X86_BUG_SYSRET_SS_ATTRS X86_BUG(8) /* SYSRET doesn't fix up SS attrs */
 #ifdef CONFIG_X86_32
 /*
  * 64-bit kernels don't use X86_BUG_ESPFIX. Make the define conditional
  * to avoid confusion.
  */
-#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
+#define X86_BUG_ESPFIX X86_BUG(9) /* "" IRET to 16-bit SS corrupts ESP/RSP high bits */
 #endif
-#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
-#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
-#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
-#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among the affected by Erratum 400 */
+#define X86_BUG_NULL_SEG X86_BUG(10) /* Nulling a selector preserves the base */
+#define X86_BUG_SWAPGS_FENCE X86_BUG(11) /* SWAPGS without input dep on GS */
+#define X86_BUG_MONITOR X86_BUG(12) /* IPI required to wake up remote CPU */
+#define X86_BUG_AMD_E400 X86_BUG(13) /* CPU is among those affected by Erratum 400 */
+#define X86_BUG_CPU_MELTDOWN X86_BUG(14) /* CPU is affected by the Meltdown attack and needs kernel page table isolation */
+#define X86_BUG_SPECTRE_V1 X86_BUG(15) /* CPU is affected by Spectre variant 1 attack with conditional branches */
+#define X86_BUG_SPECTRE_V2 X86_BUG(16) /* CPU is affected by Spectre variant 2 attack with indirect branches */
+
 #endif /* _ASM_X86_CPUFEATURES_H */
--- linux-azure-4.13.0.orig/arch/x86/include/asm/desc.h
+++ linux-azure-4.13.0/arch/x86/include/asm/desc.h
@@ -5,6 +5,8 @@
 #include
 #include
 #include
+#include
+#include
 #include
 #include
@@ -18,6 +20,8 @@
 	desc->type = (info->read_exec_only ^ 1) << 1;
 	desc->type |= info->contents << 2;
+	/* Set the ACCESSED bit so it can be mapped RO */
+	desc->type |= 1;
 	desc->s = 1;
 	desc->dpl = 0x3;
@@ -58,17 +62,10 @@
 	return this_cpu_ptr(&gdt_page)->gdt;
 }
-/* Get the fixmap index for a specific processor */
-static inline unsigned int get_cpu_gdt_ro_index(int cpu)
-{
-	return FIX_GDT_REMAP_BEGIN + cpu;
-}
-
 /* Provide the fixmap address of the remapped GDT */
 static inline struct desc_struct *get_cpu_gdt_ro(int cpu)
 {
-	unsigned int idx = get_cpu_gdt_ro_index(cpu);
-	return (struct desc_struct *)__fix_to_virt(idx);
+	return (struct desc_struct *)&get_cpu_entry_area(cpu)->gdt;
 }
 /* Provide the current read-only GDT */
@@ -83,33 +80,25 @@
 	return per_cpu_ptr_to_phys(get_cpu_gdt_rw(cpu));
 }
-#ifdef CONFIG_X86_64
-
 static inline void pack_gate(gate_desc *gate, unsigned type,
			     unsigned long func, unsigned dpl,
			     unsigned ist, unsigned seg)
 {
-	gate->offset_low = PTR_LOW(func);
+	gate->offset_low = (u16) func;
+	gate->bits.p = 1;
+	gate->bits.dpl = dpl;
+	gate->bits.zero = 0;
+	gate->bits.type = type;
+	gate->offset_middle = (u16) (func >> 16);
+#ifdef CONFIG_X86_64
 	gate->segment = __KERNEL_CS;
-	gate->ist = ist;
-	gate->p = 1;
-	gate->dpl = dpl;
-	gate->zero0 = 0;
-	gate->zero1 = 0;
-	gate->type = type;
-	gate->offset_middle = PTR_MIDDLE(func);
-	gate->offset_high = PTR_HIGH(func);
-}
-
+	gate->bits.ist = ist;
+	gate->reserved = 0;
+	gate->offset_high = (u32) (func >> 32);
 #else
-static inline void pack_gate(gate_desc *gate, unsigned char type,
-			     unsigned long base, unsigned dpl, unsigned flags,
-			     unsigned short seg)
-{
-	gate->a = (seg << 16) | (base & 0xffff);
-	gate->b = (base & 0xffff0000) | (((0x80 |
type | (dpl << 5)) & 0xff) << 8); -} - + gate->segment = seg; + gate->bits.ist = 0; #endif +} static inline int desc_empty(const void *ptr) { @@ -128,7 +117,6 @@ #define load_ldt(ldt) asm volatile("lldt %0"::"m" (ldt)) #define store_gdt(dtr) native_store_gdt(dtr) -#define store_idt(dtr) native_store_idt(dtr) #define store_tr(tr) (tr = native_store_tr()) #define load_TLS(t, cpu) native_load_tls(t, cpu) @@ -185,7 +173,8 @@ } -static inline void set_tssldt_descriptor(void *d, unsigned long addr, unsigned type, unsigned size) +static inline void set_tssldt_descriptor(void *d, unsigned long addr, + unsigned type, unsigned size) { #ifdef CONFIG_X86_64 struct ldttss_desc64 *desc = d; @@ -193,19 +182,19 @@ memset(desc, 0, sizeof(*desc)); desc->limit0 = size & 0xFFFF; - desc->base0 = PTR_LOW(addr); - desc->base1 = PTR_MIDDLE(addr) & 0xFF; + desc->base0 = (u16) addr; + desc->base1 = (addr >> 16) & 0xFF; desc->type = type; desc->p = 1; desc->limit1 = (size >> 16) & 0xF; - desc->base2 = (PTR_MIDDLE(addr) >> 8) & 0xFF; - desc->base3 = PTR_HIGH(addr); + desc->base2 = (addr >> 24) & 0xFF; + desc->base3 = (u32) (addr >> 32); #else pack_descriptor((struct desc_struct *)d, addr, size, 0x80 | type, 0); #endif } -static inline void __set_tss_desc(unsigned cpu, unsigned int entry, void *addr) +static inline void __set_tss_desc(unsigned cpu, unsigned int entry, struct x86_hw_tss *addr) { struct desc_struct *d = get_cpu_gdt_rw(cpu); tss_desc tss; @@ -248,7 +237,7 @@ asm volatile("sgdt %0":"=m" (*dtr)); } -static inline void native_store_idt(struct desc_ptr *dtr) +static inline void store_idt(struct desc_ptr *dtr) { asm volatile("sidt %0":"=m" (*dtr)); } --- linux-azure-4.13.0.orig/arch/x86/include/asm/desc_defs.h +++ linux-azure-4.13.0/arch/x86/include/asm/desc_defs.h @@ -47,20 +47,6 @@ GATE_TASK = 0x5, }; -/* 16byte gate */ -struct gate_struct64 { - u16 offset_low; - u16 segment; - unsigned ist : 3, zero0 : 5, type : 5, dpl : 2, p : 1; - u16 offset_middle; - u32 offset_high; - u32 zero1; -} __attribute__((packed)); - -#define PTR_LOW(x) ((unsigned long long)(x) & 0xFFFF) -#define PTR_MIDDLE(x) (((unsigned long long)(x) >> 16) & 0xFFFF) -#define PTR_HIGH(x) ((unsigned long long)(x) >> 32) - enum { DESC_TSS = 0x9, DESC_LDT = 0x2, @@ -77,20 +63,51 @@ u32 zero1; } __attribute__((packed)); + #ifdef CONFIG_X86_64 -typedef struct gate_struct64 gate_desc; typedef struct ldttss_desc64 ldt_desc; typedef struct ldttss_desc64 tss_desc; -#define gate_offset(g) ((g).offset_low | ((unsigned long)(g).offset_middle << 16) | ((unsigned long)(g).offset_high << 32)) -#define gate_segment(g) ((g).segment) #else -typedef struct desc_struct gate_desc; typedef struct desc_struct ldt_desc; typedef struct desc_struct tss_desc; -#define gate_offset(g) (((g).b & 0xffff0000) | ((g).a & 0x0000ffff)) -#define gate_segment(g) ((g).a >> 16) #endif +struct idt_bits { + u16 ist : 3, + zero : 5, + type : 5, + dpl : 2, + p : 1; +} __attribute__((packed)); + +struct gate_struct { + u16 offset_low; + u16 segment; + struct idt_bits bits; + u16 offset_middle; +#ifdef CONFIG_X86_64 + u32 offset_high; + u32 reserved; +#endif +} __attribute__((packed)); + +typedef struct gate_struct gate_desc; + +static inline unsigned long gate_offset(const gate_desc *g) +{ +#ifdef CONFIG_X86_64 + return g->offset_low | ((unsigned long)g->offset_middle << 16) | + ((unsigned long) g->offset_high << 32); +#else + return g->offset_low | ((unsigned long)g->offset_middle << 16); +#endif +} + +static inline unsigned long gate_segment(const gate_desc *g) +{ + return 
g->segment; +} + struct desc_ptr { unsigned short size; unsigned long address; --- linux-azure-4.13.0.orig/arch/x86/include/asm/disabled-features.h +++ linux-azure-4.13.0/arch/x86/include/asm/disabled-features.h @@ -42,6 +42,12 @@ # define DISABLE_LA57 (1<<(X86_FEATURE_LA57 & 31)) #endif +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define DISABLE_PTI 0 +#else +# define DISABLE_PTI (1 << (X86_FEATURE_PTI & 31)) +#endif + /* * Make sure to add features to the correct mask */ @@ -52,7 +58,7 @@ #define DISABLED_MASK4 0 #define DISABLED_MASK5 0 #define DISABLED_MASK6 0 -#define DISABLED_MASK7 0 +#define DISABLED_MASK7 (DISABLE_PTI) #define DISABLED_MASK8 0 #define DISABLED_MASK9 (DISABLE_MPX) #define DISABLED_MASK10 0 --- linux-azure-4.13.0.orig/arch/x86/include/asm/elf.h +++ linux-azure-4.13.0/arch/x86/include/asm/elf.h @@ -204,6 +204,7 @@ #define ELF_CORE_COPY_REGS(pr_reg, regs) \ do { \ + unsigned long base; \ unsigned v; \ (pr_reg)[0] = (regs)->r15; \ (pr_reg)[1] = (regs)->r14; \ @@ -226,8 +227,8 @@ (pr_reg)[18] = (regs)->flags; \ (pr_reg)[19] = (regs)->sp; \ (pr_reg)[20] = (regs)->ss; \ - (pr_reg)[21] = current->thread.fsbase; \ - (pr_reg)[22] = current->thread.gsbase; \ + rdmsrl(MSR_FS_BASE, base); (pr_reg)[21] = base; \ + rdmsrl(MSR_KERNEL_GS_BASE, base); (pr_reg)[22] = base; \ asm("movl %%ds,%0" : "=r" (v)); (pr_reg)[23] = v; \ asm("movl %%es,%0" : "=r" (v)); (pr_reg)[24] = v; \ asm("movl %%fs,%0" : "=r" (v)); (pr_reg)[25] = v; \ --- linux-azure-4.13.0.orig/arch/x86/include/asm/espfix.h +++ linux-azure-4.13.0/arch/x86/include/asm/espfix.h @@ -1,7 +1,7 @@ #ifndef _ASM_X86_ESPFIX_H #define _ASM_X86_ESPFIX_H -#ifdef CONFIG_X86_64 +#ifdef CONFIG_X86_ESPFIX64 #include @@ -10,7 +10,8 @@ extern void init_espfix_bsp(void); extern void init_espfix_ap(int cpu); - -#endif /* CONFIG_X86_64 */ +#else +static inline void init_espfix_ap(int cpu) { } +#endif #endif /* _ASM_X86_ESPFIX_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/fixmap.h +++ linux-azure-4.13.0/arch/x86/include/asm/fixmap.h @@ -44,7 +44,6 @@ PAGE_SIZE) #endif - /* * Here we define all the compile-time 'special' virtual * addresses. The point is to have a constant address at @@ -84,7 +83,6 @@ FIX_IO_APIC_BASE_0, FIX_IO_APIC_BASE_END = FIX_IO_APIC_BASE_0 + MAX_IO_APICS - 1, #endif - FIX_RO_IDT, /* Virtual mapping for read-only IDT */ #ifdef CONFIG_X86_32 FIX_KMAP_BEGIN, /* reserved pte's for temporary kernel mappings */ FIX_KMAP_END = FIX_KMAP_BEGIN+(KM_TYPE_NR*NR_CPUS)-1, @@ -100,9 +98,12 @@ #ifdef CONFIG_X86_INTEL_MID FIX_LNW_VRTC, #endif - /* Fixmap entries to remap the GDTs, one per processor. 
*/ - FIX_GDT_REMAP_BEGIN, - FIX_GDT_REMAP_END = FIX_GDT_REMAP_BEGIN + NR_CPUS - 1, + +#ifdef CONFIG_ACPI_APEI_GHES + /* Used for GHES mapping from assorted contexts */ + FIX_APEI_GHES_IRQ, + FIX_APEI_GHES_NMI, +#endif __end_of_permanent_fixed_addresses, @@ -136,8 +137,10 @@ extern void reserve_top_address(unsigned long reserve); -#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_SIZE (__end_of_permanent_fixed_addresses << PAGE_SHIFT) #define FIXADDR_START (FIXADDR_TOP - FIXADDR_SIZE) +#define FIXADDR_TOT_SIZE (__end_of_fixed_addresses << PAGE_SHIFT) +#define FIXADDR_TOT_START (FIXADDR_TOP - FIXADDR_TOT_SIZE) extern int fixmaps_set; --- linux-azure-4.13.0.orig/arch/x86/include/asm/futex.h +++ linux-azure-4.13.0/arch/x86/include/asm/futex.h @@ -41,20 +41,11 @@ "+m" (*uaddr), "=&r" (tem) \ : "r" (oparg), "i" (-EFAULT), "1" (0)) -static inline int futex_atomic_op_inuser(int encoded_op, u32 __user *uaddr) +static inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) { - int op = (encoded_op >> 28) & 7; - int cmp = (encoded_op >> 24) & 15; - int oparg = (encoded_op << 8) >> 20; - int cmparg = (encoded_op << 20) >> 20; int oldval = 0, ret, tem; - if (encoded_op & (FUTEX_OP_OPARG_SHIFT << 28)) - oparg = 1 << oparg; - - if (!access_ok(VERIFY_WRITE, uaddr, sizeof(u32))) - return -EFAULT; - pagefault_disable(); switch (op) { @@ -80,30 +71,9 @@ pagefault_enable(); - if (!ret) { - switch (cmp) { - case FUTEX_OP_CMP_EQ: - ret = (oldval == cmparg); - break; - case FUTEX_OP_CMP_NE: - ret = (oldval != cmparg); - break; - case FUTEX_OP_CMP_LT: - ret = (oldval < cmparg); - break; - case FUTEX_OP_CMP_GE: - ret = (oldval >= cmparg); - break; - case FUTEX_OP_CMP_LE: - ret = (oldval <= cmparg); - break; - case FUTEX_OP_CMP_GT: - ret = (oldval > cmparg); - break; - default: - ret = -ENOSYS; - } - } + if (!ret) + *oval = oldval; + return ret; } --- linux-azure-4.13.0.orig/arch/x86/include/asm/hypervisor.h +++ linux-azure-4.13.0/arch/x86/include/asm/hypervisor.h @@ -20,14 +20,22 @@ #ifndef _ASM_X86_HYPERVISOR_H #define _ASM_X86_HYPERVISOR_H +/* x86 hypervisor types */ +enum x86_hypervisor_type { + X86_HYPER_NATIVE = 0, + X86_HYPER_VMWARE, + X86_HYPER_MS_HYPERV, + X86_HYPER_XEN_PV, + X86_HYPER_XEN_HVM, + X86_HYPER_KVM, +}; + #ifdef CONFIG_HYPERVISOR_GUEST #include +#include #include -/* - * x86 hypervisor information - */ struct hypervisor_x86 { /* Hypervisor name */ const char *name; @@ -35,40 +43,27 @@ /* Detection routine */ uint32_t (*detect)(void); - /* Platform setup (run once per boot) */ - void (*init_platform)(void); + /* Hypervisor type */ + enum x86_hypervisor_type type; - /* X2APIC detection (run once per boot) */ - bool (*x2apic_available)(void); + /* init time callbacks */ + struct x86_hyper_init init; - /* pin current vcpu to specified physical cpu (run rarely) */ - void (*pin_vcpu)(int); - - /* called during init_mem_mapping() to setup early mappings. 
*/ - void (*init_mem_mapping)(void); + /* runtime callbacks */ + struct x86_hyper_runtime runtime; }; -extern const struct hypervisor_x86 *x86_hyper; - -/* Recognized hypervisors */ -extern const struct hypervisor_x86 x86_hyper_vmware; -extern const struct hypervisor_x86 x86_hyper_ms_hyperv; -extern const struct hypervisor_x86 x86_hyper_xen_pv; -extern const struct hypervisor_x86 x86_hyper_xen_hvm; -extern const struct hypervisor_x86 x86_hyper_kvm; - +extern enum x86_hypervisor_type x86_hyper_type; extern void init_hypervisor_platform(void); -extern bool hypervisor_x2apic_available(void); -extern void hypervisor_pin_vcpu(int cpu); - -static inline void hypervisor_init_mem_mapping(void) +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) { - if (x86_hyper && x86_hyper->init_mem_mapping) - x86_hyper->init_mem_mapping(); + return x86_hyper_type == type; } #else static inline void init_hypervisor_platform(void) { } -static inline bool hypervisor_x2apic_available(void) { return false; } -static inline void hypervisor_init_mem_mapping(void) { } +static inline bool hypervisor_is_type(enum x86_hypervisor_type type) +{ + return type == X86_HYPER_NATIVE; +} #endif /* CONFIG_HYPERVISOR_GUEST */ #endif /* _ASM_X86_HYPERVISOR_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/inat.h +++ linux-azure-4.13.0/arch/x86/include/asm/inat.h @@ -97,6 +97,16 @@ #define INAT_MAKE_GROUP(grp) ((grp << INAT_GRP_OFFS) | INAT_MODRM) #define INAT_MAKE_IMM(imm) (imm << INAT_IMM_OFFS) +/* Identifiers for segment registers */ +#define INAT_SEG_REG_IGNORE 0 +#define INAT_SEG_REG_DEFAULT 1 +#define INAT_SEG_REG_CS 2 +#define INAT_SEG_REG_SS 3 +#define INAT_SEG_REG_DS 4 +#define INAT_SEG_REG_ES 5 +#define INAT_SEG_REG_FS 6 +#define INAT_SEG_REG_GS 7 + /* Attribute search APIs */ extern insn_attr_t inat_get_opcode_attribute(insn_byte_t opcode); extern int inat_get_last_prefix_id(insn_byte_t last_pfx); --- linux-azure-4.13.0.orig/arch/x86/include/asm/intel_ds.h +++ linux-azure-4.13.0/arch/x86/include/asm/intel_ds.h @@ -0,0 +1,36 @@ +#ifndef _ASM_INTEL_DS_H +#define _ASM_INTEL_DS_H + +#include + +#define BTS_BUFFER_SIZE (PAGE_SIZE << 4) +#define PEBS_BUFFER_SIZE (PAGE_SIZE << 4) + +/* The maximal number of PEBS events: */ +#define MAX_PEBS_EVENTS 8 + +/* + * A debug store configuration. + * + * We only support architectures that use 64bit fields. + */ +struct debug_store { + u64 bts_buffer_base; + u64 bts_index; + u64 bts_absolute_maximum; + u64 bts_interrupt_threshold; + u64 pebs_buffer_base; + u64 pebs_index; + u64 pebs_absolute_maximum; + u64 pebs_interrupt_threshold; + u64 pebs_event_reset[MAX_PEBS_EVENTS]; +} __aligned(PAGE_SIZE); + +DECLARE_PER_CPU_PAGE_ALIGNED(struct debug_store, cpu_debug_store); + +struct debug_store_buffers { + char bts_buffer[BTS_BUFFER_SIZE]; + char pebs_buffer[PEBS_BUFFER_SIZE]; +}; + +#endif --- linux-azure-4.13.0.orig/arch/x86/include/asm/intel_rdt_sched.h +++ linux-azure-4.13.0/arch/x86/include/asm/intel_rdt_sched.h @@ -0,0 +1,92 @@ +#ifndef _ASM_X86_INTEL_RDT_SCHED_H +#define _ASM_X86_INTEL_RDT_SCHED_H + +#ifdef CONFIG_INTEL_RDT + +#include +#include + +#define IA32_PQR_ASSOC 0x0c8f + +/** + * struct intel_pqr_state - State cache for the PQR MSR + * @cur_rmid: The cached Resource Monitoring ID + * @cur_closid: The cached Class Of Service ID + * @default_rmid: The user assigned Resource Monitoring ID + * @default_closid: The user assigned cached Class Of Service ID + * + * The upper 32 bits of IA32_PQR_ASSOC contain closid and the + * lower 10 bits rmid. 
The update to IA32_PQR_ASSOC always + * contains both parts, so we need to cache them. This also + * stores the user configured per cpu CLOSID and RMID. + * + * The cache also helps to avoid pointless updates if the value does + * not change. + */ +struct intel_pqr_state { + u32 cur_rmid; + u32 cur_closid; + u32 default_rmid; + u32 default_closid; +}; + +DECLARE_PER_CPU(struct intel_pqr_state, pqr_state); + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key); + +/* + * __intel_rdt_sched_in() - Writes the task's CLOSid/RMID to IA32_PQR_MSR + * + * Following considerations are made so that this has minimal impact + * on scheduler hot path: + * - This will stay as no-op unless we are running on an Intel SKU + * which supports resource control or monitoring and we enable by + * mounting the resctrl file system. + * - Caches the per cpu CLOSid/RMID values and does the MSR write only + * when a task with a different CLOSid/RMID is scheduled in. + * - We allocate RMIDs/CLOSids globally in order to keep this as + * simple as possible. + * Must be called with preemption disabled. + */ +static void __intel_rdt_sched_in(void) +{ + struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); + u32 closid = state->default_closid; + u32 rmid = state->default_rmid; + + /* + * If this task has a closid/rmid assigned, use it. + * Else use the closid/rmid assigned to this cpu. + */ + if (static_branch_likely(&rdt_alloc_enable_key)) { + if (current->closid) + closid = current->closid; + } + + if (static_branch_likely(&rdt_mon_enable_key)) { + if (current->rmid) + rmid = current->rmid; + } + + if (closid != state->cur_closid || rmid != state->cur_rmid) { + state->cur_closid = closid; + state->cur_rmid = rmid; + wrmsr(IA32_PQR_ASSOC, rmid, closid); + } +} + +static inline void intel_rdt_sched_in(void) +{ + if (static_branch_likely(&rdt_enable_key)) + __intel_rdt_sched_in(); +} + +#else + +static inline void intel_rdt_sched_in(void) {} + +#endif /* CONFIG_INTEL_RDT */ + +#endif /* _ASM_X86_INTEL_RDT_SCHED_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/invpcid.h +++ linux-azure-4.13.0/arch/x86/include/asm/invpcid.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_INVPCID +#define _ASM_X86_INVPCID + +static inline void __invpcid(unsigned long pcid, unsigned long addr, + unsigned long type) +{ + struct { u64 d[2]; } desc = { { pcid, addr } }; + + /* + * The memory clobber is because the whole point is to invalidate + * stale TLB entries and, especially if we're flushing global + * mappings, we don't want the compiler to reorder any subsequent + * memory accesses before the TLB flush. + * + * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and + * invpcid (%rcx), %rax in long mode. + */ + asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" + : : "m" (desc), "a" (type), "c" (&desc) : "memory"); +} + +#define INVPCID_TYPE_INDIV_ADDR 0 +#define INVPCID_TYPE_SINGLE_CTXT 1 +#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 +#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + +/* Flush all mappings for a given pcid and addr, not including globals. */ +static inline void invpcid_flush_one(unsigned long pcid, + unsigned long addr) +{ + __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); +} + +/* Flush all mappings for a given PCID, not including globals. 
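+ *
+ * (Illustrative sketch, not part of the original patch: a caller
+ * retiring a whole ASID would pair a feature check with the helper
+ * defined just below,
+ *
+ *	if (static_cpu_has(X86_FEATURE_INVPCID))
+ *		invpcid_flush_single_context(pcid);
+ *
+ * while a single-address shootdown maps to invpcid_flush_one() above.)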
*/ +static inline void invpcid_flush_single_context(unsigned long pcid) +{ + __invpcid(pcid, 0, INVPCID_TYPE_SINGLE_CTXT); +} + +/* Flush all mappings, including globals, for all PCIDs. */ +static inline void invpcid_flush_all(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_INCL_GLOBAL); +} + +/* Flush all mappings for all PCIDs except globals. */ +static inline void invpcid_flush_all_nonglobals(void) +{ + __invpcid(0, 0, INVPCID_TYPE_ALL_NON_GLOBAL); +} + +#endif /* _ASM_X86_INVPCID */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/irqflags.h +++ linux-azure-4.13.0/arch/x86/include/asm/irqflags.h @@ -141,6 +141,9 @@ swapgs; \ sysretl +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(x) pushfq; popq %rax +#endif #else #define INTERRUPT_RETURN iret #define ENABLE_INTERRUPTS_SYSEXIT sti; sysexit --- linux-azure-4.13.0.orig/arch/x86/include/asm/kdebug.h +++ linux-azure-4.13.0/arch/x86/include/asm/kdebug.h @@ -25,6 +25,7 @@ extern int __must_check __die(const char *, struct pt_regs *, long); extern void show_stack_regs(struct pt_regs *regs); extern void __show_regs(struct pt_regs *regs, int all); +extern void show_iret_regs(struct pt_regs *regs); extern unsigned long oops_begin(void); extern void oops_end(unsigned long, struct pt_regs *, int signr); --- linux-azure-4.13.0.orig/arch/x86/include/asm/kvm_host.h +++ linux-azure-4.13.0/arch/x86/include/asm/kvm_host.h @@ -628,6 +628,8 @@ u64 mcg_ext_ctl; u64 *mce_banks; + u64 spec_ctrl; + /* Cache MMIO info */ u64 mmio_gva; unsigned access; @@ -1437,4 +1439,7 @@ #endif } +void kvm_arch_mmu_notifier_invalidate_range(struct kvm *kvm, + unsigned long start, unsigned long end); + #endif /* _ASM_X86_KVM_HOST_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/kvm_para.h +++ linux-azure-4.13.0/arch/x86/include/asm/kvm_para.h @@ -88,7 +88,7 @@ bool kvm_para_available(void); unsigned int kvm_arch_para_features(void); void __init kvm_guest_init(void); -void kvm_async_pf_task_wait(u32 token); +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel); void kvm_async_pf_task_wake(u32 token); u32 kvm_read_and_reset_pf_reason(void); extern void kvm_disable_steal_time(void); @@ -103,7 +103,7 @@ #else /* CONFIG_KVM_GUEST */ #define kvm_guest_init() do {} while (0) -#define kvm_async_pf_task_wait(T) do {} while(0) +#define kvm_async_pf_task_wait(T, I) do {} while(0) #define kvm_async_pf_task_wake(T) do {} while(0) static inline bool kvm_para_available(void) --- linux-azure-4.13.0.orig/arch/x86/include/asm/mmu.h +++ linux-azure-4.13.0/arch/x86/include/asm/mmu.h @@ -2,15 +2,33 @@ #define _ASM_X86_MMU_H #include +#include #include +#include /* - * The x86 doesn't have a mmu context, but - * we put the segment information here. + * x86 has arch-specific MMU state beyond what lives in mm_struct. */ typedef struct { + /* + * ctx_id uniquely identifies this mm_struct. A ctx_id will never + * be reused, and zero is not a valid ctx_id. + */ + u64 ctx_id; + + /* + * Any code that needs to do any sort of TLB flushing for this + * mm will first make its changes to the page tables, then + * increment tlb_gen, then flush. This lets the low-level + * flushing code keep track of what needs flushing. + * + * This is not used on Xen PV. 
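+	 *
+	 * (Illustrative sketch, not part of the original patch: a flusher
+	 * conceptually does
+	 *
+	 *	... change the page tables ...
+	 *	gen = atomic64_inc_return(&mm->context.tlb_gen);
+	 *	... flush every CPU still below gen ...
+	 *
+	 * so a CPU whose recorded generation already matches gen can skip
+	 * the redundant flush.)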
+	 */
+	atomic64_t tlb_gen;
+
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
-	struct ldt_struct *ldt;
+	struct rw_semaphore ldt_usr_sem;
+	struct ldt_struct *ldt;
 #endif
 #ifdef CONFIG_X86_64
@@ -37,6 +55,11 @@
 #endif
 } mm_context_t;
+#define INIT_MM_CONTEXT(mm)	\
+	.context = {		\
+		.ctx_id = 1,	\
+	}
+
 void leave_mm(int cpu);
 #endif /* _ASM_X86_MMU_H */
--- linux-azure-4.13.0.orig/arch/x86/include/asm/mmu_context.h
+++ linux-azure-4.13.0/arch/x86/include/asm/mmu_context.h
@@ -12,6 +12,9 @@
 #include
 #include
 #include
+
+extern atomic64_t last_mm_ctx_id;
+
 #ifndef CONFIG_PARAVIRT
 static inline void paravirt_activate_mm(struct mm_struct *prev,
					struct mm_struct *next)
@@ -46,22 +49,53 @@
	 * call gates. On native, we could merge the ldt_struct and LDT
	 * allocations, but it's not worth trying to optimize.
	 */
-	struct desc_struct *entries;
-	unsigned int nr_entries;
+	struct desc_struct *entries;
+	unsigned int nr_entries;
+
+	/*
+	 * If PTI is in use, then the entries array is not mapped while we're
+	 * in user mode. The whole array will be aliased at the address
+	 * given by ldt_slot_va(slot). We use two slots so that we can allocate
+	 * and map, and enable a new LDT without invalidating the mapping
+	 * of an older, still-in-use LDT.
+	 *
+	 * slot will be -1 if this LDT doesn't have an alias mapping.
+	 */
+	int slot;
 };
+/* This is a multiple of PAGE_SIZE. */
+#define LDT_SLOT_STRIDE (LDT_ENTRIES * LDT_ENTRY_SIZE)
+
+static inline void *ldt_slot_va(int slot)
+{
+#ifdef CONFIG_X86_64
+	return (void *)(LDT_BASE_ADDR + LDT_SLOT_STRIDE * slot);
+#else
+	BUG();
+#endif
+}
+
 /*
  * Used for LDT copy/destruction.
  */
-int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm);
+static inline void init_new_context_ldt(struct mm_struct *mm)
+{
+	mm->context.ldt = NULL;
+	init_rwsem(&mm->context.ldt_usr_sem);
+}
+int ldt_dup_context(struct mm_struct *oldmm, struct mm_struct *mm);
 void destroy_context_ldt(struct mm_struct *mm);
+void ldt_arch_exit_mmap(struct mm_struct *mm);
 #else /* CONFIG_MODIFY_LDT_SYSCALL */
-static inline int init_new_context_ldt(struct task_struct *tsk,
-				       struct mm_struct *mm)
+static inline void init_new_context_ldt(struct mm_struct *mm) { }
+static inline int ldt_dup_context(struct mm_struct *oldmm,
+				  struct mm_struct *mm)
 {
 	return 0;
 }
-static inline void destroy_context_ldt(struct mm_struct *mm) {}
+static inline void destroy_context_ldt(struct mm_struct *mm) { }
+static inline void ldt_arch_exit_mmap(struct mm_struct *mm) { }
 #endif
 static inline void load_mm_ldt(struct mm_struct *mm)
@@ -69,8 +103,8 @@
 #ifdef CONFIG_MODIFY_LDT_SYSCALL
 	struct ldt_struct *ldt;
-	/* lockless_dereference synchronizes with smp_store_release */
-	ldt = lockless_dereference(mm->context.ldt);
+	/* READ_ONCE synchronizes with smp_store_release */
+	ldt = READ_ONCE(mm->context.ldt);
	/*
	 * Any change to mm->context.ldt is followed by an IPI to all
@@ -86,10 +120,31 @@
	 * that we can see.
	 */
-	if (unlikely(ldt))
-		set_ldt(ldt->entries, ldt->nr_entries);
-	else
+	if (unlikely(ldt)) {
+		if (static_cpu_has(X86_FEATURE_PTI)) {
+			if (WARN_ON_ONCE((unsigned long)ldt->slot > 1)) {
+				/*
+				 * Whoops -- either the new LDT isn't mapped
+				 * (if slot == -1) or is mapped into a bogus
+				 * slot (if slot > 1).
+				 */
+				clear_LDT();
+				return;
+			}
+
+			/*
+			 * If page table isolation is enabled, ldt->entries
+			 * will not be mapped in the userspace pagetables.
+			 * Tell the CPU to access the LDT through the alias
+			 * at ldt_slot_va(ldt->slot).
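+			 *
+			 * (Illustrative note, not part of the original patch:
+			 * per ldt_slot_va() above, the alias for slot s sits
+			 * at LDT_BASE_ADDR + s * LDT_SLOT_STRIDE, so only the
+			 * base address changes; ldt->nr_entries still
+			 * supplies the limit.)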
+ */ + set_ldt(ldt_slot_va(ldt->slot), ldt->nr_entries); + } else { + set_ldt(ldt->entries, ldt->nr_entries); + } + } else { clear_LDT(); + } #else clear_LDT(); #endif @@ -123,24 +178,26 @@ DEBUG_LOCKS_WARN_ON(preemptible()); } -static inline void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk) -{ - if (this_cpu_read(cpu_tlbstate.state) == TLBSTATE_OK) - this_cpu_write(cpu_tlbstate.state, TLBSTATE_LAZY); -} +void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); static inline int init_new_context(struct task_struct *tsk, struct mm_struct *mm) { - #ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS + mutex_init(&mm->context.lock); + + mm->context.ctx_id = atomic64_inc_return(&last_mm_ctx_id); + atomic64_set(&mm->context.tlb_gen, 0); + +#ifdef CONFIG_X86_INTEL_MEMORY_PROTECTION_KEYS if (cpu_feature_enabled(X86_FEATURE_OSPKE)) { /* pkey 0 is the default and always allocated */ mm->context.pkey_allocation_map = 0x1; /* -1 means unallocated or invalid */ mm->context.execute_only_pkey = -1; } - #endif - return init_new_context_ldt(tsk, mm); +#endif + init_new_context_ldt(mm); + return 0; } static inline void destroy_context(struct mm_struct *mm) { @@ -173,15 +230,16 @@ } while (0) #endif -static inline void arch_dup_mmap(struct mm_struct *oldmm, - struct mm_struct *mm) +static inline int arch_dup_mmap(struct mm_struct *oldmm, struct mm_struct *mm) { paravirt_arch_dup_mmap(oldmm, mm); + return ldt_dup_context(oldmm, mm); } static inline void arch_exit_mmap(struct mm_struct *mm) { paravirt_arch_exit_mmap(mm); + ldt_arch_exit_mmap(mm); } #ifdef CONFIG_X86_64 @@ -278,7 +336,6 @@ return __pkru_allows_pkey(vma_pkey(vma), write); } - /* * This can be used from process context to figure out what the value of * CR3 is without needing to do a (slow) __read_cr3(). @@ -288,7 +345,8 @@ */ static inline unsigned long __get_current_cr3_fast(void) { - unsigned long cr3 = __pa(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd); + unsigned long cr3 = build_cr3(this_cpu_read(cpu_tlbstate.loaded_mm)->pgd, + this_cpu_read(cpu_tlbstate.loaded_mm_asid)); /* For now, be very restrictive about when this can be called. */ VM_WARN_ON(in_nmi() || preemptible()); --- linux-azure-4.13.0.orig/arch/x86/include/asm/module.h +++ linux-azure-4.13.0/arch/x86/include/asm/module.h @@ -2,6 +2,15 @@ #define _ASM_X86_MODULE_H #include +#include + +struct mod_arch_specific { +#ifdef CONFIG_UNWINDER_ORC + unsigned int num_orcs; + int *orc_unwind_ip; + struct orc_entry *orc_unwind; +#endif +}; #ifdef CONFIG_X86_64 /* X86_64 does not define MODULE_PROC_FAMILY */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/mshyperv.h +++ linux-azure-4.13.0/arch/x86/include/asm/mshyperv.h @@ -3,7 +3,9 @@ #include #include +#include #include +#include /* * The below CPUID leaves are present if VersionAndFeatures.HypervisorPresent @@ -28,6 +30,8 @@ u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -168,8 +172,146 @@ #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = (input) ? virt_to_phys(input) : 0; + u64 output_address = (output) ? 
virt_to_phys(output) : 0;
+#ifdef CONFIG_X86_64
+	u64 hv_status;
+
+	if (!hv_hypercall_pg)
+		return (u64)ULLONG_MAX;
+
+	__asm__ __volatile__("mov %3, %%r8\n"
+			     CALL_NOSPEC
+			     : "=a" (hv_status),
+			       "+c" (control), "+d" (input_address)
+			     : "r" (output_address),
+			       THUNK_TARGET(hv_hypercall_pg)
+			     : "cc", "memory", "r8", "r9", "r10", "r11");
+
+	return hv_status;
+
+#else
+	u32 control_hi = control >> 32;
+	u32 control_lo = control & 0xFFFFFFFF;
+	u32 input_address_hi = input_address >> 32;
+	u32 input_address_lo = input_address & 0xFFFFFFFF;
+	u32 output_address_hi = output_address >> 32;
+	u32 output_address_lo = output_address & 0xFFFFFFFF;
+
+	if (!hv_hypercall_pg)
+		return (u64)ULLONG_MAX;
+
+	__asm__ __volatile__(CALL_NOSPEC
+			     : "+a" (control_lo), "+d" (control_hi),
+			       "+c" (input_address_lo)
+			     : "b" (input_address_hi),
+			       "D"(output_address_hi), "S"(output_address_lo),
+			       THUNK_TARGET(hv_hypercall_pg)
+			     : "cc", "memory");
+
+	return control_lo | ((u64)control_hi << 32);
+#endif /* !x86_64 */
+}
+
+#define HV_HYPERCALL_RESULT_MASK GENMASK_ULL(15, 0)
+#define HV_HYPERCALL_FAST_BIT BIT(16)
+#define HV_HYPERCALL_VARHEAD_OFFSET 17
+#define HV_HYPERCALL_REP_COMP_OFFSET 32
+#define HV_HYPERCALL_REP_COMP_MASK GENMASK_ULL(43, 32)
+#define HV_HYPERCALL_REP_START_OFFSET 48
+#define HV_HYPERCALL_REP_START_MASK GENMASK_ULL(59, 48)
+
+/* Fast hypercall with 8 bytes of input and no output */
+static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1)
+{
+	u64 hv_status, control = (u64)code | HV_HYPERCALL_FAST_BIT;
+	register void *__sp asm(_ASM_SP);
+
+#ifdef CONFIG_X86_64
+	{
+		__asm__ __volatile__(CALL_NOSPEC
+				     : "=a" (hv_status), "+r" (__sp),
+				       "+c" (control), "+d" (input1)
+				     : THUNK_TARGET(hv_hypercall_pg)
+				     : "cc", "r8", "r9", "r10", "r11");
+	}
+#else
+	{
+		u32 input1_hi = upper_32_bits(input1);
+		u32 input1_lo = lower_32_bits(input1);
+
+		__asm__ __volatile__ (CALL_NOSPEC
+				      : "=A"(hv_status),
+					"+c"(input1_lo),
+					"+r"(__sp)
+				      : "A" (control),
+					"b" (input1_hi),
+					THUNK_TARGET(hv_hypercall_pg)
+				      : "cc", "edi", "esi");
+	}
+#endif
+	return hv_status;
+}
+
+/*
+ * Rep hypercalls. Callers of these functions are supposed to ensure that
+ * rep_count and varhead_size comply with the union hv_hypercall_input
+ * definition.
+ */
+static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size,
+				      void *input, void *output)
+{
+	union hv_hypercall_input hc_input = { .code = code,
+					      .varhead_size = varhead_size,
+					      .rep_count = rep_count};
+	u64 status;
+
+	do {
+		status = hv_do_hypercall(hc_input.as_uint64, input, output);
+		if ((status & 0xffff) != HV_STATUS_SUCCESS)
+			return status;
+
+		hc_input.rep_start = (status >> 32) & 0xfff;
+
+		touch_nmi_watchdog();
+	} while (hc_input.rep_start < hc_input.rep_count);
+
+	return status;
+}
+
+/*
+ * The hypervisor's notion of virtual processor ID is different from
+ * Linux's notion of CPU ID. This information can only be retrieved
+ * in the context of the calling CPU. Set up a map for easy access
+ * to this information.
+ */
+extern u32 __percpu *hv_vp_index;
+extern u32 hv_max_vp_index;
+
+/**
+ * hv_cpu_number_to_vp_number() - Map CPU to VP.
+ * @cpu_number: CPU number in Linux terms
+ *
+ * This function returns the mapping between the Linux processor
+ * number and the hypervisor's virtual processor number, useful
+ * when making hypercalls and other operations that address specific
+ * processors.
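+ *
+ * (Illustrative sketch, not part of the original patch: a hypothetical
+ * per-CPU hypercall would fill its input page with
+ *
+ *	input->vp_index = hv_cpu_number_to_vp_number(smp_processor_id());
+ *
+ * before issuing hv_do_hypercall(); 'input->vp_index' is a made-up
+ * field name here.)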
+ * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + WARN_ON(hv_vp_index[cpu_number] == -1); + + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); --- linux-azure-4.13.0.orig/arch/x86/include/asm/msr-index.h +++ linux-azure-4.13.0/arch/x86/include/asm/msr-index.h @@ -41,6 +41,9 @@ #define MSR_PPIN_CTL 0x0000004e #define MSR_PPIN 0x0000004f +#define MSR_IA32_SPEC_CTRL 0x00000048 +#define MSR_IA32_PRED_CMD 0x00000049 + #define MSR_IA32_PERFCTR0 0x000000c1 #define MSR_IA32_PERFCTR1 0x000000c2 #define MSR_FSB_FREQ 0x000000cd @@ -342,6 +345,7 @@ #define MSR_F15H_NB_PERF_CTR 0xc0010241 #define MSR_F15H_PTSC 0xc0010280 #define MSR_F15H_IC_CFG 0xc0011021 +#define MSR_F15H_IC_CFG_DIS_IND BIT_ULL(14) /* Fam 10h MSRs */ #define MSR_FAM10H_MMIO_CONF_BASE 0xc0010058 @@ -351,6 +355,9 @@ #define FAM10H_MMIO_CONF_BASE_MASK 0xfffffffULL #define FAM10H_MMIO_CONF_BASE_SHIFT 20 #define MSR_FAM10H_NODE_ID 0xc001100c +#define MSR_F10H_DECFG 0xc0011029 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT 1 +#define MSR_F10H_DECFG_LFENCE_SERIALIZE BIT_ULL(MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT) /* K8 MSRs */ #define MSR_K8_TOP_MEM1 0xc001001a @@ -434,6 +441,8 @@ #define FEATURE_CONTROL_VMXON_ENABLED_INSIDE_SMX (1<<1) #define FEATURE_CONTROL_VMXON_ENABLED_OUTSIDE_SMX (1<<2) #define FEATURE_CONTROL_LMCE (1<<20) +#define FEATURE_ENABLE_IBRS (1<<0) +#define FEATURE_SET_IBPB (1<<0) #define MSR_IA32_APICBASE 0x0000001b #define MSR_IA32_APICBASE_BSP (1<<8) --- linux-azure-4.13.0.orig/arch/x86/include/asm/msr.h +++ linux-azure-4.13.0/arch/x86/include/asm/msr.h @@ -213,8 +213,7 @@ * that some other imaginary CPU is updating continuously with a * time stamp. */ - alternative_2("", "mfence", X86_FEATURE_MFENCE_RDTSC, - "lfence", X86_FEATURE_LFENCE_RDTSC); + alternative("", "lfence", X86_FEATURE_LFENCE_RDTSC); return rdtsc(); } --- linux-azure-4.13.0.orig/arch/x86/include/asm/mwait.h +++ linux-azure-4.13.0/arch/x86/include/asm/mwait.h @@ -5,6 +5,8 @@ #include #include +#include +#include #define MWAIT_SUBSTATE_MASK 0xf #define MWAIT_CSTATE_MASK 0xf @@ -105,9 +107,15 @@ mb(); } + if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + __monitor((void *)¤t_thread_info()->flags, 0, 0); if (!need_resched()) __mwait(eax, ecx); + + if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); } current_clr_polling(); } --- linux-azure-4.13.0.orig/arch/x86/include/asm/nospec-branch.h +++ linux-azure-4.13.0/arch/x86/include/asm/nospec-branch.h @@ -0,0 +1,185 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef __NOSPEC_BRANCH_H__ +#define __NOSPEC_BRANCH_H__ + +#include +#include +#include + +#ifdef __ASSEMBLY__ + +/* + * This should be used immediately before a retpoline alternative. It tells + * objtool where the retpolines are so that it can make sense of the control + * flow by just reading the original instruction(s) and ignoring the + * alternatives. + */ +.macro ANNOTATE_NOSPEC_ALTERNATIVE + .Lannotate_\@: + .pushsection .discard.nospec + .long .Lannotate_\@ - . + .popsection +.endm + +/* + * This should be used immediately before an indirect jump/call. It tells + * objtool the subsequent indirect jump/call is vouched safe for retpoline + * builds. 
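+ *
+ * (Illustrative use, not part of the original patch:
+ *
+ *	ANNOTATE_RETPOLINE_SAFE
+ *	jmp *%rax
+ *
+ * for an indirect branch that has already been audited as safe.)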
+ */ +.macro ANNOTATE_RETPOLINE_SAFE + .Lannotate_\@: + .pushsection .discard.retpoline_safe + _ASM_PTR .Lannotate_\@ + .popsection +.endm + +/* + * These are the bare retpoline primitives for indirect jmp and call. + * Do not use these directly; they only exist to make the ALTERNATIVE + * invocation below less ugly. + */ +.macro RETPOLINE_JMP reg:req + call .Ldo_rop_\@ +.Lspec_trap_\@: + pause + lfence + jmp .Lspec_trap_\@ +.Ldo_rop_\@: + mov \reg, (%_ASM_SP) + ret +.endm + +/* + * This is a wrapper around RETPOLINE_JMP so the called function in reg + * returns to the instruction after the macro. + */ +.macro RETPOLINE_CALL reg:req + jmp .Ldo_call_\@ +.Ldo_retpoline_jmp_\@: + RETPOLINE_JMP \reg +.Ldo_call_\@: + call .Ldo_retpoline_jmp_\@ +.endm + +/* + * JMP_NOSPEC and CALL_NOSPEC macros can be used instead of a simple + * indirect jmp/call which may be susceptible to the Spectre variant 2 + * attack. + */ +.macro JMP_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; jmp *\reg), \ + __stringify(RETPOLINE_JMP \reg), X86_FEATURE_RETPOLINE, \ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; jmp *\reg), X86_FEATURE_RETPOLINE_AMD +#else + jmp *\reg +#endif +.endm + +.macro CALL_NOSPEC reg:req +#ifdef CONFIG_RETPOLINE + ANNOTATE_NOSPEC_ALTERNATIVE + ALTERNATIVE_2 __stringify(ANNOTATE_RETPOLINE_SAFE; call *\reg), \ + __stringify(RETPOLINE_CALL \reg), X86_FEATURE_RETPOLINE,\ + __stringify(lfence; ANNOTATE_RETPOLINE_SAFE; call *\reg), X86_FEATURE_RETPOLINE_AMD +#else + call *\reg +#endif +.endm + +/* This clobbers the BX register */ +.macro FILL_RETURN_BUFFER nr:req ftr:req +#ifdef CONFIG_RETPOLINE + ALTERNATIVE "", "call __clear_rsb", \ftr +#endif +.endm + +#else /* __ASSEMBLY__ */ + +#define ANNOTATE_NOSPEC_ALTERNATIVE \ + "999:\n\t" \ + ".pushsection .discard.nospec\n\t" \ + ".long 999b - .\n\t" \ + ".popsection\n\t" + +#define ANNOTATE_RETPOLINE_SAFE \ + "999:\n\t" \ + ".pushsection .discard.retpoline_safe\n\t" \ + _ASM_PTR " 999b\n\t" \ + ".popsection\n\t" + +#if defined(CONFIG_X86_64) && defined(RETPOLINE) + +/* + * Since the inline asm uses the %V modifier which is only in newer GCC, + * the 64-bit one is dependent on RETPOLINE not CONFIG_RETPOLINE. + */ +# define CALL_NOSPEC \ + ANNOTATE_NOSPEC_ALTERNATIVE \ + ALTERNATIVE( \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + "call __x86_indirect_thunk_%V[thunk_target]\n", \ + X86_FEATURE_RETPOLINE) +# define THUNK_TARGET(addr) [thunk_target] "r" (addr) + +#elif defined(CONFIG_X86_32) && defined(CONFIG_RETPOLINE) +/* + * For i386 we use the original ret-equivalent retpoline, because + * otherwise we'll run out of registers. We don't care about CET + * here, anyway. 
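+ *
+ * (Illustrative note, not part of the original patch: the 901..904
+ * sequence below discards the speculation-trap return address,
+ * pushes the real target and executes 'ret', so the branch is
+ * resolved through the RSB rather than the indirect branch
+ * predictor.)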
+ */ +# define CALL_NOSPEC \ + ALTERNATIVE( \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%[thunk_target]\n", \ + " jmp 904f;\n" \ + " .align 16\n" \ + "901: call 903f;\n" \ + "902: pause;\n" \ + " lfence;\n" \ + " jmp 902b;\n" \ + " .align 16\n" \ + "903: addl $4, %%esp;\n" \ + " pushl %[thunk_target];\n" \ + " ret;\n" \ + " .align 16\n" \ + "904: call 901b;\n", \ + X86_FEATURE_RETPOLINE) + +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#else /* No retpoline for C / inline asm */ +# define CALL_NOSPEC "call *%[thunk_target]\n" +# define THUNK_TARGET(addr) [thunk_target] "rm" (addr) +#endif + +/* The Spectre V2 mitigation variants */ +enum spectre_v2_mitigation { + SPECTRE_V2_NONE, + SPECTRE_V2_RETPOLINE_MINIMAL, + SPECTRE_V2_RETPOLINE_MINIMAL_AMD, + SPECTRE_V2_RETPOLINE_GENERIC, + SPECTRE_V2_RETPOLINE_AMD, + SPECTRE_V2_IBRS, +}; + +/* + * On VMEXIT we must ensure that no RSB predictions learned in the guest + * can be followed in the host, by overwriting the RSB completely. Both + * retpoline and IBRS mitigations for Spectre v2 need this; only on future + * CPUs with IBRS_ATT *might* it be avoided. + */ +static inline void vmexit_fill_RSB(void) +{ +#ifdef CONFIG_RETPOLINE + alternative_input("", + "call __fill_rsb", + X86_FEATURE_RETPOLINE, + ASM_NO_INPUT_CLOBBER(_ASM_BX, "memory")); +#endif +} + +#endif /* __ASSEMBLY__ */ +#endif /* __NOSPEC_BRANCH_H__ */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/orc_lookup.h +++ linux-azure-4.13.0/arch/x86/include/asm/orc_lookup.h @@ -0,0 +1,46 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ +#ifndef _ORC_LOOKUP_H +#define _ORC_LOOKUP_H + +/* + * This is a lookup table for speeding up access to the .orc_unwind table. + * Given an input address offset, the corresponding lookup table entry + * specifies a subset of the .orc_unwind table to search. + * + * Each block represents the end of the previous range and the start of the + * next range. An extra block is added to give the last range an end. + * + * The block size should be a power of 2 to avoid a costly 'div' instruction. + * + * A block size of 256 was chosen because it roughly doubles unwinder + * performance while only adding ~5% to the ORC data footprint. 
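+ *
+ * (Illustrative sketch, not part of the original patch: the lookup
+ * amounts to
+ *
+ *	idx = (ip - LOOKUP_START_IP) / LOOKUP_BLOCK_SIZE;
+ *	search .orc_unwind between orc_lookup[idx] and orc_lookup[idx + 1];
+ *
+ * where the division compiles to a shift because the block size is a
+ * power of 2.)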
+ */ +#define LOOKUP_BLOCK_ORDER 8 +#define LOOKUP_BLOCK_SIZE (1 << LOOKUP_BLOCK_ORDER) + +#ifndef LINKER_SCRIPT + +extern unsigned int orc_lookup[]; +extern unsigned int orc_lookup_end[]; + +#define LOOKUP_START_IP (unsigned long)_stext +#define LOOKUP_STOP_IP (unsigned long)_etext + +#endif /* LINKER_SCRIPT */ + +#endif /* _ORC_LOOKUP_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/orc_types.h +++ linux-azure-4.13.0/arch/x86/include/asm/orc_types.h @@ -0,0 +1,107 @@ +/* + * Copyright (C) 2017 Josh Poimboeuf + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see . + */ + +#ifndef _ORC_TYPES_H +#define _ORC_TYPES_H + +#include +#include + +/* + * The ORC_REG_* registers are base registers which are used to find other + * registers on the stack. + * + * ORC_REG_PREV_SP, also known as DWARF Call Frame Address (CFA), is the + * address of the previous frame: the caller's SP before it called the current + * function. + * + * ORC_REG_UNDEFINED means the corresponding register's value didn't change in + * the current frame. + * + * The most commonly used base registers are SP and BP -- which the previous SP + * is usually based on -- and PREV_SP and UNDEFINED -- which the previous BP is + * usually based on. + * + * The rest of the base registers are needed for special cases like entry code + * and GCC realigned stacks. + */ +#define ORC_REG_UNDEFINED 0 +#define ORC_REG_PREV_SP 1 +#define ORC_REG_DX 2 +#define ORC_REG_DI 3 +#define ORC_REG_BP 4 +#define ORC_REG_SP 5 +#define ORC_REG_R10 6 +#define ORC_REG_R13 7 +#define ORC_REG_BP_INDIRECT 8 +#define ORC_REG_SP_INDIRECT 9 +#define ORC_REG_MAX 15 + +/* + * ORC_TYPE_CALL: Indicates that sp_reg+sp_offset resolves to PREV_SP (the + * caller's SP right before it made the call). Used for all callable + * functions, i.e. all C code and all callable asm functions. + * + * ORC_TYPE_REGS: Used in entry code to indicate that sp_reg+sp_offset points + * to a fully populated pt_regs from a syscall, interrupt, or exception. + * + * ORC_TYPE_REGS_IRET: Used in entry code to indicate that sp_reg+sp_offset + * points to the iret return frame. + * + * The UNWIND_HINT macros are used only for the unwind_hint struct. They + * aren't used in struct orc_entry due to size and complexity constraints. + * Objtool converts them to real types when it converts the hints to orc + * entries. + */ +#define ORC_TYPE_CALL 0 +#define ORC_TYPE_REGS 1 +#define ORC_TYPE_REGS_IRET 2 +#define UNWIND_HINT_TYPE_SAVE 3 +#define UNWIND_HINT_TYPE_RESTORE 4 + +#ifndef __ASSEMBLY__ +/* + * This struct is more or less a vastly simplified version of the DWARF Call + * Frame Information standard. It contains only the necessary parts of DWARF + * CFI, simplified for ease of access by the in-kernel unwinder. It tells the + * unwinder how to find the previous SP and BP (and sometimes entry regs) on + * the stack for a given code address. Each instance of the struct corresponds + * to one or more code locations. 
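+ *
+ * (Illustrative sketch, not part of the original patch: for
+ * ORC_TYPE_CALL the unwinder conceptually computes
+ *
+ *	prev_sp = <value of sp_reg> + sp_offset;
+ *
+ * and recovers BP the same way from bp_reg/bp_offset when bp_reg is
+ * not ORC_REG_UNDEFINED.)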
+ */ +struct orc_entry { + s16 sp_offset; + s16 bp_offset; + unsigned sp_reg:4; + unsigned bp_reg:4; + unsigned type:2; +} __packed; + +/* + * This struct is used by asm and inline asm code to manually annotate the + * location of registers on the stack for the ORC unwinder. + * + * Type can be either ORC_TYPE_* or UNWIND_HINT_TYPE_*. + */ +struct unwind_hint { + u32 ip; + s16 sp_offset; + u8 sp_reg; + u8 type; +}; +#endif /* __ASSEMBLY__ */ + +#endif /* _ORC_TYPES_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/page_64.h +++ linux-azure-4.13.0/arch/x86/include/asm/page_64.h @@ -51,6 +51,10 @@ void copy_page(void *to, void *from); +#ifdef CONFIG_X86_MCE +#define arch_unmap_kpfn arch_unmap_kpfn +#endif + #endif /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_VSYSCALL_EMULATION --- linux-azure-4.13.0.orig/arch/x86/include/asm/paravirt.h +++ linux-azure-4.13.0/arch/x86/include/asm/paravirt.h @@ -6,6 +6,7 @@ #ifdef CONFIG_PARAVIRT #include #include +#include #include @@ -15,10 +16,9 @@ #include #include -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - PVOP_VCALL2(pv_cpu_ops.load_sp0, tss, thread); + PVOP_VCALL1(pv_cpu_ops.load_sp0, sp0); } /* The paravirtualized CPUID instruction. */ @@ -71,11 +71,6 @@ PVOP_VCALL1(pv_mmu_ops.write_cr3, x); } -static inline unsigned long __read_cr4(void) -{ - return PVOP_CALL0(unsigned long, pv_cpu_ops.read_cr4); -} - static inline void __write_cr4(unsigned long x) { PVOP_VCALL1(pv_cpu_ops.write_cr4, x); @@ -228,10 +223,6 @@ { PVOP_VCALL2(pv_cpu_ops.set_ldt, addr, entries); } -static inline void store_idt(struct desc_ptr *dtr) -{ - PVOP_VCALL1(pv_cpu_ops.store_idt, dtr); -} static inline unsigned long paravirt_store_tr(void) { return PVOP_CALL0(unsigned long, pv_cpu_ops.store_tr); @@ -365,12 +356,6 @@ PVOP_VCALL1(pv_mmu_ops.release_p4d, pfn); } -static inline void pte_update(struct mm_struct *mm, unsigned long addr, - pte_t *ptep) -{ - PVOP_VCALL3(pv_mmu_ops.pte_update, mm, addr, ptep); -} - static inline pte_t __pte(pteval_t val) { pteval_t ret; @@ -472,28 +457,6 @@ PVOP_VCALL4(pv_mmu_ops.set_pte_at, mm, addr, ptep, pte.pte); } -static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmd) -{ - if (sizeof(pmdval_t) > sizeof(long)) - /* 5 arg words */ - pv_mmu_ops.set_pmd_at(mm, addr, pmdp, pmd); - else - PVOP_VCALL4(pv_mmu_ops.set_pmd_at, mm, addr, pmdp, - native_pmd_val(pmd)); -} - -static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) -{ - if (sizeof(pudval_t) > sizeof(long)) - /* 5 arg words */ - pv_mmu_ops.set_pud_at(mm, addr, pudp, pud); - else - PVOP_VCALL4(pv_mmu_ops.set_pud_at, mm, addr, pudp, - native_pud_val(pud)); -} - static inline void set_pmd(pmd_t *pmdp, pmd_t pmd) { pmdval_t val = native_pmd_val(pmd); @@ -916,23 +879,27 @@ #define INTERRUPT_RETURN \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_iret), CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret)) + ANNOTATE_RETPOLINE_SAFE; \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_iret);) #define DISABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_disable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_disable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #define ENABLE_INTERRUPTS(clobbers) \ PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_irq_enable), clobbers, \ PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + ANNOTATE_RETPOLINE_SAFE; \ call 
PARA_INDIRECT(pv_irq_ops+PV_IRQ_irq_enable); \ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #ifdef CONFIG_X86_32 #define GET_CR0_INTO_EAX \ push %ecx; push %edx; \ + ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_cpu_ops+PV_CPU_read_cr0); \ pop %edx; pop %ecx #else /* !CONFIG_X86_32 */ @@ -954,21 +921,29 @@ */ #define SWAPGS \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_swapgs), CLBR_NONE, \ - call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs) \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_cpu_ops+PV_CPU_swapgs); \ ) #define GET_CR2_INTO_RAX \ - call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2) - -#define PARAVIRT_ADJUST_EXCEPTION_FRAME \ - PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_adjust_exception_frame), \ - CLBR_NONE, \ - call PARA_INDIRECT(pv_irq_ops+PV_IRQ_adjust_exception_frame)) + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_mmu_ops+PV_MMU_read_cr2); #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(pv_cpu_ops, PV_CPU_usergs_sysret64), \ CLBR_NONE, \ - jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64)) + ANNOTATE_RETPOLINE_SAFE; \ + jmp PARA_INDIRECT(pv_cpu_ops+PV_CPU_usergs_sysret64);) + +#ifdef CONFIG_DEBUG_ENTRY +#define SAVE_FLAGS(clobbers) \ + PARA_SITE(PARA_PATCH(pv_irq_ops, PV_IRQ_save_fl), clobbers, \ + PV_SAVE_REGS(clobbers | CLBR_CALLEE_SAVE); \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_irq_ops+PV_IRQ_save_fl); \ + PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) +#endif + #endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/paravirt_types.h +++ linux-azure-4.13.0/arch/x86/include/asm/paravirt_types.h @@ -42,6 +42,7 @@ #include #include #include +#include struct page; struct thread_struct; @@ -107,7 +108,6 @@ unsigned long (*read_cr0)(void); void (*write_cr0)(unsigned long); - unsigned long (*read_cr4)(void); void (*write_cr4)(unsigned long); #ifdef CONFIG_X86_64 @@ -119,8 +119,6 @@ void (*load_tr_desc)(void); void (*load_gdt)(const struct desc_ptr *); void (*load_idt)(const struct desc_ptr *); - /* store_gdt has been removed. */ - void (*store_idt)(struct desc_ptr *); void (*set_ldt)(const void *desc, unsigned entries); unsigned long (*store_tr)(void); void (*load_tls)(struct thread_struct *t, unsigned int cpu); @@ -136,7 +134,7 @@ void (*alloc_ldt)(struct desc_struct *ldt, unsigned entries); void (*free_ldt)(struct desc_struct *ldt, unsigned entries); - void (*load_sp0)(struct tss_struct *tss, struct thread_struct *t); + void (*load_sp0)(unsigned long sp0); void (*set_iopl_mask)(unsigned mask); @@ -196,9 +194,6 @@ void (*safe_halt)(void); void (*halt)(void); -#ifdef CONFIG_X86_64 - void (*adjust_exception_frame)(void); -#endif } __no_randomize_layout; struct pv_mmu_ops { @@ -248,12 +243,6 @@ void (*set_pte_at)(struct mm_struct *mm, unsigned long addr, pte_t *ptep, pte_t pteval); void (*set_pmd)(pmd_t *pmdp, pmd_t pmdval); - void (*set_pmd_at)(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp, pmd_t pmdval); - void (*set_pud_at)(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pudval); - void (*pte_update)(struct mm_struct *mm, unsigned long addr, - pte_t *ptep); pte_t (*ptep_modify_prot_start)(struct mm_struct *mm, unsigned long addr, pte_t *ptep); @@ -403,7 +392,9 @@ * offset into the paravirt_patch_template structure, and can therefore be * freely converted back into a structure offset. 
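
To illustrate the offset encoding described above: a paravirt call site records only the byte offset of the function pointer within the ops structure, and that offset can be freely converted back into the slot to call through (or to patch). A toy userspace model, with an invented structure that is not the kernel's pv_cpu_ops:

    #include <stddef.h>
    #include <stdio.h>

    /* Toy ops table; names are invented for the example. */
    struct toy_cpu_ops {
            void (*io_delay)(void);
            void (*halt)(void);
    };

    static void native_io_delay(void) { puts("native io_delay"); }

    static struct toy_cpu_ops toy_ops = { .io_delay = native_io_delay };

    int main(void)
    {
            /* The "type" a call site records is just a structure offset... */
            size_t type = offsetof(struct toy_cpu_ops, io_delay);

            /* ...and converting back gives the slot to call (or patch). */
            void (**slot)(void) = (void (**)(void))((char *)&toy_ops + type);

            printf("recorded offset: %zu\n", type);
            (*slot)();
            return 0;
    }
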
*/ -#define PARAVIRT_CALL "call *%c[paravirt_opptr];" +#define PARAVIRT_CALL \ + ANNOTATE_RETPOLINE_SAFE \ + "call *%c[paravirt_opptr];" /* * These macros are intended to wrap calls through one of the paravirt @@ -471,8 +462,8 @@ */ #ifdef CONFIG_X86_32 #define PVOP_VCALL_ARGS \ - unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; \ - register void *__sp asm("esp") + unsigned long __eax = __eax, __edx = __edx, __ecx = __ecx; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "a" ((unsigned long)(x)) @@ -492,8 +483,8 @@ /* [re]ax isn't an arg, but the return val */ #define PVOP_VCALL_ARGS \ unsigned long __edi = __edi, __esi = __esi, \ - __edx = __edx, __ecx = __ecx, __eax = __eax; \ - register void *__sp asm("rsp") + __edx = __edx, __ecx = __ecx, __eax = __eax; + #define PVOP_CALL_ARGS PVOP_VCALL_ARGS #define PVOP_CALL_ARG1(x) "D" ((unsigned long)(x)) @@ -544,7 +535,7 @@ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -554,7 +545,7 @@ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ @@ -581,7 +572,7 @@ asm volatile(pre \ paravirt_alt(PARAVIRT_CALL) \ post \ - : call_clbr, "+r" (__sp) \ + : call_clbr, ASM_CALL_CONSTRAINT \ : paravirt_type(op), \ paravirt_clobber(clbr), \ ##__VA_ARGS__ \ --- linux-azure-4.13.0.orig/arch/x86/include/asm/percpu.h +++ linux-azure-4.13.0/arch/x86/include/asm/percpu.h @@ -525,7 +525,7 @@ { bool oldbit; - asm volatile("bt "__percpu_arg(2)",%1\n\t" + asm volatile("bt "__percpu_arg(2)",%1" CC_SET(c) : CC_OUT(c) (oldbit) : "m" (*(unsigned long __percpu *)addr), "Ir" (nr)); --- linux-azure-4.13.0.orig/arch/x86/include/asm/pgalloc.h +++ linux-azure-4.13.0/arch/x86/include/asm/pgalloc.h @@ -29,6 +29,17 @@ */ extern gfp_t __userpte_alloc_gfp; +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Instead of one PGD, we acquire two PGDs. Being order-1, it is + * both 8k in size and 8k-aligned. That lets us just flip bit 12 + * in a pointer to swap between the two 4k halves. + */ +#define PGD_ALLOCATION_ORDER 1 +#else +#define PGD_ALLOCATION_ORDER 0 +#endif + /* * Allocate and free page tables. 
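
A quick model of the bit-12 trick described above, as plain userspace C (aligned_alloc stands in for the order-1 page allocation, and XOR toggles the bit; the kernel header uses explicit set/clear helpers for each direction):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdlib.h>

    #define PAGE_SHIFT  12
    #define PAGE_SIZE   (1UL << PAGE_SHIFT)

    int main(void)
    {
            /* Stand-in for the order-1 allocation: 8k, 8k-aligned. */
            void *kernel_pgd = aligned_alloc(2 * PAGE_SIZE, 2 * PAGE_SIZE);

            /* Bit 12 selects which 4k half: kernel copy vs. user copy. */
            void *user_pgd = (void *)((uintptr_t)kernel_pgd ^ (1UL << PAGE_SHIFT));

            printf("kernel half %p, user half %p\n", kernel_pgd, user_pgd);
            free(kernel_pgd);
            return 0;
    }
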
*/ --- linux-azure-4.13.0.orig/arch/x86/include/asm/pgtable.h +++ linux-azure-4.13.0/arch/x86/include/asm/pgtable.h @@ -17,6 +17,7 @@ #include void ptdump_walk_pgd_level(struct seq_file *m, pgd_t *pgd); +void ptdump_walk_pgd_level_debugfs(struct seq_file *m, pgd_t *pgd, bool user); void ptdump_walk_pgd_level_checkwx(void); #ifdef CONFIG_DEBUG_WX @@ -43,8 +44,6 @@ #else /* !CONFIG_PARAVIRT */ #define set_pte(ptep, pte) native_set_pte(ptep, pte) #define set_pte_at(mm, addr, ptep, pte) native_set_pte_at(mm, addr, ptep, pte) -#define set_pmd_at(mm, addr, pmdp, pmd) native_set_pmd_at(mm, addr, pmdp, pmd) -#define set_pud_at(mm, addr, pudp, pud) native_set_pud_at(mm, addr, pudp, pud) #define set_pte_atomic(ptep, pte) \ native_set_pte_atomic(ptep, pte) @@ -75,8 +74,6 @@ #define pte_clear(mm, addr, ptep) native_pte_clear(mm, addr, ptep) #define pmd_clear(pmd) native_pmd_clear(pmd) -#define pte_update(mm, addr, ptep) do { } while (0) - #define pgd_val(x) native_pgd_val(x) #define __pgd(x) native_make_pgd(x) @@ -835,7 +832,12 @@ static inline int p4d_bad(p4d_t p4d) { - return (p4d_flags(p4d) & ~(_KERNPG_TABLE | _PAGE_USER)) != 0; + unsigned long ignore_flags = _KERNPG_TABLE | _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (p4d_flags(p4d) & ~ignore_flags) != 0; } #endif /* CONFIG_PGTABLE_LEVELS > 3 */ @@ -869,7 +871,12 @@ static inline int pgd_bad(pgd_t pgd) { - return (pgd_flags(pgd) & ~_PAGE_USER) != _KERNPG_TABLE; + unsigned long ignore_flags = _PAGE_USER; + + if (IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + ignore_flags |= _PAGE_NX; + + return (pgd_flags(pgd) & ~ignore_flags) != _KERNPG_TABLE; } static inline int pgd_none(pgd_t pgd) @@ -898,7 +905,11 @@ * pgd_offset() returns a (pgd_t *) * pgd_index() is used get the offset into the pgd page's array of pgd_t's; */ -#define pgd_offset(mm, address) ((mm)->pgd + pgd_index((address))) +#define pgd_offset_pgd(pgd, address) (pgd + pgd_index((address))) +/* + * a shortcut to get a pgd_t in a given mm + */ +#define pgd_offset(mm, address) pgd_offset_pgd((mm)->pgd, (address)) /* * a shortcut which implies the use of the kernel's pgd, instead * of a process's @@ -965,31 +976,18 @@ native_set_pte(ptep, pte); } -static inline void native_set_pmd_at(struct mm_struct *mm, unsigned long addr, - pmd_t *pmdp , pmd_t pmd) +static inline void set_pmd_at(struct mm_struct *mm, unsigned long addr, + pmd_t *pmdp, pmd_t pmd) { native_set_pmd(pmdp, pmd); } -static inline void native_set_pud_at(struct mm_struct *mm, unsigned long addr, - pud_t *pudp, pud_t pud) +static inline void set_pud_at(struct mm_struct *mm, unsigned long addr, + pud_t *pudp, pud_t pud) { native_set_pud(pudp, pud); } -#ifndef CONFIG_PARAVIRT -/* - * Rules for using pte_update - it must be called after any PTE update which - * has not been done using the set_pte / clear_pte interfaces. It is used by - * shadow mode hypervisors to resynchronize the shadow page tables. Kernel PTE - * updates should either be sets, clears, or set_pte_atomic for P->P - * transitions, which means this hook should only be called for user PTEs. - * This hook implies a P->P protection or access change has taken place, which - * requires a subsequent TLB flush. 
- */ -#define pte_update(mm, addr, ptep) do { } while (0) -#endif - /* * We only update the dirty/accessed state if we set * the dirty bit by hand in the kernel, since the hardware @@ -1017,7 +1015,6 @@ pte_t *ptep) { pte_t pte = native_ptep_get_and_clear(ptep); - pte_update(mm, addr, ptep); return pte; } @@ -1044,7 +1041,6 @@ unsigned long addr, pte_t *ptep) { clear_bit(_PAGE_BIT_RW, (unsigned long *)&ptep->pte); - pte_update(mm, addr, ptep); } #define flush_tlb_fix_spurious_fault(vma, address) do { } while (0) @@ -1109,7 +1105,14 @@ */ static inline void clone_pgd_range(pgd_t *dst, pgd_t *src, int count) { - memcpy(dst, src, count * sizeof(pgd_t)); + memcpy(dst, src, count * sizeof(pgd_t)); +#ifdef CONFIG_PAGE_TABLE_ISOLATION + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + /* Clone the user space pgd as well */ + memcpy(kernel_to_user_pgdp(dst), kernel_to_user_pgdp(src), + count * sizeof(pgd_t)); +#endif } #define PTE_SHIFT ilog2(PTRS_PER_PTE) --- linux-azure-4.13.0.orig/arch/x86/include/asm/pgtable_32_types.h +++ linux-azure-4.13.0/arch/x86/include/asm/pgtable_32_types.h @@ -37,13 +37,23 @@ #define LAST_PKMAP 1024 #endif -#define PKMAP_BASE ((FIXADDR_START - PAGE_SIZE * (LAST_PKMAP + 1)) \ - & PMD_MASK) +/* + * Define this here and validate with BUILD_BUG_ON() in pgtable_32.c + * to avoid include recursion hell + */ +#define CPU_ENTRY_AREA_PAGES (NR_CPUS * 40) + +#define CPU_ENTRY_AREA_BASE \ + ((FIXADDR_TOT_START - PAGE_SIZE * (CPU_ENTRY_AREA_PAGES + 1)) \ + & PMD_MASK) + +#define PKMAP_BASE \ + ((CPU_ENTRY_AREA_BASE - PAGE_SIZE) & PMD_MASK) #ifdef CONFIG_HIGHMEM # define VMALLOC_END (PKMAP_BASE - 2 * PAGE_SIZE) #else -# define VMALLOC_END (FIXADDR_START - 2 * PAGE_SIZE) +# define VMALLOC_END (CPU_ENTRY_AREA_BASE - 2 * PAGE_SIZE) #endif #define MODULES_VADDR VMALLOC_START --- linux-azure-4.13.0.orig/arch/x86/include/asm/pgtable_64.h +++ linux-azure-4.13.0/arch/x86/include/asm/pgtable_64.h @@ -130,9 +130,97 @@ #endif } +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * All top-level PAGE_TABLE_ISOLATION page tables are order-1 pages + * (8k-aligned and 8k in size). The kernel one is at the beginning 4k and + * the user one is in the last 4k. To switch between them, you + * just need to flip the 12th bit in their addresses. + */ +#define PTI_PGTABLE_SWITCH_BIT PAGE_SHIFT + +/* + * This generates better code than the inline assembly in + * __set_bit(). + */ +static inline void *ptr_set_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr |= BIT(bit); + return (void *)__ptr; +} +static inline void *ptr_clear_bit(void *ptr, int bit) +{ + unsigned long __ptr = (unsigned long)ptr; + + __ptr &= ~BIT(bit); + return (void *)__ptr; +} + +static inline pgd_t *kernel_to_user_pgdp(pgd_t *pgdp) +{ + return ptr_set_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline pgd_t *user_to_kernel_pgdp(pgd_t *pgdp) +{ + return ptr_clear_bit(pgdp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *kernel_to_user_p4dp(p4d_t *p4dp) +{ + return ptr_set_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} + +static inline p4d_t *user_to_kernel_p4dp(p4d_t *p4dp) +{ + return ptr_clear_bit(p4dp, PTI_PGTABLE_SWITCH_BIT); +} +#endif /* CONFIG_PAGE_TABLE_ISOLATION */ + +/* + * Page table pages are page-aligned. The lower half of the top + * level is used for userspace and the top half for the kernel. + * + * Returns true for parts of the PGD that map userspace and + * false for the parts that map the kernel. 
+ */ +static inline bool pgdp_maps_userspace(void *__ptr) +{ + unsigned long ptr = (unsigned long)__ptr; + + return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2); +} + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +pgd_t __pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd); + +/* + * Take a PGD location (pgdp) and a pgd value that needs to be set there. + * Populates the user and returns the resulting PGD that must be set in + * the kernel copy of the page tables. + */ +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + if (!static_cpu_has(X86_FEATURE_PTI)) + return pgd; + return __pti_set_user_pgd(pgdp, pgd); +} +#else +static inline pgd_t pti_set_user_pgd(pgd_t *pgdp, pgd_t pgd) +{ + return pgd; +} +#endif + static inline void native_set_p4d(p4d_t *p4dp, p4d_t p4d) { +#if defined(CONFIG_PAGE_TABLE_ISOLATION) && !defined(CONFIG_X86_5LEVEL) + p4dp->pgd = pti_set_user_pgd(&p4dp->pgd, p4d.pgd); +#else *p4dp = p4d; +#endif } static inline void native_p4d_clear(p4d_t *p4d) @@ -146,7 +234,11 @@ static inline void native_set_pgd(pgd_t *pgdp, pgd_t pgd) { +#ifdef CONFIG_PAGE_TABLE_ISOLATION + *pgdp = pti_set_user_pgd(pgdp, pgd); +#else *pgdp = pgd; +#endif } static inline void native_pgd_clear(pgd_t *pgd) --- linux-azure-4.13.0.orig/arch/x86/include/asm/pgtable_64_types.h +++ linux-azure-4.13.0/arch/x86/include/asm/pgtable_64_types.h @@ -74,33 +74,52 @@ #define PGDIR_SIZE (_AC(1, UL) << PGDIR_SHIFT) #define PGDIR_MASK (~(PGDIR_SIZE - 1)) -/* See Documentation/x86/x86_64/mm.txt for a description of the memory map. */ -#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) +/* + * See Documentation/x86/x86_64/mm.txt for a description of the memory map. + * + * Be very careful vs. KASLR when changing anything here. The KASLR address + * range must not overlap with anything except the KASAN shadow area, which + * is correct as KASAN disables KASLR. 
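
Tying the helper above to concrete numbers: with 8-byte PGD entries in a 4k page, indices 0 through 255 sit in the lower half (userspace) and 256 through 511 in the upper half (kernel). A userspace sketch (the helper body is mirrored from this hunk; the fake table address is arbitrary and never dereferenced):

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE   4096UL
    #define PAGE_MASK   (~(PAGE_SIZE - 1))

    /* Mirrors pgdp_maps_userspace() from this hunk. */
    static bool pgdp_maps_userspace(void *__ptr)
    {
            uintptr_t ptr = (uintptr_t)__ptr;

            return (ptr & ~PAGE_MASK) < (PAGE_SIZE / 2);
    }

    int main(void)
    {
            /* Arbitrary page-aligned "PGD page"; pgd_t entries are 8 bytes. */
            uint64_t *pgd = (uint64_t *)0x7000;

            printf("index   0 -> %s\n", pgdp_maps_userspace(&pgd[0])   ? "user" : "kernel");
            printf("index 255 -> %s\n", pgdp_maps_userspace(&pgd[255]) ? "user" : "kernel");
            printf("index 256 -> %s\n", pgdp_maps_userspace(&pgd[256]) ? "user" : "kernel");
            return 0;
    }
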
+ */ +#define MAXMEM _AC(__AC(1, UL) << MAX_PHYSMEM_BITS, UL) + #ifdef CONFIG_X86_5LEVEL -#define VMALLOC_SIZE_TB _AC(16384, UL) -#define __VMALLOC_BASE _AC(0xff92000000000000, UL) -#define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define VMALLOC_SIZE_TB _AC(12800, UL) +# define __VMALLOC_BASE _AC(0xffa0000000000000, UL) +# define __VMEMMAP_BASE _AC(0xffd4000000000000, UL) +# define LDT_PGD_ENTRY _AC(-112, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #else -#define VMALLOC_SIZE_TB _AC(32, UL) -#define __VMALLOC_BASE _AC(0xffffc90000000000, UL) -#define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define VMALLOC_SIZE_TB _AC(32, UL) +# define __VMALLOC_BASE _AC(0xffffc90000000000, UL) +# define __VMEMMAP_BASE _AC(0xffffea0000000000, UL) +# define LDT_PGD_ENTRY _AC(-3, UL) +# define LDT_BASE_ADDR (LDT_PGD_ENTRY << PGDIR_SHIFT) #endif + #ifdef CONFIG_RANDOMIZE_MEMORY -#define VMALLOC_START vmalloc_base -#define VMEMMAP_START vmemmap_base +# define VMALLOC_START vmalloc_base +# define VMEMMAP_START vmemmap_base #else -#define VMALLOC_START __VMALLOC_BASE -#define VMEMMAP_START __VMEMMAP_BASE +# define VMALLOC_START __VMALLOC_BASE +# define VMEMMAP_START __VMEMMAP_BASE #endif /* CONFIG_RANDOMIZE_MEMORY */ -#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) -#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) + +#define VMALLOC_END (VMALLOC_START + _AC((VMALLOC_SIZE_TB << 40) - 1, UL)) + +#define MODULES_VADDR (__START_KERNEL_map + KERNEL_IMAGE_SIZE) /* The module sections ends with the start of the fixmap */ -#define MODULES_END __fix_to_virt(__end_of_fixed_addresses + 1) -#define MODULES_LEN (MODULES_END - MODULES_VADDR) -#define ESPFIX_PGD_ENTRY _AC(-2, UL) -#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) -#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) -#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) +#define MODULES_END _AC(0xffffffffff000000, UL) +#define MODULES_LEN (MODULES_END - MODULES_VADDR) + +#define ESPFIX_PGD_ENTRY _AC(-2, UL) +#define ESPFIX_BASE_ADDR (ESPFIX_PGD_ENTRY << P4D_SHIFT) + +#define CPU_ENTRY_AREA_PGD _AC(-4, UL) +#define CPU_ENTRY_AREA_BASE (CPU_ENTRY_AREA_PGD << P4D_SHIFT) + +#define EFI_VA_START ( -4 * (_AC(1, UL) << 30)) +#define EFI_VA_END (-68 * (_AC(1, UL) << 30)) #define EARLY_DYNAMIC_PAGE_TABLES 64 --- linux-azure-4.13.0.orig/arch/x86/include/asm/pgtable_types.h +++ linux-azure-4.13.0/arch/x86/include/asm/pgtable_types.h @@ -121,10 +121,9 @@ #define _PAGE_PROTNONE (_AT(pteval_t, 1) << _PAGE_BIT_PROTNONE) -#define _PAGE_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_USER | \ - _PAGE_ACCESSED | _PAGE_DIRTY) #define _KERNPG_TABLE (_PAGE_PRESENT | _PAGE_RW | _PAGE_ACCESSED | \ _PAGE_DIRTY) +#define _PAGE_TABLE (_KERNPG_TABLE | _PAGE_USER) /* * Set of bits not changed in pte_modify. 
The pte's --- linux-azure-4.13.0.orig/arch/x86/include/asm/preempt.h +++ linux-azure-4.13.0/arch/x86/include/asm/preempt.h @@ -100,19 +100,14 @@ #ifdef CONFIG_PREEMPT extern asmlinkage void ___preempt_schedule(void); -# define __preempt_schedule() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule" : "+r"(__sp)); \ -}) +# define __preempt_schedule() \ + asm volatile ("call ___preempt_schedule" : ASM_CALL_CONSTRAINT) extern asmlinkage void preempt_schedule(void); extern asmlinkage void ___preempt_schedule_notrace(void); -# define __preempt_schedule_notrace() \ -({ \ - register void *__sp asm(_ASM_SP); \ - asm volatile ("call ___preempt_schedule_notrace" : "+r"(__sp)); \ -}) +# define __preempt_schedule_notrace() \ + asm volatile ("call ___preempt_schedule_notrace" : ASM_CALL_CONSTRAINT) + extern asmlinkage void preempt_schedule_notrace(void); #endif --- linux-azure-4.13.0.orig/arch/x86/include/asm/processor-flags.h +++ linux-azure-4.13.0/arch/x86/include/asm/processor-flags.h @@ -35,6 +35,12 @@ /* Mask off the address space ID bits. */ #define CR3_ADDR_MASK 0x7FFFFFFFFFFFF000ull #define CR3_PCID_MASK 0xFFFull +#define CR3_NOFLUSH (1UL << 63) + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define X86_CR3_PTI_PCID_USER_BIT 11 +#endif + #else /* * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save @@ -42,6 +48,7 @@ */ #define CR3_ADDR_MASK 0xFFFFFFFFull #define CR3_PCID_MASK 0ull +#define CR3_NOFLUSH 0 #endif #endif /* _ASM_X86_PROCESSOR_FLAGS_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/processor.h +++ linux-azure-4.13.0/arch/x86/include/asm/processor.h @@ -159,9 +159,11 @@ extern struct cpuinfo_x86 boot_cpu_data; extern struct cpuinfo_x86 new_cpu_data; -extern struct tss_struct doublefault_tss; -extern __u32 cpu_caps_cleared[NCAPINTS]; -extern __u32 cpu_caps_set[NCAPINTS]; +#include + +extern struct x86_hw_tss doublefault_tss; +extern __u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +extern __u32 cpu_caps_set[NCAPINTS + NBUGINTS]; #ifdef CONFIG_SMP DECLARE_PER_CPU_READ_MOSTLY(struct cpuinfo_x86, cpu_info); @@ -244,6 +246,11 @@ write_cr3(__pa(pgdir)); } +/* + * Note that while the legacy 'TSS' name comes from 'Task State Segment', + * on modern x86 CPUs the TSS also holds information important to 64-bit mode, + * unrelated to the task-switch mechanism: + */ #ifdef CONFIG_X86_32 /* This is the TSS defined by the hardware. */ struct x86_hw_tss { @@ -296,7 +303,13 @@ struct x86_hw_tss { u32 reserved1; u64 sp0; + + /* + * We store cpu_current_top_of_stack in sp1 so it's always accessible. + * Linux does not use ring 1, so sp1 is not otherwise needed. + */ u64 sp1; + u64 sp2; u64 reserved2; u64 ist[7]; @@ -314,12 +327,22 @@ #define IO_BITMAP_BITS 65536 #define IO_BITMAP_BYTES (IO_BITMAP_BITS/8) #define IO_BITMAP_LONGS (IO_BITMAP_BYTES/sizeof(long)) -#define IO_BITMAP_OFFSET offsetof(struct tss_struct, io_bitmap) +#define IO_BITMAP_OFFSET (offsetof(struct tss_struct, io_bitmap) - offsetof(struct tss_struct, x86_tss)) #define INVALID_IO_BITMAP_OFFSET 0x8000 +struct entry_stack { + unsigned long words[64]; +}; + +struct entry_stack_page { + struct entry_stack stack; +} __aligned(PAGE_SIZE); + struct tss_struct { /* - * The hardware state: + * The fixed hardware portion. This must not cross a page boundary + * at risk of violating the SDM's advice and potentially triggering + * errata. */ struct x86_hw_tss x86_tss; @@ -330,18 +353,9 @@ * be within the limit. 
*/ unsigned long io_bitmap[IO_BITMAP_LONGS + 1]; +} __aligned(PAGE_SIZE); -#ifdef CONFIG_X86_32 - /* - * Space for the temporary SYSENTER stack. - */ - unsigned long SYSENTER_stack_canary; - unsigned long SYSENTER_stack[64]; -#endif - -} ____cacheline_aligned; - -DECLARE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss); +DECLARE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw); /* * sizeof(unsigned long) coming from an extra "long" at the end @@ -355,6 +369,9 @@ #ifdef CONFIG_X86_32 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack); +#else +/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */ +#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1 #endif /* @@ -423,7 +440,9 @@ struct thread_struct { /* Cached TLS descriptors: */ struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES]; +#ifdef CONFIG_X86_32 unsigned long sp0; +#endif unsigned long sp; #ifdef CONFIG_X86_32 unsigned long sysenter_cs; @@ -510,16 +529,9 @@ } static inline void -native_load_sp0(struct tss_struct *tss, struct thread_struct *thread) +native_load_sp0(unsigned long sp0) { - tss->x86_tss.sp0 = thread->sp0; -#ifdef CONFIG_X86_32 - /* Only happens when SEP is enabled, no need to test "SEP"arately: */ - if (unlikely(tss->x86_tss.ss1 != thread->sysenter_cs)) { - tss->x86_tss.ss1 = thread->sysenter_cs; - wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); - } -#endif + this_cpu_write(cpu_tss_rw.x86_tss.sp0, sp0); } static inline void native_swapgs(void) @@ -531,12 +543,18 @@ static inline unsigned long current_top_of_stack(void) { -#ifdef CONFIG_X86_64 - return this_cpu_read_stable(cpu_tss.x86_tss.sp0); -#else - /* sp0 on x86_32 is special in and around vm86 mode. */ + /* + * We can't read directly from tss.sp0: sp0 on x86_32 is special in + * and around vm86 mode and sp0 on x86_64 is special because of the + * entry trampoline. + */ return this_cpu_read_stable(cpu_current_top_of_stack); -#endif +} + +static inline bool on_thread_stack(void) +{ + return (unsigned long)(current_top_of_stack() - + current_stack_pointer()) < THREAD_SIZE; } #ifdef CONFIG_PARAVIRT @@ -544,10 +562,9 @@ #else #define __cpuid native_cpuid -static inline void load_sp0(struct tss_struct *tss, - struct thread_struct *thread) +static inline void load_sp0(unsigned long sp0) { - native_load_sp0(tss, thread); + native_load_sp0(sp0); } #define set_iopl_mask native_set_iopl_mask @@ -670,8 +687,6 @@ * Like all of Linux's memory ordering operations, this is a * compiler barrier as well. */ - register void *__sp asm(_ASM_SP); - #ifdef CONFIG_X86_32 asm volatile ( "pushfl\n\t" @@ -679,7 +694,7 @@ "pushl $1f\n\t" "iret\n\t" "1:" - : "+r" (__sp) : : "memory"); + : ASM_CALL_CONSTRAINT : : "memory"); #else unsigned int tmp; @@ -694,7 +709,7 @@ "pushq $1f\n\t" "iretq\n\t" "1:" - : "=&r" (tmp), "+r" (__sp) : : "cc", "memory"); + : "=&r" (tmp), ASM_CALL_CONSTRAINT : : "cc", "memory"); #endif } @@ -796,6 +811,15 @@ #define TOP_OF_INIT_STACK ((unsigned long)&init_stack + sizeof(init_stack) - \ TOP_OF_KERNEL_STACK_PADDING) +#define task_top_of_stack(task) ((unsigned long)(task_pt_regs(task) + 1)) + +#define task_pt_regs(task) \ +({ \ + unsigned long __ptr = (unsigned long)task_stack_page(task); \ + __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ + ((struct pt_regs *)__ptr) - 1; \ +}) + #ifdef CONFIG_X86_32 /* * User space process size: 3GB (default). @@ -813,34 +837,26 @@ .addr_limit = KERNEL_DS, \ } -/* - * TOP_OF_KERNEL_STACK_PADDING reserves 8 bytes on top of the ring0 stack. 
- * This is necessary to guarantee that the entire "struct pt_regs" - * is accessible even if the CPU haven't stored the SS/ESP registers - * on the stack (interrupt gate does not save these registers - * when switching to the same priv ring). - * Therefore beware: accessing the ss/esp fields of the - * "struct pt_regs" is possible, but they may contain the - * completely wrong values. - */ -#define task_pt_regs(task) \ -({ \ - unsigned long __ptr = (unsigned long)task_stack_page(task); \ - __ptr += THREAD_SIZE - TOP_OF_KERNEL_STACK_PADDING; \ - ((struct pt_regs *)__ptr) - 1; \ -}) - #define KSTK_ESP(task) (task_pt_regs(task)->sp) #else /* - * User space process size. 47bits minus one guard page. The guard - * page is necessary on Intel CPUs: if a SYSCALL instruction is at - * the highest possible canonical userspace address, then that - * syscall will enter the kernel with a non-canonical return - * address, and SYSRET will explode dangerously. We avoid this - * particular problem by preventing anything from being mapped - * at the maximum canonical address. + * User space process size. This is the first address outside the user range. + * There are a few constraints that determine this: + * + * On Intel CPUs, if a SYSCALL instruction is at the highest canonical + * address, then that syscall will enter the kernel with a + * non-canonical return address, and SYSRET will explode dangerously. + * We avoid this particular problem by preventing anything executable + * from being mapped at the maximum canonical address. + * + * On AMD CPUs in the Ryzen family, there's a nasty bug in which the + * CPUs malfunction if they execute code from the highest canonical page. + * They'll speculate right off the end of the canonical space, and + * bad things happen. This is worked around in the same way as the + * Intel problem. + * + * With page table isolation enabled, we map the LDT in ... [stay tuned] */ #define TASK_SIZE_MAX ((1UL << 47) - PAGE_SIZE) @@ -859,11 +875,9 @@ #define STACK_TOP_MAX TASK_SIZE_MAX #define INIT_THREAD { \ - .sp0 = TOP_OF_INIT_STACK, \ .addr_limit = KERNEL_DS, \ } -#define task_pt_regs(tsk) ((struct pt_regs *)(tsk)->thread.sp0 - 1) extern unsigned long KSTK_ESP(struct task_struct *task); #endif /* CONFIG_X86_64 */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/proto.h +++ linux-azure-4.13.0/arch/x86/include/asm/proto.h @@ -24,6 +24,9 @@ void __end_entry_SYSENTER_compat(void); void entry_SYSCALL_compat(void); void entry_INT80_compat(void); +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +void xen_entry_INT80_compat(void); +#endif #endif void x86_configure_nx(void); --- linux-azure-4.13.0.orig/arch/x86/include/asm/pti.h +++ linux-azure-4.13.0/arch/x86/include/asm/pti.h @@ -0,0 +1,14 @@ +// SPDX-License-Identifier: GPL-2.0 +#ifndef _ASM_X86_PTI_H +#define _ASM_X86_PTI_H +#ifndef __ASSEMBLY__ + +#ifdef CONFIG_PAGE_TABLE_ISOLATION +extern void pti_init(void); +extern void pti_check_boottime_disable(void); +#else +static inline void pti_check_boottime_disable(void) { } +#endif + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_PTI_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/ptrace.h +++ linux-azure-4.13.0/arch/x86/include/asm/ptrace.h @@ -115,9 +115,9 @@ #endif } -#ifdef CONFIG_X86_64 static inline bool user_64bit_mode(struct pt_regs *regs) { +#ifdef CONFIG_X86_64 #ifndef CONFIG_PARAVIRT /* * On non-paravirt systems, this is the only long mode CPL 3 @@ -128,8 +128,12 @@ /* Headers are too twisted for this to go in paravirt.h. 
*/ return regs->cs == __USER_CS || regs->cs == pv_info.extra_user_64bit_cs; #endif +#else /* !CONFIG_X86_64 */ + return false; +#endif } +#ifdef CONFIG_X86_64 #define current_user_stack_pointer() current_pt_regs()->sp #define compat_user_stack_pointer() current_pt_regs()->sp #endif --- linux-azure-4.13.0.orig/arch/x86/include/asm/rmwcc.h +++ linux-azure-4.13.0/arch/x86/include/asm/rmwcc.h @@ -28,7 +28,7 @@ #define __GEN_RMWcc(fullop, var, cc, ...) \ do { \ bool c; \ - asm volatile (fullop ";" CC_SET(cc) \ + asm volatile (fullop CC_SET(cc) \ : "+m" (var), CC_OUT(cc) (c) \ : __VA_ARGS__ : "memory"); \ return c; \ --- linux-azure-4.13.0.orig/arch/x86/include/asm/rwsem.h +++ linux-azure-4.13.0/arch/x86/include/asm/rwsem.h @@ -103,7 +103,6 @@ ({ \ long tmp; \ struct rw_semaphore* ret; \ - register void *__sp asm(_ASM_SP); \ \ asm volatile("# beginning down_write\n\t" \ LOCK_PREFIX " xadd %1,(%4)\n\t" \ @@ -114,7 +113,8 @@ " call " slow_path "\n" \ "1:\n" \ "# ending down_write" \ - : "+m" (sem->count), "=d" (tmp), "=a" (ret), "+r" (__sp) \ + : "+m" (sem->count), "=d" (tmp), \ + "=a" (ret), ASM_CALL_CONSTRAINT \ : "a" (sem), "1" (RWSEM_ACTIVE_WRITE_BIAS) \ : "memory", "cc"); \ ret; \ --- linux-azure-4.13.0.orig/arch/x86/include/asm/spec_ctrl.h +++ linux-azure-4.13.0/arch/x86/include/asm/spec_ctrl.h @@ -0,0 +1,141 @@ +#ifndef _ASM_X86_SPEC_CTRL_H +#define _ASM_X86_SPEC_CTRL_H + +#include +#include +#include +#include + +#ifdef __ASSEMBLY__ + +.extern use_ibrs +.extern use_ibpb + +#define __ASM_ENABLE_IBRS \ + pushq %rax; \ + pushq %rcx; \ + pushq %rdx; \ + movl $MSR_IA32_SPEC_CTRL, %ecx; \ + movl $0, %edx; \ + movl $FEATURE_ENABLE_IBRS, %eax; \ + wrmsr; \ + popq %rdx; \ + popq %rcx; \ + popq %rax +#define __ASM_ENABLE_IBRS_CLOBBER \ + movl $MSR_IA32_SPEC_CTRL, %ecx; \ + movl $0, %edx; \ + movl $FEATURE_ENABLE_IBRS, %eax; \ + wrmsr; +#define __ASM_DISABLE_IBRS \ + pushq %rax; \ + pushq %rcx; \ + pushq %rdx; \ + movl $MSR_IA32_SPEC_CTRL, %ecx; \ + movl $0, %edx; \ + movl $0, %eax; \ + wrmsr; \ + popq %rdx; \ + popq %rcx; \ + popq %rax +#define __ASM_STUFF_RSB \ + call 1f; \ + pause; \ +1: call 2f; \ + pause; \ +2: call 3f; \ + pause; \ +3: call 4f; \ + pause; \ +4: call 5f; \ + pause; \ +5: call 6f; \ + pause; \ +6: call 7f; \ + pause; \ +7: call 8f; \ + pause; \ +8: call 9f; \ + pause; \ +9: call 10f; \ + pause; \ +10: call 11f; \ + pause; \ +11: call 12f; \ + pause; \ +12: call 13f; \ + pause; \ +13: call 14f; \ + pause; \ +14: call 15f; \ + pause; \ +15: call 16f; \ + pause; \ +16: call 17f; \ + pause; \ +17: call 18f; \ + pause; \ +18: call 19f; \ + pause; \ +19: call 20f; \ + pause; \ +20: call 21f; \ + pause; \ +21: call 22f; \ + pause; \ +22: call 23f; \ + pause; \ +23: call 24f; \ + pause; \ +24: call 25f; \ + pause; \ +25: call 26f; \ + pause; \ +26: call 27f; \ + pause; \ +27: call 28f; \ + pause; \ +28: call 29f; \ + pause; \ +29: call 30f; \ + pause; \ +30: call 31f; \ + pause; \ +31: call 32f; \ + pause; \ +32: \ + add $(32*8), %rsp; + +.macro ENABLE_IBRS + testl $1, use_ibrs + jz 10f + __ASM_ENABLE_IBRS + jmp 20f +10: + lfence +20: +.endm + +.macro ENABLE_IBRS_CLOBBER + testl $1, use_ibrs + jz 11f + __ASM_ENABLE_IBRS_CLOBBER + jmp 21f +11: + lfence +21: +.endm + +.macro DISABLE_IBRS + testl $1, use_ibrs + jz 9f + __ASM_DISABLE_IBRS +9: +.endm + +.macro STUFF_RSB +ALTERNATIVE __stringify(__ASM_STUFF_RSB), "", X86_FEATURE_SMEP +.endm + +#endif /* __ASSEMBLY__ */ +#endif /* _ASM_X86_SPEC_CTRL_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/special_insns.h +++ 
linux-azure-4.13.0/arch/x86/include/asm/special_insns.h @@ -135,6 +135,11 @@ extern asmlinkage void native_load_gs_index(unsigned); +static inline unsigned long __read_cr4(void) +{ + return native_read_cr4(); +} + #ifdef CONFIG_PARAVIRT #include #else @@ -173,11 +178,6 @@ native_write_cr3(x); } -static inline unsigned long __read_cr4(void) -{ - return native_read_cr4(); -} - static inline void __write_cr4(unsigned long x) { native_write_cr4(x); --- linux-azure-4.13.0.orig/arch/x86/include/asm/stacktrace.h +++ linux-azure-4.13.0/arch/x86/include/asm/stacktrace.h @@ -15,6 +15,7 @@ STACK_TYPE_TASK, STACK_TYPE_IRQ, STACK_TYPE_SOFTIRQ, + STACK_TYPE_ENTRY, STACK_TYPE_EXCEPTION, STACK_TYPE_EXCEPTION_LAST = STACK_TYPE_EXCEPTION + N_EXCEPTION_STACKS-1, }; @@ -27,6 +28,8 @@ bool in_task_stack(unsigned long *stack, struct task_struct *task, struct stack_info *info); +bool in_entry_stack(unsigned long *stack, struct stack_info *info); + int get_stack_info(unsigned long *stack, struct task_struct *task, struct stack_info *info, unsigned long *visit_mask); --- linux-azure-4.13.0.orig/arch/x86/include/asm/switch_to.h +++ linux-azure-4.13.0/arch/x86/include/asm/switch_to.h @@ -1,6 +1,8 @@ #ifndef _ASM_X86_SWITCH_TO_H #define _ASM_X86_SWITCH_TO_H +#include + struct task_struct; /* one of the stranger aspects of C forward declarations */ struct task_struct *__switch_to_asm(struct task_struct *prev, @@ -72,4 +74,28 @@ ((last) = __switch_to_asm((prev), (next))); \ } while (0) +#ifdef CONFIG_X86_32 +static inline void refresh_sysenter_cs(struct thread_struct *thread) +{ + /* Only happens when SEP is enabled, no need to test "SEP"arately: */ + if (unlikely(this_cpu_read(cpu_tss_rw.x86_tss.ss1) == thread->sysenter_cs)) + return; + + this_cpu_write(cpu_tss_rw.x86_tss.ss1, thread->sysenter_cs); + wrmsr(MSR_IA32_SYSENTER_CS, thread->sysenter_cs, 0); +} +#endif + +/* This is used when switching tasks or entering/exiting vm86 mode. 
*/ +static inline void update_sp0(struct task_struct *task) +{ + /* On x86_64, sp0 always points to the entry trampoline stack, which is constant: */ +#ifdef CONFIG_X86_32 + load_sp0(task->thread.sp0); +#else + if (static_cpu_has(X86_FEATURE_XENPV)) + load_sp0(task_top_of_stack(task)); +#endif +} + #endif /* _ASM_X86_SWITCH_TO_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/syscalls.h +++ linux-azure-4.13.0/arch/x86/include/asm/syscalls.h @@ -21,7 +21,7 @@ asmlinkage long sys_iopl(unsigned int); /* kernel/ldt.c */ -asmlinkage int sys_modify_ldt(int, void __user *, unsigned long); +asmlinkage long sys_modify_ldt(int, void __user *, unsigned long); /* kernel/signal.c */ asmlinkage long sys_rt_sigreturn(void); --- linux-azure-4.13.0.orig/arch/x86/include/asm/thread_info.h +++ linux-azure-4.13.0/arch/x86/include/asm/thread_info.h @@ -48,6 +48,17 @@ * - this struct shares the supervisor stack pages */ #ifndef __ASSEMBLY__ +static inline unsigned long current_stack_pointer(void) +{ + unsigned long sp; +#ifdef CONFIG_X86_64 + asm("mov %%rsp,%0" : "=g" (sp)); +#else + asm("mov %%esp,%0" : "=g" (sp)); +#endif + return sp; +} + struct task_struct; #include #include @@ -98,6 +109,7 @@ #define TIF_SYSCALL_TRACEPOINT 28 /* syscall tracepoint instrumentation */ #define TIF_ADDR32 29 /* 32-bit address space on 64 bits */ #define TIF_X32 30 /* 32-bit native x86-64 binary */ +#define TIF_FSCHECK 31 /* Check FS is USER_DS on return */ #define _TIF_SYSCALL_TRACE (1 << TIF_SYSCALL_TRACE) #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) @@ -122,6 +134,7 @@ #define _TIF_SYSCALL_TRACEPOINT (1 << TIF_SYSCALL_TRACEPOINT) #define _TIF_ADDR32 (1 << TIF_ADDR32) #define _TIF_X32 (1 << TIF_X32) +#define _TIF_FSCHECK (1 << TIF_FSCHECK) /* * work to do in syscall_trace_enter(). Also includes TIF_NOHZ for @@ -137,7 +150,8 @@ (_TIF_SYSCALL_TRACE | _TIF_NOTIFY_RESUME | _TIF_SIGPENDING | \ _TIF_NEED_RESCHED | _TIF_SINGLESTEP | _TIF_SYSCALL_EMU | \ _TIF_SYSCALL_AUDIT | _TIF_USER_RETURN_NOTIFY | _TIF_UPROBE | \ - _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT) + _TIF_PATCH_PENDING | _TIF_NOHZ | _TIF_SYSCALL_TRACEPOINT | \ + _TIF_FSCHECK) /* flags to check in __switch_to() */ #define _TIF_WORK_CTXSW \ @@ -155,17 +169,6 @@ */ #ifndef __ASSEMBLY__ -static inline unsigned long current_stack_pointer(void) -{ - unsigned long sp; -#ifdef CONFIG_X86_64 - asm("mov %%rsp,%0" : "=g" (sp)); -#else - asm("mov %%esp,%0" : "=g" (sp)); -#endif - return sp; -} - /* * Walks up the stack frames to make sure that the specified object is * entirely contained by a single stack frame. @@ -214,7 +217,7 @@ #else /* !__ASSEMBLY__ */ #ifdef CONFIG_X86_64 -# define cpu_current_top_of_stack (cpu_tss + TSS_sp0) +# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1) #endif #endif --- linux-azure-4.13.0.orig/arch/x86/include/asm/tlbflush.h +++ linux-azure-4.13.0/arch/x86/include/asm/tlbflush.h @@ -8,53 +8,130 @@ #include #include #include +#include +#include +#include -static inline void __invpcid(unsigned long pcid, unsigned long addr, - unsigned long type) +/* + * The x86 feature is called PCID (Process Context IDentifier). It is similar + * to what is traditionally called ASID on the RISC processors. + * + * We don't use the traditional ASID implementation, where each process/mm gets + * its own ASID and flush/restart when we run out of ASID space. + * + * Instead we have a small per-cpu array of ASIDs and cache the last few mm's + * that came by on this CPU, allowing cheaper switch_mm between processes on + * this CPU. 
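
The per-CPU ASID cache described in this comment is filled lazily: on a context switch, look for the incoming mm among the cached contexts; otherwise evict one and mark it for a flush. A hypothetical userspace model of that policy (the real selection logic lives in arch/x86/mm/tlb.c, which is not part of this hunk):

    #include <stdint.h>
    #include <stdio.h>

    #define TLB_NR_DYN_ASIDS    6

    struct tlb_context {
            uint64_t ctx_id;    /* which mm this ASID's entries came from */
            uint64_t tlb_gen;
    };

    static struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
    static uint16_t next_asid;

    static uint16_t choose_asid(uint64_t ctx_id, int *need_flush)
    {
            uint16_t asid;

            for (asid = 0; asid < TLB_NR_DYN_ASIDS; asid++) {
                    if (ctxs[asid].ctx_id == ctx_id) {
                            *need_flush = 0;        /* cached: cheap switch */
                            return asid;
                    }
            }
            /* Miss: recycle an ASID round-robin; its old entries are stale. */
            *need_flush = 1;
            asid = next_asid;
            next_asid = (next_asid + 1) % TLB_NR_DYN_ASIDS;
            ctxs[asid].ctx_id = ctx_id;
            return asid;
    }

    int main(void)
    {
            int flush;
            uint16_t a = choose_asid(42, &flush);

            printf("mm 42: asid %u, flush %d\n", a, flush);
            a = choose_asid(42, &flush);
            printf("mm 42: asid %u, flush %d\n", a, flush);
            return 0;
    }
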
+ * + * We end up with different spaces for different things. To avoid confusion we + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] + * the canonical identifier for an mm + * + * kPCID - [1, TLB_NR_DYN_ASIDS] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * + * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. + * + */ + +/* There are 12 bits of space for ASIDS in CR3 */ +#define CR3_HW_ASID_BITS 12 + +/* + * When enabled, PAGE_TABLE_ISOLATION consumes a single bit for + * user/kernel switches + */ +#ifdef CONFIG_PAGE_TABLE_ISOLATION +# define PTI_CONSUMED_PCID_BITS 1 +#else +# define PTI_CONSUMED_PCID_BITS 0 +#endif + +#define CR3_AVAIL_PCID_BITS (X86_CR3_PCID_BITS - PTI_CONSUMED_PCID_BITS) + +/* + * ASIDs are zero-based: 0->MAX_AVAIL_ASID are valid. -1 below to account + * for them being zero-based. Another -1 is because PCID 0 is reserved for + * use by non-PCID-aware users. + */ +#define MAX_ASID_AVAILABLE ((1 << CR3_AVAIL_PCID_BITS) - 2) + +/* + * 6 because 6 should be plenty and struct tlb_state will fit in two cache + * lines. + */ +#define TLB_NR_DYN_ASIDS 6 + +/* + * Given @asid, compute kPCID + */ +static inline u16 kern_pcid(u16 asid) { - struct { u64 d[2]; } desc = { { pcid, addr } }; + VM_WARN_ON_ONCE(asid > MAX_ASID_AVAILABLE); +#ifdef CONFIG_PAGE_TABLE_ISOLATION /* - * The memory clobber is because the whole point is to invalidate - * stale TLB entries and, especially if we're flushing global - * mappings, we don't want the compiler to reorder any subsequent - * memory accesses before the TLB flush. - * - * The hex opcode is invpcid (%ecx), %eax in 32-bit mode and - * invpcid (%rcx), %rax in long mode. + * Make sure that the dynamic ASID space does not confict with the + * bit we are using to switch between user and kernel ASIDs. */ - asm volatile (".byte 0x66, 0x0f, 0x38, 0x82, 0x01" - : : "m" (desc), "a" (type), "c" (&desc) : "memory"); -} - -#define INVPCID_TYPE_INDIV_ADDR 0 -#define INVPCID_TYPE_SINGLE_CTXT 1 -#define INVPCID_TYPE_ALL_INCL_GLOBAL 2 -#define INVPCID_TYPE_ALL_NON_GLOBAL 3 + BUILD_BUG_ON(TLB_NR_DYN_ASIDS >= (1 << X86_CR3_PTI_PCID_USER_BIT)); -/* Flush all mappings for a given pcid and addr, not including globals. */ -static inline void invpcid_flush_one(unsigned long pcid, - unsigned long addr) -{ - __invpcid(pcid, addr, INVPCID_TYPE_INDIV_ADDR); + /* + * The ASID being passed in here should have respected the + * MAX_ASID_AVAILABLE and thus never have the switch bit set. + */ + VM_WARN_ON_ONCE(asid & (1 << X86_CR3_PTI_PCID_USER_BIT)); +#endif + /* + * The dynamically-assigned ASIDs that get passed in are small + * ( MAX_ASID_AVAILABLE); + VM_WARN_ON_ONCE(!this_cpu_has(X86_FEATURE_PCID)); + return __pa(pgd) | kern_pcid(asid) | CR3_NOFLUSH; } #ifdef CONFIG_PARAVIRT @@ -65,6 +142,18 @@ #define __flush_tlb_single(addr) __native_flush_tlb_single(addr) #endif +/* + * If tlb_use_lazy_mode is true, then we try to avoid switching CR3 to point + * to init_mm when we switch to a kernel thread (e.g. the idle thread). If + * it's false, then we immediately switch CR3 when entering a kernel thread. 
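
The numbering above is trivial arithmetic: kPCID is ASID+1, and uPCID additionally sets bit 11 (X86_CR3_PTI_PCID_USER_BIT), i.e. kPCID + 2048. A standalone sketch of just that arithmetic (the function bodies follow the comment's description rather than being copied from the hunk):

    #include <stdint.h>
    #include <stdio.h>

    #define X86_CR3_PTI_PCID_USER_BIT   11

    static uint16_t kern_pcid(uint16_t asid)
    {
            return asid + 1;    /* PCID 0 is reserved, so kPCID = ASID + 1 */
    }

    static uint16_t user_pcid(uint16_t asid)
    {
            /* Setting bit 11 lands in the user range: kPCID + 2048. */
            return kern_pcid(asid) | (1u << X86_CR3_PTI_PCID_USER_BIT);
    }

    int main(void)
    {
            uint16_t asid;

            for (asid = 0; asid < 3; asid++)
                    printf("ASID %u -> kPCID %u, uPCID %u\n",
                           asid, kern_pcid(asid), user_pcid(asid));
            return 0;
    }
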
+ */ +DECLARE_STATIC_KEY_TRUE(tlb_use_lazy_mode); + +struct tlb_context { + u64 ctx_id; + u64 tlb_gen; +}; + struct tlb_state { /* * cpu_tlbstate.loaded_mm should match CR3 whenever interrupts @@ -73,13 +162,72 @@ * mode even if we've already switched back to swapper_pg_dir. */ struct mm_struct *loaded_mm; - int state; + u16 loaded_mm_asid; + u16 next_asid; + /* last user mm's ctx id */ + u64 last_ctx_id; + + /* + * We can be in one of several states: + * + * - Actively using an mm. Our CPU's bit will be set in + * mm_cpumask(loaded_mm) and is_lazy == false; + * + * - Not using a real mm. loaded_mm == &init_mm. Our CPU's bit + * will not be set in mm_cpumask(&init_mm) and is_lazy == false. + * + * - Lazily using a real mm. loaded_mm != &init_mm, our bit + * is set in mm_cpumask(loaded_mm), but is_lazy == true. + * We're heuristically guessing that the CR3 load we + * skipped more than makes up for the overhead added by + * lazy mode. + */ + bool is_lazy; + + /* + * If set we changed the page tables in such a way that we + * needed an invalidation of all contexts (aka. PCIDs / ASIDs). + * This tells us to go invalidate all the non-loaded ctxs[] + * on the next context switch. + * + * The current ctx was kept up-to-date as it ran and does not + * need to be invalidated. + */ + bool invalidate_other; + + /* + * Mask that contains TLB_NR_DYN_ASIDS+1 bits to indicate + * the corresponding user PCID needs a flush next time we + * switch to it; see SWITCH_TO_USER_CR3. + */ + unsigned short user_pcid_flush_mask; /* * Access to this CR4 shadow and to H/W CR4 is protected by * disabling interrupts when modifying either one. */ unsigned long cr4; + + /* + * This is a list of all contexts that might exist in the TLB. + * There is one per ASID that we use, and the ASID (what the + * CPU calls PCID) is the index into ctxts. + * + * For each context, ctx_id indicates which mm the TLB's user + * entries came from. As an invariant, the TLB will never + * contain entries that are out-of-date as when that mm reached + * the tlb_gen in the list. + * + * To be clear, this means that it's legal for the TLB code to + * flush the TLB without updating tlb_gen. This can happen + * (for now, at least) due to paravirt remote flushes. + * + * NB: context 0 is a bit special, since it's also used by + * various bits of init code. This is fine -- code that + * isn't aware of PCID will end up harmlessly flushing + * context 0. + */ + struct tlb_context ctxs[TLB_NR_DYN_ASIDS]; }; DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate); @@ -132,6 +280,14 @@ } /* + * Mark all other ASIDs as invalid, preserves the current. + */ +static inline void invalidate_other_asid(void) +{ + this_cpu_write(cpu_tlbstate.invalidate_other, true); +} + +/* * Save some of cr4 feature set we're using (e.g. Pentium 4MB * enable and PPro Global page enable), so that any CPU's that boot * up after us can get the correct flags. This should only be used @@ -148,37 +304,66 @@ cr4_set_bits(mask); } -static inline void __native_flush_tlb(void) + +/* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. + * + * See SWITCH_TO_USER_CR3. 
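
invalidate_user_asid(), defined a little further down, only records the stale PCID in user_pcid_flush_mask; the flush itself happens the next time the entry code switches to that user CR3 (see SWITCH_TO_USER_CR3). A toy model of that record-now, flush-later protocol (the test-and-clear side is really done in assembly on the exit path, so the C here is purely illustrative):

    #include <stdio.h>

    static unsigned short user_pcid_flush_mask;

    static unsigned kern_pcid(unsigned asid)
    {
            return asid + 1;
    }

    /* Deferred invalidation: just note that the user PCID is stale. */
    static void invalidate_user_asid(unsigned asid)
    {
            user_pcid_flush_mask |= 1u << kern_pcid(asid);
    }

    /* Later, when switching to that user CR3: test and clear the note. */
    static int user_cr3_needs_flush(unsigned asid)
    {
            unsigned bit = 1u << kern_pcid(asid);
            int need = !!(user_pcid_flush_mask & bit);

            user_pcid_flush_mask &= ~bit;
            return need;
    }

    int main(void)
    {
            invalidate_user_asid(2);
            printf("switch to asid 2: flush=%d\n", user_cr3_needs_flush(2));
            printf("switch to asid 2: flush=%d\n", user_cr3_needs_flush(2));
            return 0;
    }
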
+ */ +static inline void invalidate_user_asid(u16 asid) { + /* There is no user ASID if address space separation is off */ + if (!IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION)) + return; + /* - * If current->mm == NULL then we borrow a mm which may change during a - * task switch and therefore we must not be preempted while we write CR3 - * back: + * We only have a single ASID if PCID is off and the CR3 + * write will have flushed it. */ - preempt_disable(); - native_write_cr3(__native_read_cr3()); - preempt_enable(); + if (!cpu_feature_enabled(X86_FEATURE_PCID)) + return; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + __set_bit(kern_pcid(asid), + (unsigned long *)this_cpu_ptr(&cpu_tlbstate.user_pcid_flush_mask)); } -static inline void __native_flush_tlb_global_irq_disabled(void) +extern void initialize_tlbstate_and_flush(void); + +/* + * flush the entire current user mapping + */ +static inline void __native_flush_tlb(void) { - unsigned long cr4; + /* + * Preemption or interrupts must be disabled to protect the access + * to the per CPU variable and to prevent being preempted between + * read_cr3() and write_cr3(). + */ + WARN_ON_ONCE(preemptible()); - cr4 = this_cpu_read(cpu_tlbstate.cr4); - /* clear PGE */ - native_write_cr4(cr4 & ~X86_CR4_PGE); - /* write old PGE again and flush TLBs */ - native_write_cr4(cr4); + invalidate_user_asid(this_cpu_read(cpu_tlbstate.loaded_mm_asid)); + + /* If current->mm == NULL then the read_cr3() "borrows" an mm */ + native_write_cr3(__native_read_cr3()); } +/* + * flush everything + */ static inline void __native_flush_tlb_global(void) { - unsigned long flags; + unsigned long cr4, flags; if (static_cpu_has(X86_FEATURE_INVPCID)) { /* * Using INVPCID is considerably faster than a pair of writes * to CR4 sandwiched inside an IRQ flag save/restore. + * + * Note, this works with CR4.PCIDE=0 or 1. */ invpcid_flush_all(); return; @@ -191,28 +376,69 @@ */ raw_local_irq_save(flags); - __native_flush_tlb_global_irq_disabled(); + cr4 = this_cpu_read(cpu_tlbstate.cr4); + /* toggle PGE */ + native_write_cr4(cr4 ^ X86_CR4_PGE); + /* write old PGE again and flush TLBs */ + native_write_cr4(cr4); raw_local_irq_restore(flags); } +/* + * flush one page in the user mapping + */ static inline void __native_flush_tlb_single(unsigned long addr) { + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1. + * Just use invalidate_user_asid() in case we are called early. + */ + if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE)) + invalidate_user_asid(loaded_mm_asid); + else + invpcid_flush_one(user_pcid(loaded_mm_asid), addr); } +/* + * flush everything + */ static inline void __flush_tlb_all(void) { - if (boot_cpu_has(X86_FEATURE_PGE)) + if (boot_cpu_has(X86_FEATURE_PGE)) { __flush_tlb_global(); - else + } else { + /* + * !PGE -> !PCID (setup_pcid()), thus every flush is total. + */ __flush_tlb(); + } } +/* + * flush one page in the kernel mapping + */ static inline void __flush_tlb_one(unsigned long addr) { count_vm_tlb_event(NR_TLB_LOCAL_FLUSH_ONE); __flush_tlb_single(addr); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + /* + * __flush_tlb_single() will have cleared the TLB entry for this ASID, + * but since kernel space is replicated across all, we must also + * invalidate all others. 
+ */ + invalidate_other_asid(); } #define TLB_FLUSH_ALL -1UL @@ -231,9 +457,26 @@ * and page-granular flushes are available only on i486 and up. */ struct flush_tlb_info { - struct mm_struct *mm; - unsigned long start; - unsigned long end; + /* + * We support several kinds of flushes. + * + * - Fully flush a single mm. .mm will be set, .end will be + * TLB_FLUSH_ALL, and .new_tlb_gen will be the tlb_gen to + * which the IPI sender is trying to catch us up. + * + * - Partially flush a single mm. .mm will be set, .start and + * .end will indicate the range, and .new_tlb_gen will be set + * such that the changes between generation .new_tlb_gen-1 and + * .new_tlb_gen are entirely contained in the indicated range. + * + * - Fully flush all mms whose tlb_gens have been updated. .mm + * will be NULL, .end will be TLB_FLUSH_ALL, and .new_tlb_gen + * will be zero. + */ + struct mm_struct *mm; + unsigned long start; + unsigned long end; + u64 new_tlb_gen; }; #define local_flush_tlb() __flush_tlb() @@ -256,12 +499,21 @@ void native_flush_tlb_others(const struct cpumask *cpumask, const struct flush_tlb_info *info); -#define TLBSTATE_OK 1 -#define TLBSTATE_LAZY 2 +static inline u64 inc_mm_tlb_gen(struct mm_struct *mm) +{ + /* + * Bump the generation count. This also serves as a full barrier + * that synchronizes with switch_mm(): callers are required to order + * their read of mm_cpumask after their writes to the paging + * structures. + */ + return atomic64_inc_return(&mm->context.tlb_gen); +} static inline void arch_tlbbatch_add_mm(struct arch_tlbflush_unmap_batch *batch, struct mm_struct *mm) { + inc_mm_tlb_gen(mm); cpumask_or(&batch->cpumask, &batch->cpumask, mm_cpumask(mm)); } --- linux-azure-4.13.0.orig/arch/x86/include/asm/trace/common.h +++ linux-azure-4.13.0/arch/x86/include/asm/trace/common.h @@ -0,0 +1,15 @@ +#ifndef _ASM_TRACE_COMMON_H +#define _ASM_TRACE_COMMON_H + +extern int trace_irq_vector_regfunc(void); +extern void trace_irq_vector_unregfunc(void); + +#ifdef CONFIG_TRACING +DECLARE_STATIC_KEY_FALSE(trace_irqvectors_key); +#define trace_irqvectors_enabled() \ + static_branch_unlikely(&trace_irqvectors_key) +#else +static inline bool trace_irqvectors_enabled(void) { return false; } +#endif + +#endif --- linux-azure-4.13.0.orig/arch/x86/include/asm/trace/exceptions.h +++ linux-azure-4.13.0/arch/x86/include/asm/trace/exceptions.h @@ -5,9 +5,7 @@ #define _TRACE_PAGE_FAULT_H #include - -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#include DECLARE_EVENT_CLASS(x86_exceptions, --- linux-azure-4.13.0.orig/arch/x86/include/asm/trace/fpu.h +++ linux-azure-4.13.0/arch/x86/include/asm/trace/fpu.h @@ -36,11 +36,6 @@ ) ); -DEFINE_EVENT(x86_fpu, x86_fpu_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); - DEFINE_EVENT(x86_fpu, x86_fpu_before_save, TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) @@ -75,11 +70,6 @@ TP_PROTO(struct fpu *fpu), TP_ARGS(fpu) ); - -DEFINE_EVENT(x86_fpu, x86_fpu_deactivate_state, - TP_PROTO(struct fpu *fpu), - TP_ARGS(fpu) -); DEFINE_EVENT(x86_fpu, x86_fpu_init_state, TP_PROTO(struct fpu *fpu), --- linux-azure-4.13.0.orig/arch/x86/include/asm/trace/hyperv.h +++ linux-azure-4.13.0/arch/x86/include/asm/trace/hyperv.h @@ -0,0 +1,40 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask *cpus, + const struct flush_tlb_info 
*info), + TP_ARGS(cpus, info), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = info->mm; + __entry->addr = info->start; + __entry->end = info->end; + ), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, + __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH asm/trace/ +#undef TRACE_INCLUDE_FILE +#define TRACE_INCLUDE_FILE hyperv +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include --- linux-azure-4.13.0.orig/arch/x86/include/asm/trace/irq_vectors.h +++ linux-azure-4.13.0/arch/x86/include/asm/trace/irq_vectors.h @@ -5,9 +5,7 @@ #define _TRACE_IRQ_VECTORS_H #include - -extern int trace_irq_vector_regfunc(void); -extern void trace_irq_vector_unregfunc(void); +#include DECLARE_EVENT_CLASS(x86_irq_vector, @@ -69,7 +67,7 @@ * irq_work - called when entering/exiting a irq work interrupt * vector handler */ -DEFINE_IRQ_VECTOR_EVENT(irq_work); +// DEFINE_IRQ_VECTOR_EVENT(irq_work); /* * We must dis-allow sampling irq_work_exit() because perf event sampling --- linux-azure-4.13.0.orig/arch/x86/include/asm/traps.h +++ linux-azure-4.13.0/arch/x86/include/asm/traps.h @@ -13,9 +13,6 @@ asmlinkage void debug(void); asmlinkage void nmi(void); asmlinkage void int3(void); -asmlinkage void xen_debug(void); -asmlinkage void xen_int3(void); -asmlinkage void xen_stack_segment(void); asmlinkage void overflow(void); asmlinkage void bounds(void); asmlinkage void invalid_op(void); @@ -39,7 +36,6 @@ asmlinkage void simd_coprocessor_error(void); #ifdef CONFIG_TRACING -asmlinkage void trace_page_fault(void); #define trace_stack_segment stack_segment #define trace_divide_error divide_error #define trace_bounds bounds @@ -54,6 +50,32 @@ #define trace_alignment_check alignment_check #define trace_simd_coprocessor_error simd_coprocessor_error #define trace_async_page_fault async_page_fault +#define trace_page_fault page_fault +#endif + +#if defined(CONFIG_X86_64) && defined(CONFIG_XEN_PV) +asmlinkage void xen_divide_error(void); +asmlinkage void xen_xennmi(void); +asmlinkage void xen_xendebug(void); +asmlinkage void xen_xenint3(void); +asmlinkage void xen_overflow(void); +asmlinkage void xen_bounds(void); +asmlinkage void xen_invalid_op(void); +asmlinkage void xen_device_not_available(void); +asmlinkage void xen_double_fault(void); +asmlinkage void xen_coprocessor_segment_overrun(void); +asmlinkage void xen_invalid_TSS(void); +asmlinkage void xen_segment_not_present(void); +asmlinkage void xen_stack_segment(void); +asmlinkage void xen_general_protection(void); +asmlinkage void xen_page_fault(void); +asmlinkage void xen_spurious_interrupt_bug(void); +asmlinkage void xen_coprocessor_error(void); +asmlinkage void xen_alignment_check(void); +#ifdef CONFIG_X86_MCE +asmlinkage void xen_machine_check(void); +#endif /* CONFIG_X86_MCE */ +asmlinkage void xen_simd_coprocessor_error(void); #endif dotraplinkage void do_divide_error(struct pt_regs *, long); @@ -70,18 +92,9 @@ dotraplinkage void do_stack_segment(struct pt_regs *, long); #ifdef CONFIG_X86_64 dotraplinkage void do_double_fault(struct pt_regs *, long); -asmlinkage struct pt_regs *sync_regs(struct pt_regs *); #endif dotraplinkage void do_general_protection(struct pt_regs *, long); dotraplinkage void do_page_fault(struct pt_regs *, unsigned long); -#ifdef CONFIG_TRACING 
-dotraplinkage void trace_do_page_fault(struct pt_regs *, unsigned long); -#else -static inline void trace_do_page_fault(struct pt_regs *regs, unsigned long error) -{ - do_page_fault(regs, error); -} -#endif dotraplinkage void do_spurious_interrupt_bug(struct pt_regs *, long); dotraplinkage void do_coprocessor_error(struct pt_regs *, long); dotraplinkage void do_alignment_check(struct pt_regs *, long); @@ -148,4 +161,22 @@ X86_TRAP_IRET = 32, /* 32, IRET Exception */ }; +/* + * Page fault error code bits: + * + * bit 0 == 0: no page found 1: protection fault + * bit 1 == 0: read access 1: write access + * bit 2 == 0: kernel-mode access 1: user-mode access + * bit 3 == 1: use of reserved bit detected + * bit 4 == 1: fault was an instruction fetch + * bit 5 == 1: protection keys block access + */ +enum x86_pf_error_code { + X86_PF_PROT = 1 << 0, + X86_PF_WRITE = 1 << 1, + X86_PF_USER = 1 << 2, + X86_PF_RSVD = 1 << 3, + X86_PF_INSTR = 1 << 4, + X86_PF_PK = 1 << 5, +}; #endif /* _ASM_X86_TRAPS_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/uaccess.h +++ linux-azure-4.13.0/arch/x86/include/asm/uaccess.h @@ -26,7 +26,12 @@ #define get_ds() (KERNEL_DS) #define get_fs() (current->thread.addr_limit) -#define set_fs(x) (current->thread.addr_limit = (x)) +static inline void set_fs(mm_segment_t fs) +{ + current->thread.addr_limit = fs; + /* On user-mode return, check fs is correct */ + set_thread_flag(TIF_FSCHECK); +} #define segment_eq(a, b) ((a).seg == (b).seg) @@ -161,11 +166,11 @@ ({ \ int __ret_gu; \ register __inttype(*(ptr)) __val_gu asm("%"_ASM_DX); \ - register void *__sp asm(_ASM_SP); \ __chk_user_ptr(ptr); \ might_fault(); \ asm volatile("call __get_user_%P4" \ - : "=a" (__ret_gu), "=r" (__val_gu), "+r" (__sp) \ + : "=a" (__ret_gu), "=r" (__val_gu), \ + ASM_CALL_CONSTRAINT \ : "0" (ptr), "i" (sizeof(*(ptr)))); \ (x) = (__force __typeof__(*(ptr))) __val_gu; \ __builtin_expect(__ret_gu, 0); \ --- linux-azure-4.13.0.orig/arch/x86/include/asm/unwind.h +++ linux-azure-4.13.0/arch/x86/include/asm/unwind.h @@ -6,17 +6,23 @@ #include #include +#define IRET_FRAME_OFFSET (offsetof(struct pt_regs, ip)) +#define IRET_FRAME_SIZE (sizeof(struct pt_regs) - IRET_FRAME_OFFSET) + struct unwind_state { struct stack_info stack_info; unsigned long stack_mask; struct task_struct *task; int graph_idx; bool error; -#ifdef CONFIG_FRAME_POINTER +#if defined(CONFIG_UNWINDER_ORC) + bool signal, full_regs; + unsigned long sp, bp, ip; + struct pt_regs *regs; +#elif defined(CONFIG_UNWINDER_FRAME_POINTER) bool got_irq; - unsigned long *bp, *orig_sp; + unsigned long *bp, *orig_sp, ip; struct pt_regs *regs; - unsigned long ip; #else unsigned long *sp; #endif @@ -24,62 +30,90 @@ void __unwind_start(struct unwind_state *state, struct task_struct *task, struct pt_regs *regs, unsigned long *first_frame); - bool unwind_next_frame(struct unwind_state *state); - unsigned long unwind_get_return_address(struct unwind_state *state); +unsigned long *unwind_get_return_address_ptr(struct unwind_state *state); static inline bool unwind_done(struct unwind_state *state) { return state->stack_info.type == STACK_TYPE_UNKNOWN; } -static inline -void unwind_start(struct unwind_state *state, struct task_struct *task, - struct pt_regs *regs, unsigned long *first_frame) -{ - first_frame = first_frame ? 
: get_stack_pointer(task, regs); - - __unwind_start(state, task, regs, first_frame); -} - static inline bool unwind_error(struct unwind_state *state) { return state->error; } -#ifdef CONFIG_FRAME_POINTER - static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +void unwind_start(struct unwind_state *state, struct task_struct *task, + struct pt_regs *regs, unsigned long *first_frame) { - if (unwind_done(state)) - return NULL; + first_frame = first_frame ? : get_stack_pointer(task, regs); - return state->regs ? &state->regs->ip : state->bp + 1; + __unwind_start(state, task, regs, first_frame); } -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#if defined(CONFIG_UNWINDER_ORC) || defined(CONFIG_UNWINDER_FRAME_POINTER) +/* + * If 'partial' returns true, only the iret frame registers are valid. + */ +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, + bool *partial) { if (unwind_done(state)) return NULL; + if (partial) { +#ifdef CONFIG_UNWINDER_ORC + *partial = !state->full_regs; +#else + *partial = false; +#endif + } + return state->regs; } - -#else /* !CONFIG_FRAME_POINTER */ - -static inline -unsigned long *unwind_get_return_address_ptr(struct unwind_state *state) +#else +static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state, + bool *partial) { return NULL; } +#endif -static inline struct pt_regs *unwind_get_entry_regs(struct unwind_state *state) +#ifdef CONFIG_UNWINDER_ORC +void unwind_init(void); +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size); +#else +static inline void unwind_init(void) {} +static inline +void unwind_module_init(struct module *mod, void *orc_ip, size_t orc_ip_size, + void *orc, size_t orc_size) {} +#endif + +/* + * This disables KASAN checking when reading a value from another task's stack, + * since the other task could be running on another CPU and could have poisoned + * the stack in the meantime. + */ +#define READ_ONCE_TASK_STACK(task, x) \ +({ \ + unsigned long val; \ + if (task == current) \ + val = READ_ONCE(x); \ + else \ + val = READ_ONCE_NOCHECK(x); \ + val; \ +}) + +static inline bool task_on_another_cpu(struct task_struct *task) { - return NULL; +#ifdef CONFIG_SMP + return task != current && task->on_cpu; +#else + return false; +#endif } -#endif /* CONFIG_FRAME_POINTER */ - #endif /* _ASM_X86_UNWIND_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/unwind_hints.h +++ linux-azure-4.13.0/arch/x86/include/asm/unwind_hints.h @@ -0,0 +1,103 @@ +#ifndef _ASM_X86_UNWIND_HINTS_H +#define _ASM_X86_UNWIND_HINTS_H + +#include "orc_types.h" + +#ifdef __ASSEMBLY__ + +/* + * In asm, there are two kinds of code: normal C-type callable functions and + * the rest. The normal callable functions can be called by other code, and + * don't do anything unusual with the stack. Such normal callable functions + * are annotated with the ENTRY/ENDPROC macros. Most asm code falls in this + * category. In this case, no special debugging annotations are needed because + * objtool can automatically generate the ORC data for the ORC unwinder to read + * at runtime. + * + * Anything which doesn't fall into the above category, such as syscall and + * interrupt handlers, tends to not be called directly by other functions, and + * often does unusual non-C-function-type things with the stack pointer. Such + * code needs to be annotated such that objtool can understand it. 
The + * following CFI hint macros are for this type of code. + * + * These macros provide hints to objtool about the state of the stack at each + * instruction. Objtool starts from the hints and follows the code flow, + * making automatic CFI adjustments when it sees pushes and pops, filling out + * the debuginfo as necessary. It will also warn if it sees any + * inconsistencies. + */ +.macro UNWIND_HINT sp_reg=ORC_REG_SP sp_offset=0 type=ORC_TYPE_CALL +#ifdef CONFIG_STACK_VALIDATION +.Lunwind_hint_ip_\@: + .pushsection .discard.unwind_hints + /* struct unwind_hint */ + .long .Lunwind_hint_ip_\@ - . + .short \sp_offset + .byte \sp_reg + .byte \type + .popsection +#endif +.endm + +.macro UNWIND_HINT_EMPTY + UNWIND_HINT sp_reg=ORC_REG_UNDEFINED +.endm + +.macro UNWIND_HINT_REGS base=%rsp offset=0 indirect=0 extra=1 iret=0 + .if \base == %rsp && \indirect + .set sp_reg, ORC_REG_SP_INDIRECT + .elseif \base == %rsp + .set sp_reg, ORC_REG_SP + .elseif \base == %rbp + .set sp_reg, ORC_REG_BP + .elseif \base == %rdi + .set sp_reg, ORC_REG_DI + .elseif \base == %rdx + .set sp_reg, ORC_REG_DX + .elseif \base == %r10 + .set sp_reg, ORC_REG_R10 + .else + .error "UNWIND_HINT_REGS: bad base register" + .endif + + .set sp_offset, \offset + + .if \iret + .set type, ORC_TYPE_REGS_IRET + .elseif \extra == 0 + .set type, ORC_TYPE_REGS_IRET + .set sp_offset, \offset + (16*8) + .else + .set type, ORC_TYPE_REGS + .endif + + UNWIND_HINT sp_reg=sp_reg sp_offset=sp_offset type=type +.endm + +.macro UNWIND_HINT_IRET_REGS base=%rsp offset=0 + UNWIND_HINT_REGS base=\base offset=\offset iret=1 +.endm + +.macro UNWIND_HINT_FUNC sp_offset=8 + UNWIND_HINT sp_offset=\sp_offset +.endm + +#else /* !__ASSEMBLY__ */ + +#define UNWIND_HINT(sp_reg, sp_offset, type) \ + "987: \n\t" \ + ".pushsection .discard.unwind_hints\n\t" \ + /* struct unwind_hint */ \ + ".long 987b - .\n\t" \ + ".short " __stringify(sp_offset) "\n\t" \ + ".byte " __stringify(sp_reg) "\n\t" \ + ".byte " __stringify(type) "\n\t" \ + ".popsection\n\t" + +#define UNWIND_HINT_SAVE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_SAVE) + +#define UNWIND_HINT_RESTORE UNWIND_HINT(0, 0, UNWIND_HINT_TYPE_RESTORE) + +#endif /* __ASSEMBLY__ */ + +#endif /* _ASM_X86_UNWIND_HINTS_H */ --- linux-azure-4.13.0.orig/arch/x86/include/asm/vsyscall.h +++ linux-azure-4.13.0/arch/x86/include/asm/vsyscall.h @@ -6,6 +6,7 @@ #ifdef CONFIG_X86_VSYSCALL_EMULATION extern void map_vsyscall(void); +extern void set_vsyscall_pgtable_user_bits(pgd_t *root); /* * Called on instruction fetch fault in vsyscall page. 
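[Illustrative aside, not part of the patch.] The UNWIND_HINT macro above emits one fixed-size record per annotation into the discardable .discard.unwind_hints section, in the order .long ip, .short sp_offset, .byte sp_reg, .byte type. A minimal C view of that record follows; the struct and field names are assumptions here, not objtool's actual declarations:

#include <stdint.h>

/* Mirrors the .long/.short/.byte/.byte emission order of UNWIND_HINT.
 * The .long is PC-relative (".Lunwind_hint_ip_\@ - ."), so it must be
 * rebased against the field's own address before use. */
struct unwind_hint_record {
	int32_t ip;		/* PC-relative address of the hinted insn */
	int16_t sp_offset;	/* stack-pointer offset from sp_reg */
	uint8_t sp_reg;		/* ORC_REG_* base register */
	uint8_t type;		/* ORC_TYPE_* frame type */
};

/* Recover the absolute instruction address a record refers to. */
static inline uintptr_t unwind_hint_ip(const struct unwind_hint_record *h)
{
	return (uintptr_t)&h->ip + h->ip;
}
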
--- linux-azure-4.13.0.orig/arch/x86/include/asm/x86_init.h +++ linux-azure-4.13.0/arch/x86/include/asm/x86_init.h @@ -114,6 +114,18 @@ }; /** + * struct x86_hyper_init - x86 hypervisor init functions + * @init_platform: platform setup + * @x2apic_available: X2APIC detection + * @init_mem_mapping: setup early mappings during init_mem_mapping() + */ +struct x86_hyper_init { + void (*init_platform)(void); + bool (*x2apic_available)(void); + void (*init_mem_mapping)(void); +}; + +/** * struct x86_init_ops - functions for platform specific setup * */ @@ -126,6 +138,7 @@ struct x86_init_timers timers; struct x86_init_iommu iommu; struct x86_init_pci pci; + struct x86_hyper_init hyper; }; /** @@ -199,6 +212,15 @@ }; /** + * struct x86_hyper_runtime - x86 hypervisor specific runtime callbacks + * + * @pin_vcpu: pin current vcpu to specified physical cpu (run rarely) + */ +struct x86_hyper_runtime { + void (*pin_vcpu)(int cpu); +}; + +/** * struct x86_platform_ops - platform specific runtime functions * @calibrate_cpu: calibrate CPU * @calibrate_tsc: calibrate TSC, if different from CPU @@ -217,6 +239,7 @@ * possible in x86_early_init_platform_quirks() by * only using the current x86_hardware_subarch * semantics. + * @hyper: x86 hypervisor specific runtime callbacks */ struct x86_platform_ops { unsigned long (*calibrate_cpu)(void); @@ -232,6 +255,7 @@ void (*apic_post_init)(void); struct x86_legacy_features legacy; void (*set_legacy_features)(void); + struct x86_hyper_runtime hyper; }; struct pci_dev; --- linux-azure-4.13.0.orig/arch/x86/include/asm/xen/hypercall.h +++ linux-azure-4.13.0/arch/x86/include/asm/xen/hypercall.h @@ -44,6 +44,7 @@ #include #include #include +#include #include #include @@ -113,10 +114,9 @@ register unsigned long __arg2 asm(__HYPERCALL_ARG2REG) = __arg2; \ register unsigned long __arg3 asm(__HYPERCALL_ARG3REG) = __arg3; \ register unsigned long __arg4 asm(__HYPERCALL_ARG4REG) = __arg4; \ - register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; \ - register void *__sp asm(_ASM_SP); + register unsigned long __arg5 asm(__HYPERCALL_ARG5REG) = __arg5; -#define __HYPERCALL_0PARAM "=r" (__res), "+r" (__sp) +#define __HYPERCALL_0PARAM "=r" (__res), ASM_CALL_CONSTRAINT #define __HYPERCALL_1PARAM __HYPERCALL_0PARAM, "+r" (__arg1) #define __HYPERCALL_2PARAM __HYPERCALL_1PARAM, "+r" (__arg2) #define __HYPERCALL_3PARAM __HYPERCALL_2PARAM, "+r" (__arg3) @@ -218,9 +218,9 @@ __HYPERCALL_5ARG(a1, a2, a3, a4, a5); stac(); - asm volatile("call *%[call]" + asm volatile(CALL_NOSPEC : __HYPERCALL_5PARAM - : [call] "a" (&hypercall_page[call]) + : [thunk_target] "a" (&hypercall_page[call]) : __HYPERCALL_CLOBBER5); clac(); @@ -552,6 +552,8 @@ MULTI_update_descriptor(struct multicall_entry *mcl, u64 maddr, struct desc_struct desc) { + u32 *p = (u32 *) &desc; + mcl->op = __HYPERVISOR_update_descriptor; if (sizeof(maddr) == sizeof(long)) { mcl->args[0] = maddr; @@ -559,8 +561,8 @@ } else { mcl->args[0] = maddr; mcl->args[1] = maddr >> 32; - mcl->args[2] = desc.a; - mcl->args[3] = desc.b; + mcl->args[2] = *p++; + mcl->args[3] = *p; } trace_xen_mc_entry(mcl, sizeof(maddr) == sizeof(long) ? 
2 : 4); --- linux-azure-4.13.0.orig/arch/x86/include/uapi/asm/hyperv.h +++ linux-azure-4.13.0/arch/x86/include/uapi/asm/hyperv.h @@ -149,6 +149,9 @@ */ #define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) +/* Recommend using the newer ExProcessorMasks interface */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + /* * HV_VP_SET available */ @@ -242,7 +245,11 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d @@ -259,6 +266,35 @@ #define HV_PROCESSOR_POWER_STATE_C2 2 #define HV_PROCESSOR_POWER_STATE_C3 3 +/* Hypercall interface */ +union hv_hypercall_input { + u64 as_uint64; + struct { + __u32 as_uint32_lo; + __u32 as_uint32_hi; + }; + struct { + __u64 code:16; + __u64 fast:1; + __u64 varhead_size:10; + __u64 reserved1:5; + __u64 rep_count:12; + __u64 reserved2:4; + __u64 rep_start:12; + __u64 reserved3:4; + }; +}; + +#define HV_FLUSH_ALL_PROCESSORS BIT(0) +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES BIT(1) +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY BIT(2) +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT BIT(3) + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARCE_4K, + HV_GENERIC_SET_ALL, +}; + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 --- linux-azure-4.13.0.orig/arch/x86/include/uapi/asm/processor-flags.h +++ linux-azure-4.13.0/arch/x86/include/uapi/asm/processor-flags.h @@ -77,7 +77,12 @@ #define X86_CR3_PWT _BITUL(X86_CR3_PWT_BIT) #define X86_CR3_PCD_BIT 4 /* Page Cache Disable */ #define X86_CR3_PCD _BITUL(X86_CR3_PCD_BIT) -#define X86_CR3_PCID_MASK _AC(0x00000fff,UL) /* PCID Mask */ + +#define X86_CR3_PCID_BITS 12 +#define X86_CR3_PCID_MASK (_AC((1UL << X86_CR3_PCID_BITS) - 1, UL)) + +#define X86_CR3_PCID_NOFLUSH_BIT 63 /* Preserve old PCID */ +#define X86_CR3_PCID_NOFLUSH _BITULL(X86_CR3_PCID_NOFLUSH_BIT) /* * Intel CPU features in CR4 @@ -151,5 +156,8 @@ #define CX86_ARR_BASE 0xc4 #define CX86_RCR_BASE 0xdc +#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ + X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ + X86_CR0_PG) #endif /* _UAPI_ASM_X86_PROCESSOR_FLAGS_H */ --- linux-azure-4.13.0.orig/arch/x86/kernel/Makefile +++ linux-azure-4.13.0/arch/x86/kernel/Makefile @@ -24,9 +24,9 @@ KASAN_SANITIZE_head$(BITS).o := n KASAN_SANITIZE_dumpstack.o := n KASAN_SANITIZE_dumpstack_$(BITS).o := n -KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_stacktrace.o := n +KASAN_SANITIZE_paravirt.o := n -OBJECT_FILES_NON_STANDARD_head_$(BITS).o := y OBJECT_FILES_NON_STANDARD_relocate_kernel_$(BITS).o := y OBJECT_FILES_NON_STANDARD_ftrace_$(BITS).o := y OBJECT_FILES_NON_STANDARD_test_nx.o := y @@ -126,11 +126,9 @@ obj-$(CONFIG_TRACING) += tracepoint.o obj-$(CONFIG_SCHED_MC_PRIO) += itmt.o -ifdef CONFIG_FRAME_POINTER -obj-y += unwind_frame.o -else -obj-y += unwind_guess.o -endif +obj-$(CONFIG_UNWINDER_ORC) += unwind_orc.o +obj-$(CONFIG_UNWINDER_FRAME_POINTER) += unwind_frame.o +obj-$(CONFIG_UNWINDER_GUESS) += unwind_guess.o ### # 64 bit specific files --- linux-azure-4.13.0.orig/arch/x86/kernel/acpi/wakeup_32.S +++ linux-azure-4.13.0/arch/x86/kernel/acpi/wakeup_32.S @@ -2,6 +2,7 @@ #include #include #include +#include # Copyright 2003, 2008 Pavel Machek , distribute 
under GPLv2 @@ -37,6 +38,7 @@ # jump to place where we left off movl saved_eip, %eax + ANNOTATE_RETPOLINE_SAFE jmp *%eax bogus_magic: --- linux-azure-4.13.0.orig/arch/x86/kernel/acpi/wakeup_64.S +++ linux-azure-4.13.0/arch/x86/kernel/acpi/wakeup_64.S @@ -6,6 +6,7 @@ #include #include #include +#include # Copyright 2003 Pavel Machek , distribute under GPLv2 @@ -33,6 +34,7 @@ movq saved_rbp, %rbp movq saved_rip, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax ENDPROC(wakeup_long64) --- linux-azure-4.13.0.orig/arch/x86/kernel/alternative.c +++ linux-azure-4.13.0/arch/x86/kernel/alternative.c @@ -344,9 +344,12 @@ static void __init_or_module noinline optimize_nops(struct alt_instr *a, u8 *instr) { unsigned long flags; + int i; - if (instr[0] != 0x90) - return; + for (i = 0; i < a->padlen; i++) { + if (instr[i] != 0x90) + return; + } local_irq_save(flags); add_nops(instr + (a->instrlen - a->padlen), a->padlen); --- linux-azure-4.13.0.orig/arch/x86/kernel/amd_nb.c +++ linux-azure-4.13.0/arch/x86/kernel/amd_nb.c @@ -27,6 +27,8 @@ {} }; +#define PCI_DEVICE_ID_AMD_CNB17H_F4 0x1704 + const struct pci_device_id amd_nb_misc_ids[] = { { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_K8_NB_MISC) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_10H_NB_MISC) }, @@ -37,6 +39,7 @@ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F3) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F3) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F3) }, {} }; EXPORT_SYMBOL_GPL(amd_nb_misc_ids); @@ -48,6 +51,7 @@ { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_16H_M30H_NB_F4) }, { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_17H_DF_F4) }, + { PCI_DEVICE(PCI_VENDOR_ID_AMD, PCI_DEVICE_ID_AMD_CNB17H_F4) }, {} }; @@ -402,11 +406,48 @@ } EXPORT_SYMBOL_GPL(amd_flush_garts); +static void __fix_erratum_688(void *info) +{ +#define MSR_AMD64_IC_CFG 0xC0011021 + + msr_set_bit(MSR_AMD64_IC_CFG, 3); + msr_set_bit(MSR_AMD64_IC_CFG, 14); +} + +/* Apply erratum 688 fix so machines without a BIOS fix work. 
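[Illustrative aside, not part of the patch.] The optimize_nops() change above fixes a real bug: the old code looked only at the first pad byte, so padding that was not purely NOPs could still be rewritten. The fixed precondition, restated as a standalone helper (name hypothetical):

#include <stdbool.h>
#include <stddef.h>

/* True only when every pad byte is a 1-byte NOP (0x90);
 * optimize_nops() now bails out otherwise instead of rewriting. */
static bool pad_is_all_nops(const unsigned char *instr, size_t padlen)
{
	size_t i;

	for (i = 0; i < padlen; i++)
		if (instr[i] != 0x90)
			return false;
	return true;
}
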
*/ +static __init void fix_erratum_688(void) +{ + struct pci_dev *F4; + u32 val; + + if (boot_cpu_data.x86 != 0x14) + return; + + if (!amd_northbridges.num) + return; + + F4 = node_to_amd_nb(0)->link; + if (!F4) + return; + + if (pci_read_config_dword(F4, 0x164, &val)) + return; + + if (val & BIT(2)) + return; + + on_each_cpu(__fix_erratum_688, NULL, 0); + + pr_info("x86/cpu/AMD: CPU erratum 688 worked around\n"); +} + static __init int init_amd_nbs(void) { amd_cache_northbridges(); amd_cache_gart(); + fix_erratum_688(); + return 0; } --- linux-azure-4.13.0.orig/arch/x86/kernel/apic/apic.c +++ linux-azure-4.13.0/arch/x86/kernel/apic/apic.c @@ -35,7 +35,6 @@ #include #include -#include #include #include #include @@ -575,11 +574,21 @@ return ~0U; } +static u32 skx_deadline_rev(void) +{ + switch (boot_cpu_data.x86_mask) { + case 0x03: return 0x01000136; + case 0x04: return 0x02000014; + } + + return ~0U; +} + static const struct x86_cpu_id deadline_match[] = { DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_HASWELL_X, hsx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_BROADWELL_X, 0x0b000020), DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_BROADWELL_XEON_D, bdx_deadline_rev), - DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_SKYLAKE_X, 0x02000014), + DEADLINE_MODEL_MATCH_FUNC( INTEL_FAM6_SKYLAKE_X, skx_deadline_rev), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_CORE, 0x22), DEADLINE_MODEL_MATCH_REV ( INTEL_FAM6_HASWELL_ULT, 0x20), @@ -599,9 +608,14 @@ static void apic_check_deadline_errata(void) { - const struct x86_cpu_id *m = x86_match_cpu(deadline_match); + const struct x86_cpu_id *m; u32 rev; + if (!boot_cpu_has(X86_FEATURE_TSC_DEADLINE_TIMER) || + boot_cpu_has(X86_FEATURE_HYPERVISOR)) + return; + + m = x86_match_cpu(deadline_match); if (!m) return; @@ -1059,9 +1073,7 @@ * interrupt lock, which is the WrongThing (tm) to do. 
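[Illustrative aside, not part of the patch.] apic_check_deadline_errata() above keys the minimum fixed microcode revision off the CPU stepping; ~0U means no known-good revision, which keeps the TSC-deadline timer disabled. The Skylake-X decision restated (helper names hypothetical, revision values copied from the hunk):

#include <stdbool.h>
#include <stdint.h>

/* Stepping -> first microcode revision with the TSC-deadline fix,
 * as in skx_deadline_rev() above. */
static uint32_t skx_min_deadline_rev(uint8_t stepping)
{
	switch (stepping) {
	case 0x03: return 0x01000136;
	case 0x04: return 0x02000014;
	}
	return ~0U;	/* unknown stepping: never trust the timer */
}

static bool skx_deadline_timer_usable(uint8_t stepping, uint32_t ucode_rev)
{
	uint32_t min_rev = skx_min_deadline_rev(stepping);

	return min_rev != ~0U && ucode_rev >= min_rev;
}
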
*/ entering_ack_irq(); - trace_local_timer_entry(LOCAL_TIMER_VECTOR); local_apic_timer_interrupt(); - trace_local_timer_exit(LOCAL_TIMER_VECTOR); exiting_irq(); set_irq_regs(old_regs); @@ -1651,7 +1663,7 @@ * under KVM */ if (max_physical_apicid > 255 || - !hypervisor_x2apic_available()) { + !x86_init.hyper.x2apic_available()) { pr_info("x2apic: IRQ remapping doesn't support X2APIC mode\n"); x2apic_disable(); return; @@ -1952,9 +1964,7 @@ u8 vector = ~regs->orig_ax; entering_irq(); - trace_spurious_apic_entry(vector); __smp_spurious_interrupt(vector); - trace_spurious_apic_exit(vector); exiting_irq(); } @@ -2008,9 +2018,7 @@ __visible void __irq_entry smp_trace_error_interrupt(struct pt_regs *regs) { entering_irq(); - trace_error_apic_entry(ERROR_APIC_VECTOR); __smp_error_interrupt(regs); - trace_error_apic_exit(ERROR_APIC_VECTOR); exiting_irq(); } --- linux-azure-4.13.0.orig/arch/x86/kernel/apic/x2apic_uv_x.c +++ linux-azure-4.13.0/arch/x86/kernel/apic/x2apic_uv_x.c @@ -920,9 +920,8 @@ /* * percpu heartbeat timer */ -static void uv_heartbeat(unsigned long ignored) +static void uv_heartbeat(struct timer_list *timer) { - struct timer_list *timer = &uv_scir_info->timer; unsigned char bits = uv_scir_info->state; /* Flip heartbeat bit: */ @@ -947,7 +946,7 @@ struct timer_list *timer = &uv_cpu_scir_info(cpu)->timer; uv_set_cpu_scir_bits(cpu, SCIR_CPU_HEARTBEAT|SCIR_CPU_ACTIVITY); - setup_pinned_timer(timer, uv_heartbeat, cpu); + timer_setup(timer, uv_heartbeat, TIMER_PINNED); timer->expires = jiffies + SCIR_CPU_HB_INTERVAL; add_timer_on(timer, cpu); uv_cpu_scir_info(cpu)->enabled = 1; --- linux-azure-4.13.0.orig/arch/x86/kernel/asm-offsets.c +++ linux-azure-4.13.0/arch/x86/kernel/asm-offsets.c @@ -16,6 +16,7 @@ #include #include #include +#include #ifdef CONFIG_XEN #include @@ -92,4 +93,13 @@ BLANK(); DEFINE(PTREGS_SIZE, sizeof(struct pt_regs)); + + /* TLB state for the entry code */ + OFFSET(TLB_STATE_user_pcid_flush_mask, tlb_state, user_pcid_flush_mask); + + /* Layout info for cpu_entry_area */ + OFFSET(CPU_ENTRY_AREA_tss, cpu_entry_area, tss); + OFFSET(CPU_ENTRY_AREA_entry_trampoline, cpu_entry_area, entry_trampoline); + OFFSET(CPU_ENTRY_AREA_entry_stack, cpu_entry_area, entry_stack_page); + DEFINE(SIZEOF_entry_stack, sizeof(struct entry_stack)); } --- linux-azure-4.13.0.orig/arch/x86/kernel/asm-offsets_32.c +++ linux-azure-4.13.0/arch/x86/kernel/asm-offsets_32.c @@ -49,13 +49,8 @@ BLANK(); /* Offset from the sysenter stack to tss.sp0 */ - DEFINE(TSS_sysenter_sp0, offsetof(struct tss_struct, x86_tss.sp0) - - offsetofend(struct tss_struct, SYSENTER_stack)); - - /* Offset from cpu_tss to SYSENTER_stack */ - OFFSET(CPU_TSS_SYSENTER_stack, tss_struct, SYSENTER_stack); - /* Size of SYSENTER_stack */ - DEFINE(SIZEOF_SYSENTER_stack, sizeof(((struct tss_struct *)0)->SYSENTER_stack)); + DEFINE(TSS_sysenter_sp0, offsetof(struct cpu_entry_area, tss.x86_tss.sp0) - + offsetofend(struct cpu_entry_area, entry_stack_page.stack)); #ifdef CONFIG_CC_STACKPROTECTOR BLANK(); --- linux-azure-4.13.0.orig/arch/x86/kernel/asm-offsets_64.c +++ linux-azure-4.13.0/arch/x86/kernel/asm-offsets_64.c @@ -20,9 +20,11 @@ int main(void) { #ifdef CONFIG_PARAVIRT - OFFSET(PV_IRQ_adjust_exception_frame, pv_irq_ops, adjust_exception_frame); OFFSET(PV_CPU_usergs_sysret64, pv_cpu_ops, usergs_sysret64); OFFSET(PV_CPU_swapgs, pv_cpu_ops, swapgs); +#ifdef CONFIG_DEBUG_ENTRY + OFFSET(PV_IRQ_save_fl, pv_irq_ops, save_fl); +#endif BLANK(); #endif @@ -63,6 +65,7 @@ OFFSET(TSS_ist, tss_struct, x86_tss.ist); OFFSET(TSS_sp0, tss_struct, 
x86_tss.sp0); + OFFSET(TSS_sp1, tss_struct, x86_tss.sp1); BLANK(); #ifdef CONFIG_CC_STACKPROTECTOR --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/Makefile +++ linux-azure-4.13.0/arch/x86/kernel/cpu/Makefile @@ -21,7 +21,8 @@ obj-y += rdrand.o obj-y += match.o obj-y += bugs.o -obj-$(CONFIG_CPU_FREQ) += aperfmperf.o +obj-y += aperfmperf.o +obj-y += cpuid-deps.o obj-$(CONFIG_PROC_FS) += proc.o obj-$(CONFIG_X86_FEATURE_NAMES) += capflags.o powerflags.o @@ -33,7 +34,7 @@ obj-$(CONFIG_CPU_SUP_TRANSMETA_32) += transmeta.o obj-$(CONFIG_CPU_SUP_UMC_32) += umc.o -obj-$(CONFIG_INTEL_RDT_A) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_schemata.o +obj-$(CONFIG_INTEL_RDT) += intel_rdt.o intel_rdt_rdtgroup.o intel_rdt_monitor.o intel_rdt_ctrlmondata.o obj-$(CONFIG_X86_MCE) += mcheck/ obj-$(CONFIG_MTRR) += mtrr/ --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/amd.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/amd.c @@ -760,8 +760,11 @@ case 0x15: init_amd_bd(c); break; } - /* Enable workaround for FXSAVE leak */ - if (c->x86 >= 6) + /* + * Enable workaround for FXSAVE leak on CPUs + * without a XSaveErPtr feature + */ + if ((c->x86 >= 6) && (!cpu_has(c, X86_FEATURE_XSAVEERPTR))) set_cpu_bug(c, X86_BUG_FXSAVE_LEAK); cpu_detect_cache_sizes(c); @@ -782,8 +785,32 @@ set_cpu_cap(c, X86_FEATURE_K8); if (cpu_has(c, X86_FEATURE_XMM2)) { - /* MFENCE stops RDTSC speculation */ - set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + unsigned long long val; + int ret; + + /* + * A serializing LFENCE has less overhead than MFENCE, so + * use it for execution serialization. On families which + * don't have that MSR, LFENCE is already serializing. + * msr_set_bit() uses the safe accessors, too, even if the MSR + * is not present. + */ + msr_set_bit(MSR_F10H_DECFG, + MSR_F10H_DECFG_LFENCE_SERIALIZE_BIT); + + /* + * Verify that the MSR write was successful (could be running + * under a hypervisor) and only then assume that LFENCE is + * serializing. + */ + ret = rdmsrl_safe(MSR_F10H_DECFG, &val); + if (!ret && (val & MSR_F10H_DECFG_LFENCE_SERIALIZE)) { + /* A serializing LFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_LFENCE_RDTSC); + } else { + /* MFENCE stops RDTSC speculation */ + set_cpu_cap(c, X86_FEATURE_MFENCE_RDTSC); + } } /* @@ -803,6 +830,45 @@ /* AMD CPUs don't reset SS attributes on SYSRET, Xen does. */ if (!cpu_has(c, X86_FEATURE_XENPV)) set_cpu_bug(c, X86_BUG_SYSRET_SS_ATTRS); + + /* AMD speculative control support */ + if (cpu_has(c, X86_FEATURE_SPEC_CTRL)) { + pr_info_once("FEATURE SPEC_CTRL Present\n"); + set_ibrs_supported(); + set_ibpb_supported(); + if (ibrs_inuse) + sysctl_ibrs_enabled = 1; + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; + } else if (cpu_has(c, X86_FEATURE_IBPB)) { + pr_info_once("FEATURE SPEC_CTRL Not Present\n"); + pr_info_once("FEATURE IBPB Present\n"); + set_ibpb_supported(); + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; + } else { + pr_info_once("FEATURE SPEC_CTRL Not Present\n"); + pr_info_once("FEATURE IBPB Not Present\n"); + /* + * On AMD processors that do not support the speculative + * control features, IBPB type support can be achieved by + * disabling indirect branch predictor support. 
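[Illustrative aside, not part of the patch.] The init_amd() hunk above prefers a serializing LFENCE over MFENCE for fencing RDTSC, but only after proving that the write to MSR_F10H_DECFG stuck, since a hypervisor may silently drop it. The read-back test restated (bit position taken from the kernel's MSR_F10H_DECFG_LFENCE_SERIALIZE definition):

#include <stdbool.h>
#include <stdint.h>

#define DECFG_LFENCE_SERIALIZE (1ULL << 1)	/* MSR 0xc0011029, bit 1 */

/* rdmsr_err is the return value of the safe MSR read done after
 * msr_set_bit(); only a successful read with the bit set makes
 * LFENCE serializing, otherwise fall back to MFENCE for RDTSC. */
static bool use_lfence_for_rdtsc(int rdmsr_err, uint64_t decfg)
{
	return !rdmsr_err && (decfg & DECFG_LFENCE_SERIALIZE);
}
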
+ */ + if (!ibpb_disabled) { + u64 val; + + switch (c->x86) { + case 0x10: + case 0x12: + case 0x16: + pr_info_once("Disabling indirect branch predictor support\n"); + rdmsrl(MSR_F15H_IC_CFG, val); + val |= MSR_F15H_IC_CFG_DIS_IND; + wrmsrl(MSR_F15H_IC_CFG, val); + break; + } + } + } } #ifdef CONFIG_X86_32 --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/aperfmperf.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/aperfmperf.c @@ -14,6 +14,8 @@ #include #include +#include "cpu.h" + struct aperfmperf_sample { unsigned int khz; ktime_t time; @@ -24,7 +26,7 @@ static DEFINE_PER_CPU(struct aperfmperf_sample, samples); #define APERFMPERF_CACHE_THRESHOLD_MS 10 -#define APERFMPERF_REFRESH_DELAY_MS 20 +#define APERFMPERF_REFRESH_DELAY_MS 10 #define APERFMPERF_STALE_THRESHOLD_MS 1000 /* @@ -38,14 +40,8 @@ u64 aperf, aperf_delta; u64 mperf, mperf_delta; struct aperfmperf_sample *s = this_cpu_ptr(&samples); - ktime_t now = ktime_get(); - s64 time_delta = ktime_ms_delta(now, s->time); unsigned long flags; - /* Don't bother re-computing within the cache threshold time. */ - if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) - return; - local_irq_save(flags); rdmsrl(MSR_IA32_APERF, aperf); rdmsrl(MSR_IA32_MPERF, mperf); @@ -61,31 +57,68 @@ if (mperf_delta == 0) return; - s->time = now; + s->time = ktime_get(); s->aperf = aperf; s->mperf = mperf; + s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); +} + +static bool aperfmperf_snapshot_cpu(int cpu, ktime_t now, bool wait) +{ + s64 time_delta = ktime_ms_delta(now, per_cpu(samples.time, cpu)); - /* If the previous iteration was too long ago, discard it. */ - if (time_delta > APERFMPERF_STALE_THRESHOLD_MS) - s->khz = 0; - else - s->khz = div64_u64((cpu_khz * aperf_delta), mperf_delta); + /* Don't bother re-computing within the cache threshold time. */ + if (time_delta < APERFMPERF_CACHE_THRESHOLD_MS) + return true; + + smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, wait); + + /* Return false if the previous iteration was too long ago. 
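[Illustrative aside, not part of the patch.] The snapshot code above converts APERF/MPERF deltas into an effective frequency: MPERF counts at a fixed reference rate while APERF counts actual cycles, so their ratio scales the base frequency. Standalone restatement:

#include <stdint.h>

/* khz = base_khz * d_aperf / d_mperf, as in aperfmperf_snapshot_khz()
 * above; a zero MPERF delta means no reference ticks elapsed and the
 * sample carries no information. */
static uint64_t effective_khz(uint64_t base_khz,
			      uint64_t aperf_delta, uint64_t mperf_delta)
{
	if (!mperf_delta)
		return 0;
	return base_khz * aperf_delta / mperf_delta;
}
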
*/ + return time_delta <= APERFMPERF_STALE_THRESHOLD_MS; } -unsigned int arch_freq_get_on_cpu(int cpu) +unsigned int aperfmperf_get_khz(int cpu) +{ + if (!cpu_khz) + return 0; + + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return 0; + + aperfmperf_snapshot_cpu(cpu, ktime_get(), true); + return per_cpu(samples.khz, cpu); +} + +void arch_freq_prepare_all(void) { - unsigned int khz; + ktime_t now = ktime_get(); + bool wait = false; + int cpu; + + if (!cpu_khz) + return; + if (!static_cpu_has(X86_FEATURE_APERFMPERF)) + return; + + for_each_online_cpu(cpu) + if (!aperfmperf_snapshot_cpu(cpu, now, false)) + wait = true; + + if (wait) + msleep(APERFMPERF_REFRESH_DELAY_MS); +} + +unsigned int arch_freq_get_on_cpu(int cpu) +{ if (!cpu_khz) return 0; if (!static_cpu_has(X86_FEATURE_APERFMPERF)) return 0; - smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); - khz = per_cpu(samples.khz, cpu); - if (khz) - return khz; + if (aperfmperf_snapshot_cpu(cpu, ktime_get(), true)) + return per_cpu(samples.khz, cpu); msleep(APERFMPERF_REFRESH_DELAY_MS); smp_call_function_single(cpu, aperfmperf_snapshot_khz, NULL, 1); --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/bugs.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/bugs.c @@ -9,6 +9,11 @@ */ #include #include +#include +#include + +#include +#include #include #include #include @@ -18,6 +23,9 @@ #include #include #include +#include + +static void __init spectre_v2_select_mitigation(void); void __init check_bugs(void) { @@ -28,6 +36,9 @@ print_cpu_info(&boot_cpu_data); } + /* Select the proper spectre mitigation before patching alternatives */ + spectre_v2_select_mitigation(); + #ifdef CONFIG_X86_32 /* * Check whether we are able to run this kernel safely on SMP. @@ -59,3 +70,236 @@ set_memory_4k((unsigned long)__va(0), 1); #endif } + +/* The kernel command line selection */ +enum spectre_v2_mitigation_cmd { + SPECTRE_V2_CMD_NONE, + SPECTRE_V2_CMD_AUTO, + SPECTRE_V2_CMD_FORCE, + SPECTRE_V2_CMD_RETPOLINE, + SPECTRE_V2_CMD_RETPOLINE_GENERIC, + SPECTRE_V2_CMD_RETPOLINE_AMD, +}; + +static const char *spectre_v2_strings[] = { + [SPECTRE_V2_NONE] = "Vulnerable", + [SPECTRE_V2_RETPOLINE_MINIMAL] = "Vulnerable: Minimal generic ASM retpoline", + [SPECTRE_V2_RETPOLINE_MINIMAL_AMD] = "Vulnerable: Minimal AMD ASM retpoline", + [SPECTRE_V2_RETPOLINE_GENERIC] = "Mitigation: Full generic retpoline", + [SPECTRE_V2_RETPOLINE_AMD] = "Mitigation: Full AMD retpoline", +}; + +#undef pr_fmt +#define pr_fmt(fmt) "Spectre V2 mitigation: " fmt + +static enum spectre_v2_mitigation spectre_v2_enabled = SPECTRE_V2_NONE; + +static void __init spec2_print_if_insecure(const char *reason) +{ + if (boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static void __init spec2_print_if_secure(const char *reason) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + pr_info("%s\n", reason); +} + +static inline bool retp_compiler(void) +{ + return __is_defined(RETPOLINE); +} + +static inline bool match_option(const char *arg, int arglen, const char *opt) +{ + int len = strlen(opt); + + return len == arglen && !strncmp(arg, opt, len); +} + +static enum spectre_v2_mitigation_cmd __init spectre_v2_parse_cmdline(void) +{ + char arg[20]; + int ret; + + ret = cmdline_find_option(boot_command_line, "spectre_v2", arg, + sizeof(arg)); + if (ret > 0) { + if (match_option(arg, ret, "off")) { + goto disable; + } else if (match_option(arg, ret, "on")) { + spec2_print_if_secure("force enabled on command line."); + return SPECTRE_V2_CMD_FORCE; + } else if (match_option(arg, ret, 
"retpoline")) { + spec2_print_if_insecure("retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE; + } else if (match_option(arg, ret, "retpoline,amd")) { + if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD) { + pr_err("retpoline,amd selected but CPU is not AMD. Switching to AUTO select\n"); + return SPECTRE_V2_CMD_AUTO; + } + spec2_print_if_insecure("AMD retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_AMD; + } else if (match_option(arg, ret, "retpoline,generic")) { + spec2_print_if_insecure("generic retpoline selected on command line."); + return SPECTRE_V2_CMD_RETPOLINE_GENERIC; + } else if (match_option(arg, ret, "auto")) { + return SPECTRE_V2_CMD_AUTO; + } + } + + if (!cmdline_find_option_bool(boot_command_line, "nospectre_v2")) + return SPECTRE_V2_CMD_AUTO; +disable: + spec2_print_if_insecure("disabled on command line."); + return SPECTRE_V2_CMD_NONE; +} + +/* Check for Skylake-like CPUs (for RSB handling) */ +static bool __init is_skylake_era(void) +{ + if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && + boot_cpu_data.x86 == 6) { + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_SKYLAKE_MOBILE: + case INTEL_FAM6_SKYLAKE_DESKTOP: + case INTEL_FAM6_SKYLAKE_X: + case INTEL_FAM6_KABYLAKE_MOBILE: + case INTEL_FAM6_KABYLAKE_DESKTOP: + return true; + } + } + return false; +} + +static void __init spectre_v2_select_mitigation(void) +{ + enum spectre_v2_mitigation_cmd cmd = spectre_v2_parse_cmdline(); + enum spectre_v2_mitigation mode = SPECTRE_V2_NONE; + + /* + * If the CPU is not affected and the command line mode is NONE or AUTO + * then nothing to do. + */ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2) && + (cmd == SPECTRE_V2_CMD_NONE || cmd == SPECTRE_V2_CMD_AUTO)) + return; + + switch (cmd) { + case SPECTRE_V2_CMD_NONE: + return; + + case SPECTRE_V2_CMD_FORCE: + /* FALLTRHU */ + case SPECTRE_V2_CMD_AUTO: + goto retpoline_auto; + + case SPECTRE_V2_CMD_RETPOLINE_AMD: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_amd; + break; + case SPECTRE_V2_CMD_RETPOLINE_GENERIC: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_generic; + break; + case SPECTRE_V2_CMD_RETPOLINE: + if (IS_ENABLED(CONFIG_RETPOLINE)) + goto retpoline_auto; + break; + } + pr_err("kernel not compiled with retpoline; no mitigation available!"); + return; + +retpoline_auto: + if (boot_cpu_data.x86_vendor == X86_VENDOR_AMD) { + retpoline_amd: + if (!boot_cpu_has(X86_FEATURE_LFENCE_RDTSC)) { + pr_err("LFENCE not serializing. Switching to generic retpoline\n"); + goto retpoline_generic; + } + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_AMD : + SPECTRE_V2_RETPOLINE_MINIMAL_AMD; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE_AMD); + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } else { + retpoline_generic: + mode = retp_compiler() ? SPECTRE_V2_RETPOLINE_GENERIC : + SPECTRE_V2_RETPOLINE_MINIMAL; + setup_force_cpu_cap(X86_FEATURE_RETPOLINE); + } + + spectre_v2_enabled = mode; + pr_info("%s\n", spectre_v2_strings[mode]); + + pr_info("Speculation control IBPB %s IBRS %s", + ibpb_supported ? "supported" : "not-supported", + ibrs_supported ? "supported" : "not-supported"); + + /* + * If we have a full retpoline mode and then disable IBPB in kernel mode + * we do not require both. + */ + if (mode == SPECTRE_V2_RETPOLINE_AMD || + mode == SPECTRE_V2_RETPOLINE_GENERIC) + { + if (ibrs_supported) { + pr_info("Retpoline compiled kernel. 
Defaulting IBRS to disabled"); + set_ibrs_disabled(); + if (!ibrs_inuse) + sysctl_ibrs_enabled = 0; + } + } + + /* + * If neither SMEP or KPTI are available, there is a risk of + * hitting userspace addresses in the RSB after a context switch + * from a shallow call stack to a deeper one. To prevent this fill + * the entire RSB, even when using IBRS. + * + * Skylake era CPUs have a separate issue with *underflow* of the + * RSB, when they will predict 'ret' targets from the generic BTB. + * The proper mitigation for this is IBRS. If IBRS is not supported + * or deactivated in favour of retpolines the RSB fill on context + * switch is required. + */ + if ((!boot_cpu_has(X86_FEATURE_PTI) && + !boot_cpu_has(X86_FEATURE_SMEP)) || is_skylake_era()) { + setup_force_cpu_cap(X86_FEATURE_RSB_CTXSW); + pr_info("Filling RSB on context switch\n"); + } +} + +#undef pr_fmt + +#ifdef CONFIG_SYSFS +ssize_t cpu_show_meltdown(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_CPU_MELTDOWN)) + return sprintf(buf, "Not affected\n"); + if (boot_cpu_has(X86_FEATURE_PTI)) + return sprintf(buf, "Mitigation: PTI\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v1(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V1)) + return sprintf(buf, "Not affected\n"); + if (osb_is_enabled) + return sprintf(buf, "Mitigation: OSB (observable speculation barrier, Intel v6)\n"); + return sprintf(buf, "Vulnerable\n"); +} + +ssize_t cpu_show_spectre_v2(struct device *dev, + struct device_attribute *attr, char *buf) +{ + if (!boot_cpu_has_bug(X86_BUG_SPECTRE_V2)) + return sprintf(buf, "Not affected\n"); + + return sprintf(buf, "%s%s\n", spectre_v2_strings[spectre_v2_enabled], + ibpb_inuse ? ", IBPB (Intel v4)" : ""); +} +#endif --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/common.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/common.c @@ -168,6 +168,24 @@ } __setup("nompx", x86_mpx_setup); +#ifdef CONFIG_X86_64 +static int __init x86_nopcid_setup(char *s) +{ + /* nopcid doesn't accept parameters */ + if (s) + return -EINVAL; + + /* do not emit a message if the feature is not present */ + if (!boot_cpu_has(X86_FEATURE_PCID)) + return 0; + + setup_clear_cpu_cap(X86_FEATURE_PCID); + pr_info("nopcid: PCID feature disabled\n"); + return 0; +} +early_param("nopcid", x86_nopcid_setup); +#endif + static int __init x86_noinvpcid_setup(char *s) { /* noinvpcid doesn't accept parameters */ @@ -434,8 +452,8 @@ return NULL; /* Not found */ } -__u32 cpu_caps_cleared[NCAPINTS]; -__u32 cpu_caps_set[NCAPINTS]; +__u32 cpu_caps_cleared[NCAPINTS + NBUGINTS]; +__u32 cpu_caps_set[NCAPINTS + NBUGINTS]; void load_percpu_segment(int cpu) { @@ -448,28 +466,23 @@ load_stack_canary_segment(); } -/* Setup the fixmap mapping only once per-processor */ -static inline void setup_fixmap_gdt(int cpu) -{ -#ifdef CONFIG_X86_64 - /* On 64-bit systems, we use a read-only fixmap GDT. */ - pgprot_t prot = PAGE_KERNEL_RO; -#else - /* - * On native 32-bit systems, the GDT cannot be read-only because - * our double fault handler uses a task gate, and entering through - * a task gate needs to change an available TSS to busy. If the GDT - * is read-only, that will triple fault. - * - * On Xen PV, the GDT must be read-only because the hypervisor requires - * it. - */ - pgprot_t prot = boot_cpu_has(X86_FEATURE_XENPV) ? - PAGE_KERNEL_RO : PAGE_KERNEL; +#ifdef CONFIG_X86_32 +/* The 32-bit entry code needs to find cpu_entry_area. 
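[Illustrative aside, not part of the patch.] The tail of spectre_v2_select_mitigation() above enables RSB stuffing on context switch in exactly two situations. Restated as a predicate (parameter names are assumptions):

#include <stdbool.h>

/* Mirror of the X86_FEATURE_RSB_CTXSW condition above: without PTI
 * and SMEP, user-planted RSB entries may be consumed at kernel
 * privilege; Skylake-era parts additionally fall back to the BTB on
 * RSB underflow, so they always want the fill. */
static bool want_rsb_fill_on_ctxsw(bool have_pti, bool have_smep,
				   bool skylake_era)
{
	return (!have_pti && !have_smep) || skylake_era;
}
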
*/ +DEFINE_PER_CPU(struct cpu_entry_area *, cpu_entry_area); #endif - __set_fixmap(get_cpu_gdt_ro_index(cpu), get_cpu_gdt_paddr(cpu), prot); -} +#ifdef CONFIG_X86_64 +/* + * Special IST stacks which the CPU switches to when it calls + * an IST-marked descriptor entry. Up to 7 stacks (hardware + * limit), all of them are 4K, except the debug stack which + * is 8K. + */ +static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { + [0 ... N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, + [DEBUG_STACK - 1] = DEBUG_STKSZ +}; +#endif /* Load the original GDT from the per-cpu structure */ void load_direct_gdt(int cpu) @@ -705,7 +718,7 @@ { int i; - for (i = 0; i < NCAPINTS; i++) { + for (i = 0; i < NCAPINTS + NBUGINTS; i++) { c->x86_capability[i] &= ~cpu_caps_cleared[i]; c->x86_capability[i] |= cpu_caps_set[i]; } @@ -885,6 +898,13 @@ } setup_force_cpu_cap(X86_FEATURE_ALWAYS); + + if (c->x86_vendor != X86_VENDOR_AMD) + setup_force_cpu_bug(X86_BUG_CPU_MELTDOWN); + + setup_force_cpu_bug(X86_BUG_SPECTRE_V1); + setup_force_cpu_bug(X86_BUG_SPECTRE_V2); + fpu__init_system(c); } @@ -1199,7 +1219,7 @@ return; cpu = get_cpu(); - tss = &per_cpu(cpu_tss, cpu); + tss = &per_cpu(cpu_tss_rw, cpu); /* * We cache MSR_IA32_SYSENTER_CS's value in the TSS's ss1 field -- @@ -1208,11 +1228,7 @@ tss->x86_tss.ss1 = __KERNEL_CS; wrmsr(MSR_IA32_SYSENTER_CS, tss->x86_tss.ss1, 0); - - wrmsr(MSR_IA32_SYSENTER_ESP, - (unsigned long)tss + offsetofend(struct tss_struct, SYSENTER_stack), - 0); - + wrmsr(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1), 0); wrmsr(MSR_IA32_SYSENTER_EIP, (unsigned long)entry_SYSENTER_32, 0); put_cpu(); @@ -1275,18 +1291,16 @@ pr_cont(")\n"); } -static __init int setup_disablecpuid(char *arg) +/* + * clearcpuid= was already parsed in fpu__init_parse_early_param. + * But we need to keep a dummy __setup around otherwise it would + * show up as an environment variable for init. + */ +static __init int setup_clearcpuid(char *arg) { - int bit; - - if (get_option(&arg, &bit) && bit >= 0 && bit < NCAPINTS * 32) - setup_clear_cpu_cap(bit); - else - return 0; - return 1; } -__setup("clearcpuid=", setup_disablecpuid); +__setup("clearcpuid=", setup_clearcpuid); #ifdef CONFIG_X86_64 struct desc_ptr idt_descr __ro_after_init = { @@ -1317,25 +1331,22 @@ DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT; EXPORT_PER_CPU_SYMBOL(__preempt_count); -/* - * Special IST stacks which the CPU switches to when it calls - * an IST-marked descriptor entry. Up to 7 stacks (hardware - * limit), all of them are 4K, except the debug stack which - * is 8K. - */ -static const unsigned int exception_stack_sizes[N_EXCEPTION_STACKS] = { - [0 ... 
N_EXCEPTION_STACKS - 1] = EXCEPTION_STKSZ, - [DEBUG_STACK - 1] = DEBUG_STKSZ -}; - -static DEFINE_PER_CPU_PAGE_ALIGNED(char, exception_stacks - [(N_EXCEPTION_STACKS - 1) * EXCEPTION_STKSZ + DEBUG_STKSZ]); - /* May not be marked __init: used by software suspend */ void syscall_init(void) { + extern char _entry_trampoline[]; + extern char entry_SYSCALL_64_trampoline[]; + + int cpu = smp_processor_id(); + unsigned long SYSCALL64_entry_trampoline = + (unsigned long)get_cpu_entry_area(cpu)->entry_trampoline + + (entry_SYSCALL_64_trampoline - _entry_trampoline); + wrmsr(MSR_STAR, 0, (__USER32_CS << 16) | __KERNEL_CS); - wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); + if (static_cpu_has(X86_FEATURE_PTI)) + wrmsrl(MSR_LSTAR, SYSCALL64_entry_trampoline); + else + wrmsrl(MSR_LSTAR, (unsigned long)entry_SYSCALL_64); #ifdef CONFIG_IA32_EMULATION wrmsrl(MSR_CSTAR, (unsigned long)entry_SYSCALL_compat); @@ -1346,7 +1357,7 @@ * AMD doesn't allow SYSENTER in long mode (either 32- or 64-bit). */ wrmsrl_safe(MSR_IA32_SYSENTER_CS, (u64)__KERNEL_CS); - wrmsrl_safe(MSR_IA32_SYSENTER_ESP, 0ULL); + wrmsrl_safe(MSR_IA32_SYSENTER_ESP, (unsigned long)(cpu_entry_stack(cpu) + 1)); wrmsrl_safe(MSR_IA32_SYSENTER_EIP, (u64)entry_SYSENTER_compat); #else wrmsrl(MSR_CSTAR, (unsigned long)ignore_sysret); @@ -1490,7 +1501,7 @@ if (cpu) load_ucode_ap(); - t = &per_cpu(cpu_tss, cpu); + t = &per_cpu(cpu_tss_rw, cpu); oist = &per_cpu(orig_ist, cpu); #ifdef CONFIG_NUMA @@ -1529,7 +1540,7 @@ * set up and load the per-CPU TSS */ if (!oist->ist[0]) { - char *estacks = per_cpu(exception_stacks, cpu); + char *estacks = get_cpu_entry_area(cpu)->exception_stacks; for (v = 0; v < N_EXCEPTION_STACKS; v++) { estacks += exception_stack_sizes[v]; @@ -1540,7 +1551,7 @@ } } - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; /* * <= is required because the CPU will access up to @@ -1552,11 +1563,17 @@ mmgrab(&init_mm); me->active_mm = &init_mm; BUG_ON(me->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, me); - load_sp0(t, ¤t->thread); - set_tss_desc(cpu, t); + /* + * Initialize the TSS. sp0 points to the entry trampoline stack + * regardless of what task is running. + */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_sp0((unsigned long)(cpu_entry_stack(cpu) + 1)); + load_mm_ldt(&init_mm); clear_all_debug_regs(); @@ -1567,7 +1584,6 @@ if (is_uv_system()) uv_cpu_init(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } @@ -1577,8 +1593,7 @@ { int cpu = smp_processor_id(); struct task_struct *curr = current; - struct tss_struct *t = &per_cpu(cpu_tss, cpu); - struct thread_struct *thread = &curr->thread; + struct tss_struct *t = &per_cpu(cpu_tss_rw, cpu); wait_for_master_cpu(cpu); @@ -1606,14 +1621,19 @@ mmgrab(&init_mm); curr->active_mm = &init_mm; BUG_ON(curr->mm); + initialize_tlbstate_and_flush(); enter_lazy_tlb(&init_mm, curr); - load_sp0(t, thread); - set_tss_desc(cpu, t); + /* + * Initialize the TSS. Don't bother initializing sp0, as the initial + * task never enters user mode. 
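[Illustrative aside, not part of the patch.] syscall_init() above points MSR_LSTAR at the alias of entry_SYSCALL_64_trampoline inside the per-CPU entry area: the symbol's offset within the trampoline section is kept, only the base address changes. The address arithmetic, isolated:

#include <stdint.h>

/* LSTAR value for the PTI case: per-CPU trampoline base plus the
 * offset of the SYSCALL entry symbol within the trampoline section,
 * matching the computation in syscall_init() above. */
static uint64_t pti_lstar(uint64_t percpu_trampoline_base,
			  uint64_t sym_entry_syscall_64_trampoline,
			  uint64_t sym_entry_trampoline_start)
{
	return percpu_trampoline_base +
	       (sym_entry_syscall_64_trampoline - sym_entry_trampoline_start);
}
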
+ */ + set_tss_desc(cpu, &get_cpu_entry_area(cpu)->tss.x86_tss); load_TR_desc(); + load_mm_ldt(&init_mm); - t->x86_tss.io_bitmap_base = offsetof(struct tss_struct, io_bitmap); + t->x86_tss.io_bitmap_base = IO_BITMAP_OFFSET; #ifdef CONFIG_DOUBLEFAULT /* Set up doublefault TSS pointer in the GDT */ @@ -1625,7 +1645,6 @@ fpu__init_cpu(); - setup_fixmap_gdt(cpu); load_fixmap_gdt(cpu); } #endif --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/cpu.h +++ linux-azure-4.13.0/arch/x86/kernel/cpu/cpu.h @@ -46,4 +46,7 @@ extern void get_cpu_cap(struct cpuinfo_x86 *c); extern void cpu_detect_cache_sizes(struct cpuinfo_x86 *c); + +unsigned int aperfmperf_get_khz(int cpu); + #endif /* ARCH_X86_CPU_H */ --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/cpuid-deps.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/cpuid-deps.c @@ -0,0 +1,121 @@ +/* Declare dependencies between CPUIDs */ +#include +#include +#include +#include + +struct cpuid_dep { + unsigned int feature; + unsigned int depends; +}; + +/* + * Table of CPUID features that depend on others. + * + * This only includes dependencies that can be usefully disabled, not + * features part of the base set (like FPU). + * + * Note this all is not __init / __initdata because it can be + * called from cpu hotplug. It shouldn't do anything in this case, + * but it's difficult to tell that to the init reference checker. + */ +const static struct cpuid_dep cpuid_deps[] = { + { X86_FEATURE_XSAVEOPT, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVEC, X86_FEATURE_XSAVE }, + { X86_FEATURE_XSAVES, X86_FEATURE_XSAVE }, + { X86_FEATURE_AVX, X86_FEATURE_XSAVE }, + { X86_FEATURE_PKU, X86_FEATURE_XSAVE }, + { X86_FEATURE_MPX, X86_FEATURE_XSAVE }, + { X86_FEATURE_XGETBV1, X86_FEATURE_XSAVE }, + { X86_FEATURE_FXSR_OPT, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM, X86_FEATURE_FXSR }, + { X86_FEATURE_XMM2, X86_FEATURE_XMM }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_1, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM4_2, X86_FEATURE_XMM2 }, + { X86_FEATURE_XMM3, X86_FEATURE_XMM2 }, + { X86_FEATURE_PCLMULQDQ, X86_FEATURE_XMM2 }, + { X86_FEATURE_SSSE3, X86_FEATURE_XMM2, }, + { X86_FEATURE_F16C, X86_FEATURE_XMM2, }, + { X86_FEATURE_AES, X86_FEATURE_XMM2 }, + { X86_FEATURE_SHA_NI, X86_FEATURE_XMM2 }, + { X86_FEATURE_FMA, X86_FEATURE_AVX }, + { X86_FEATURE_AVX2, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512F, X86_FEATURE_AVX, }, + { X86_FEATURE_AVX512IFMA, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512PF, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512ER, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512CD, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512DQ, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512BW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VL, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512VBMI, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VBMI2, X86_FEATURE_AVX512VL }, + { X86_FEATURE_GFNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VAES, X86_FEATURE_AVX512VL }, + { X86_FEATURE_VPCLMULQDQ, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_VNNI, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_BITALG, X86_FEATURE_AVX512VL }, + { X86_FEATURE_AVX512_4VNNIW, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_4FMAPS, X86_FEATURE_AVX512F }, + { X86_FEATURE_AVX512_VPOPCNTDQ, X86_FEATURE_AVX512F }, + {} +}; + +static inline void clear_feature(struct cpuinfo_x86 *c, unsigned int feature) +{ + /* + * Note: This could use the non atomic __*_bit() variants, but the + * rest of the cpufeature code uses atomics as well, so keep it for + * consistency. Cleanup all of it separately. 
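[Illustrative aside, not part of the patch.] do_clear_cpu_cap() below turns the dependency table above into a transitive closure: clearing one feature clears everything that depends on it, iterated to a fixed point. A compact model of that walk over an assumed dense feature space:

#include <stdbool.h>
#include <stddef.h>

struct dep { int feature, depends; };

/* disabled[] holds one flag per feature bit; deps[] is the table
 * above. Each pass disables features whose dependency is already
 * disabled, until a pass makes no change -- the same fixed-point
 * loop the patch uses. */
static void disable_with_deps(const struct dep *deps, size_t ndeps,
			      bool *disabled, int feature)
{
	bool changed;
	size_t i;

	disabled[feature] = true;
	do {
		changed = false;
		for (i = 0; i < ndeps; i++) {
			if (!disabled[deps[i].depends] ||
			    disabled[deps[i].feature])
				continue;
			disabled[deps[i].feature] = true;
			changed = true;
		}
	} while (changed);
}
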
+ */ + if (!c) { + clear_cpu_cap(&boot_cpu_data, feature); + set_bit(feature, (unsigned long *)cpu_caps_cleared); + } else { + clear_bit(feature, (unsigned long *)c->x86_capability); + } +} + +/* Take the capabilities and the BUG bits into account */ +#define MAX_FEATURE_BITS ((NCAPINTS + NBUGINTS) * sizeof(u32) * 8) + +static void do_clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + DECLARE_BITMAP(disable, MAX_FEATURE_BITS); + const struct cpuid_dep *d; + bool changed; + + if (WARN_ON(feature >= MAX_FEATURE_BITS)) + return; + + clear_feature(c, feature); + + /* Collect all features to disable, handling dependencies */ + memset(disable, 0, sizeof(disable)); + __set_bit(feature, disable); + + /* Loop until we get a stable state. */ + do { + changed = false; + for (d = cpuid_deps; d->feature; d++) { + if (!test_bit(d->depends, disable)) + continue; + if (__test_and_set_bit(d->feature, disable)) + continue; + + changed = true; + clear_feature(c, d->feature); + } + } while (changed); +} + +void clear_cpu_cap(struct cpuinfo_x86 *c, unsigned int feature) +{ + do_clear_cpu_cap(c, feature); +} + +void setup_clear_cpu_cap(unsigned int feature) +{ + do_clear_cpu_cap(NULL, feature); +} --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/hypervisor.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/hypervisor.c @@ -26,6 +26,12 @@ #include #include +extern const struct hypervisor_x86 x86_hyper_vmware; +extern const struct hypervisor_x86 x86_hyper_ms_hyperv; +extern const struct hypervisor_x86 x86_hyper_xen_pv; +extern const struct hypervisor_x86 x86_hyper_xen_hvm; +extern const struct hypervisor_x86 x86_hyper_kvm; + static const __initconst struct hypervisor_x86 * const hypervisors[] = { #ifdef CONFIG_XEN_PV @@ -41,54 +47,52 @@ #endif }; -const struct hypervisor_x86 *x86_hyper; -EXPORT_SYMBOL(x86_hyper); +enum x86_hypervisor_type x86_hyper_type; +EXPORT_SYMBOL(x86_hyper_type); -static inline void __init +static inline const struct hypervisor_x86 * __init detect_hypervisor_vendor(void) { - const struct hypervisor_x86 *h, * const *p; + const struct hypervisor_x86 *h = NULL, * const *p; uint32_t pri, max_pri = 0; for (p = hypervisors; p < hypervisors + ARRAY_SIZE(hypervisors); p++) { - h = *p; - pri = h->detect(); - if (pri != 0 && pri > max_pri) { + pri = (*p)->detect(); + if (pri > max_pri) { max_pri = pri; - x86_hyper = h; + h = *p; } } - if (max_pri) - pr_info("Hypervisor detected: %s\n", x86_hyper->name); -} - -void __init init_hypervisor_platform(void) -{ - - detect_hypervisor_vendor(); + if (h) + pr_info("Hypervisor detected: %s\n", h->name); - if (!x86_hyper) - return; - - if (x86_hyper->init_platform) - x86_hyper->init_platform(); + return h; } -bool __init hypervisor_x2apic_available(void) +static void __init copy_array(const void *src, void *target, unsigned int size) { - return x86_hyper && - x86_hyper->x2apic_available && - x86_hyper->x2apic_available(); + unsigned int i, n = size / sizeof(void *); + const void * const *from = (const void * const *)src; + const void **to = (const void **)target; + + for (i = 0; i < n; i++) + if (from[i]) + to[i] = from[i]; } -void hypervisor_pin_vcpu(int cpu) +void __init init_hypervisor_platform(void) { - if (!x86_hyper) + const struct hypervisor_x86 *h; + + h = detect_hypervisor_vendor(); + + if (!h) return; - if (x86_hyper->pin_vcpu) - x86_hyper->pin_vcpu(cpu); - else - WARN_ONCE(1, "vcpu pinning requested but not supported!\n"); + copy_array(&h->init, &x86_init.hyper, sizeof(h->init)); + copy_array(&h->runtime, &x86_platform.hyper, 
sizeof(h->runtime)); + + x86_hyper_type = h->type; + x86_init.hyper.init_platform(); } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel.c @@ -627,6 +627,20 @@ init_intel_energy_perf(c); init_intel_misc_features(c); + + if (!c->cpu_index) { + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { + printk(KERN_INFO "FEATURE SPEC_CTRL Present\n"); + set_ibrs_supported(); + set_ibpb_supported(); + if (ibrs_inuse) + sysctl_ibrs_enabled = 1; + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; + } else { + printk(KERN_INFO "FEATURE SPEC_CTRL Not Present\n"); + } + } } #ifdef CONFIG_X86_32 --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel_cacheinfo.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel_cacheinfo.c @@ -811,7 +811,24 @@ struct cacheinfo *this_leaf; int i, sibling; - if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { + /* + * For L3, always use the pre-calculated cpu_llc_shared_mask + * to derive shared_cpu_map. + */ + if (index == 3) { + for_each_cpu(i, cpu_llc_shared_mask(cpu)) { + this_cpu_ci = get_cpu_cacheinfo(i); + if (!this_cpu_ci->info_list) + continue; + this_leaf = this_cpu_ci->info_list + index; + for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { + if (!cpu_online(sibling)) + continue; + cpumask_set_cpu(sibling, + &this_leaf->shared_cpu_map); + } + } + } else if (boot_cpu_has(X86_FEATURE_TOPOEXT)) { unsigned int apicid, nshared, first, last; this_leaf = this_cpu_ci->info_list + index; @@ -837,19 +854,6 @@ continue; cpumask_set_cpu(sibling, &this_leaf->shared_cpu_map); - } - } - } else if (index == 3) { - for_each_cpu(i, cpu_llc_shared_mask(cpu)) { - this_cpu_ci = get_cpu_cacheinfo(i); - if (!this_cpu_ci->info_list) - continue; - this_leaf = this_cpu_ci->info_list + index; - for_each_cpu(sibling, cpu_llc_shared_mask(cpu)) { - if (!cpu_online(sibling)) - continue; - cpumask_set_cpu(sibling, - &this_leaf->shared_cpu_map); } } } else --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel_rdt.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel_rdt.c @@ -30,7 +30,8 @@ #include #include -#include +#include +#include "intel_rdt.h" #define MAX_MBA_BW 100u #define MBA_IS_LINEAR 0x4 @@ -38,7 +39,13 @@ /* Mutex to protect rdtgroup access. */ DEFINE_MUTEX(rdtgroup_mutex); -DEFINE_PER_CPU_READ_MOSTLY(int, cpu_closid); +/* + * The cached intel_pqr_state is strictly per CPU and can never be + * updated from a remote CPU. Functions which modify the state + * are called with interrupts disabled and no preemption, which + * is sufficient for the protection. + */ +DEFINE_PER_CPU(struct intel_pqr_state, pqr_state); /* * Used to store the max resource name width and max resource data width @@ -46,6 +53,12 @@ */ int max_name_width, max_data_width; +/* + * Global boolean for rdt_alloc which is true if any + * resource allocation is enabled. 
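[Illustrative aside, not part of the patch.] copy_array() above treats the hypervisor ops structures as flat arrays of function pointers and overlays only the slots a hypervisor actually fills in, so unset callbacks keep their x86_init/x86_platform defaults. Restated:

#include <stddef.h>

/* Copy non-NULL pointer slots from src over dst; a NULL slot in src
 * means "keep the existing default". Both structs must consist
 * solely of pointers for this aliasing to be valid, as in the patch. */
static void overlay_ops(const void *src, void *dst, size_t size)
{
	const void * const *from = src;
	const void **to = dst;
	size_t i, n = size / sizeof(void *);

	for (i = 0; i < n; i++)
		if (from[i])
			to[i] = from[i];
}
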
+ */ +bool rdt_alloc_capable; + static void mba_wrmsr(struct rdt_domain *d, struct msr_param *m, struct rdt_resource *r); static void @@ -54,7 +67,9 @@ #define domain_init(id) LIST_HEAD_INIT(rdt_resources_all[id].domains) struct rdt_resource rdt_resources_all[] = { + [RDT_RESOURCE_L3] = { + .rid = RDT_RESOURCE_L3, .name = "L3", .domains = domain_init(RDT_RESOURCE_L3), .msr_base = IA32_L3_CBM_BASE, @@ -67,8 +82,11 @@ }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3DATA] = { + .rid = RDT_RESOURCE_L3DATA, .name = "L3DATA", .domains = domain_init(RDT_RESOURCE_L3DATA), .msr_base = IA32_L3_CBM_BASE, @@ -81,8 +99,11 @@ }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L3CODE] = { + .rid = RDT_RESOURCE_L3CODE, .name = "L3CODE", .domains = domain_init(RDT_RESOURCE_L3CODE), .msr_base = IA32_L3_CBM_BASE, @@ -95,8 +116,11 @@ }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_L2] = { + .rid = RDT_RESOURCE_L2, .name = "L2", .domains = domain_init(RDT_RESOURCE_L2), .msr_base = IA32_L2_CBM_BASE, @@ -109,8 +133,11 @@ }, .parse_ctrlval = parse_cbm, .format_str = "%d=%0*x", + .fflags = RFTYPE_RES_CACHE, }, + [RDT_RESOURCE_MBA] = { + .rid = RDT_RESOURCE_MBA, .name = "MB", .domains = domain_init(RDT_RESOURCE_MBA), .msr_base = IA32_MBA_THRTL_BASE, @@ -118,6 +145,7 @@ .cache_level = 3, .parse_ctrlval = parse_bw, .format_str = "%d=%*d", + .fflags = RFTYPE_RES_MB, }, }; @@ -144,33 +172,28 @@ * is always 20 on hsw server parts. The minimum cache bitmask length * allowed for HSW server is always 2 bits. Hardcode all of them. */ -static inline bool cache_alloc_hsw_probe(void) +static inline void cache_alloc_hsw_probe(void) { - if (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL && - boot_cpu_data.x86 == 6 && - boot_cpu_data.x86_model == INTEL_FAM6_HASWELL_X) { - struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - u32 l, h, max_cbm = BIT_MASK(20) - 1; - - if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) - return false; - rdmsr(IA32_L3_CBM_BASE, l, h); + struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; + u32 l, h, max_cbm = BIT_MASK(20) - 1; - /* If all the bits were set in MSR, return success */ - if (l != max_cbm) - return false; + if (wrmsr_safe(IA32_L3_CBM_BASE, max_cbm, 0)) + return; + rdmsr(IA32_L3_CBM_BASE, l, h); - r->num_closid = 4; - r->default_ctrl = max_cbm; - r->cache.cbm_len = 20; - r->cache.min_cbm_bits = 2; - r->capable = true; - r->enabled = true; + /* If all the bits were set in MSR, return success */ + if (l != max_cbm) + return; - return true; - } + r->num_closid = 4; + r->default_ctrl = max_cbm; + r->cache.cbm_len = 20; + r->cache.shareable_bits = 0xc0000; + r->cache.min_cbm_bits = 2; + r->alloc_capable = true; + r->alloc_enabled = true; - return false; + rdt_alloc_capable = true; } /* @@ -213,15 +236,14 @@ return false; } r->data_width = 3; - rdt_get_mba_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; return true; } -static void rdt_get_cache_config(int idx, struct rdt_resource *r) +static void rdt_get_cache_alloc_cfg(int idx, struct rdt_resource *r) { union cpuid_0x10_1_eax eax; union cpuid_0x10_x_edx edx; @@ -231,10 +253,10 @@ r->num_closid = edx.split.cos_max + 1; r->cache.cbm_len = eax.split.cbm_len + 1; r->default_ctrl = BIT_MASK(eax.split.cbm_len + 1) - 1; + r->cache.shareable_bits = ebx & r->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 
4; - rdt_get_cache_infofile(r); - r->capable = true; - r->enabled = true; + r->alloc_capable = true; + r->alloc_enabled = true; } static void rdt_get_cdp_l3_config(int type) @@ -246,12 +268,12 @@ r->cache.cbm_len = r_l3->cache.cbm_len; r->default_ctrl = r_l3->default_ctrl; r->data_width = (r->cache.cbm_len + 3) / 4; - r->capable = true; + r->alloc_capable = true; /* * By default, CDP is disabled. CDP can be enabled by mount parameter * "cdp" during resctrl file system mount time. */ - r->enabled = false; + r->alloc_enabled = false; } static int get_cache_id(int cpu, int level) @@ -300,6 +322,19 @@ wrmsrl(r->msr_base + cbm_idx(r, i), d->ctrl_val[i]); } +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r) +{ + struct rdt_domain *d; + + list_for_each_entry(d, &r->domains, list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->cpu_mask)) + return d; + } + + return NULL; +} + void rdt_ctrl_update(void *arg) { struct msr_param *m = arg; @@ -307,12 +342,10 @@ int cpu = smp_processor_id(); struct rdt_domain *d; - list_for_each_entry(d, &r->domains, list) { - /* Find the domain that contains this CPU */ - if (cpumask_test_cpu(cpu, &d->cpu_mask)) { - r->msr_update(d, m, r); - return; - } + d = get_domain_from_cpu(cpu, r); + if (d) { + r->msr_update(d, m, r); + return; } pr_warn_once("cpu %d not found in any domain for resource %s\n", cpu, r->name); @@ -326,8 +359,8 @@ * caller, return the first domain whose id is bigger than the input id. * The domain list is sorted by id in ascending order. */ -static struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, - struct list_head **pos) +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos) { struct rdt_domain *d; struct list_head *l; @@ -377,6 +410,44 @@ return 0; } +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_domain *d) +{ + size_t tsize; + + if (is_llc_occupancy_enabled()) { + d->rmid_busy_llc = kcalloc(BITS_TO_LONGS(r->num_rmid), + sizeof(unsigned long), + GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + } + if (is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_total) { + kfree(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(r->num_rmid, tsize, GFP_KERNEL); + if (!d->mbm_local) { + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + if (is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL); + } + + return 0; +} + /* * domain_add_cpu - Add a cpu to a resource's domain list. * @@ -412,14 +483,26 @@ return; d->id = id; + cpumask_set_cpu(cpu, &d->cpu_mask); - if (domain_setup_ctrlval(r, d)) { + if (r->alloc_capable && domain_setup_ctrlval(r, d)) { + kfree(d); + return; + } + + if (r->mon_capable && domain_setup_mon_state(r, d)) { kfree(d); return; } - cpumask_set_cpu(cpu, &d->cpu_mask); list_add_tail(&d->list, add_pos); + + /* + * If resctrl is mounted, add + * per domain monitor data directories. 
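+ *
+ * For example (illustrative paths; the directories are named
+ * mon_L<cache level>_<domain id>), onlining the first CPU of L3 domain 1
+ * while resctrl is mounted adds, for every resource group:
+ *
+ *	mon_data/mon_L3_01/llc_occupancy
+ *	mon_data/mon_L3_01/mbm_total_bytes
+ *	mon_data/mon_L3_01/mbm_local_bytes
+ *
+ * one file per event on the resource's evt_list.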
+ */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + mkdir_mondata_subdir_allrdtgrp(r, d); } static void domain_remove_cpu(int cpu, struct rdt_resource *r) @@ -435,19 +518,58 @@ cpumask_clear_cpu(cpu, &d->cpu_mask); if (cpumask_empty(&d->cpu_mask)) { - kfree(d->ctrl_val); + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (static_branch_unlikely(&rdt_mon_enable_key)) + rmdir_mondata_subdir_allrdtgrp(r, d->id); list_del(&d->list); + if (is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (is_llc_occupancy_enabled() && has_busy_rmid(r, d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + kfree(d->ctrl_val); + kfree(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); kfree(d); + return; + } + + if (r == &rdt_resources_all[RDT_RESOURCE_L3]) { + if (is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0); + } + if (is_llc_occupancy_enabled() && cpu == d->cqm_work_cpu && + has_busy_rmid(r, d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0); + } } } -static void clear_closid(int cpu) +static void clear_closid_rmid(int cpu) { struct intel_pqr_state *state = this_cpu_ptr(&pqr_state); - per_cpu(cpu_closid, cpu) = 0; - state->closid = 0; - wrmsr(MSR_IA32_PQR_ASSOC, state->rmid, 0); + state->default_closid = 0; + state->default_rmid = 0; + state->cur_closid = 0; + state->cur_rmid = 0; + wrmsr(IA32_PQR_ASSOC, 0, 0); } static int intel_rdt_online_cpu(unsigned int cpu) @@ -459,12 +581,23 @@ domain_add_cpu(cpu, r); /* The cpu is set in default rdtgroup after online. 
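+ *
+ * These online/offline callbacks are registered through the CPU hotplug
+ * state machine; a sketch of the expected wiring (the exact state name
+ * is an assumption):
+ *
+ *	cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/rdt/cat:online",
+ *			  intel_rdt_online_cpu, intel_rdt_offline_cpu);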
*/ cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; } +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) { + break; + } + } +} + static int intel_rdt_offline_cpu(unsigned int cpu) { struct rdtgroup *rdtgrp; @@ -474,10 +607,12 @@ for_each_capable_rdt_resource(r) domain_remove_cpu(cpu, r); list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { - if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); break; + } } - clear_closid(cpu); + clear_closid_rmid(cpu); mutex_unlock(&rdtgroup_mutex); return 0; @@ -492,7 +627,7 @@ struct rdt_resource *r; int cl; - for_each_capable_rdt_resource(r) { + for_each_alloc_capable_rdt_resource(r) { cl = strlen(r->name); if (cl > max_name_width) max_name_width = cl; @@ -502,38 +637,153 @@ } } -static __init bool get_rdt_resources(void) +enum { + RDT_FLAG_CMT, + RDT_FLAG_MBM_TOTAL, + RDT_FLAG_MBM_LOCAL, + RDT_FLAG_L3_CAT, + RDT_FLAG_L3_CDP, + RDT_FLAG_L2_CAT, + RDT_FLAG_MBA, +}; + +#define RDT_OPT(idx, n, f) \ +[idx] = { \ + .name = n, \ + .flag = f \ +} + +struct rdt_options { + char *name; + int flag; + bool force_off, force_on; +}; + +static struct rdt_options rdt_options[] __initdata = { + RDT_OPT(RDT_FLAG_CMT, "cmt", X86_FEATURE_CQM_OCCUP_LLC), + RDT_OPT(RDT_FLAG_MBM_TOTAL, "mbmtotal", X86_FEATURE_CQM_MBM_TOTAL), + RDT_OPT(RDT_FLAG_MBM_LOCAL, "mbmlocal", X86_FEATURE_CQM_MBM_LOCAL), + RDT_OPT(RDT_FLAG_L3_CAT, "l3cat", X86_FEATURE_CAT_L3), + RDT_OPT(RDT_FLAG_L3_CDP, "l3cdp", X86_FEATURE_CDP_L3), + RDT_OPT(RDT_FLAG_L2_CAT, "l2cat", X86_FEATURE_CAT_L2), + RDT_OPT(RDT_FLAG_MBA, "mba", X86_FEATURE_MBA), +}; +#define NUM_RDT_OPTIONS ARRAY_SIZE(rdt_options) + +static int __init set_rdt_options(char *str) +{ + struct rdt_options *o; + bool force_off; + char *tok; + + if (*str == '=') + str++; + while ((tok = strsep(&str, ",")) != NULL) { + force_off = *tok == '!'; + if (force_off) + tok++; + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (strcmp(tok, o->name) == 0) { + if (force_off) + o->force_off = true; + else + o->force_on = true; + break; + } + } + } + return 1; +} +__setup("rdt", set_rdt_options); + +static bool __init rdt_cpu_has(int flag) +{ + bool ret = boot_cpu_has(flag); + struct rdt_options *o; + + if (!ret) + return ret; + + for (o = rdt_options; o < &rdt_options[NUM_RDT_OPTIONS]; o++) { + if (flag == o->flag) { + if (o->force_off) + ret = false; + if (o->force_on) + ret = true; + break; + } + } + return ret; +} + +static __init bool get_rdt_alloc_resources(void) { bool ret = false; - if (cache_alloc_hsw_probe()) + if (rdt_alloc_capable) return true; if (!boot_cpu_has(X86_FEATURE_RDT_A)) return false; - if (boot_cpu_has(X86_FEATURE_CAT_L3)) { - rdt_get_cache_config(1, &rdt_resources_all[RDT_RESOURCE_L3]); - if (boot_cpu_has(X86_FEATURE_CDP_L3)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L3)) { + rdt_get_cache_alloc_cfg(1, &rdt_resources_all[RDT_RESOURCE_L3]); + if (rdt_cpu_has(X86_FEATURE_CDP_L3)) { rdt_get_cdp_l3_config(RDT_RESOURCE_L3DATA); rdt_get_cdp_l3_config(RDT_RESOURCE_L3CODE); } ret = true; } - if (boot_cpu_has(X86_FEATURE_CAT_L2)) { + if (rdt_cpu_has(X86_FEATURE_CAT_L2)) { /* CPUID 0x10.2 fields are same format at 0x10.1 */ - rdt_get_cache_config(2, 
&rdt_resources_all[RDT_RESOURCE_L2]); + rdt_get_cache_alloc_cfg(2, &rdt_resources_all[RDT_RESOURCE_L2]); ret = true; } - if (boot_cpu_has(X86_FEATURE_MBA)) { + if (rdt_cpu_has(X86_FEATURE_MBA)) { if (rdt_get_mem_config(&rdt_resources_all[RDT_RESOURCE_MBA])) ret = true; } - return ret; } +static __init bool get_rdt_mon_resources(void) +{ + if (rdt_cpu_has(X86_FEATURE_CQM_OCCUP_LLC)) + rdt_mon_features |= (1 << QOS_L3_OCCUP_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_TOTAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_TOTAL_EVENT_ID); + if (rdt_cpu_has(X86_FEATURE_CQM_MBM_LOCAL)) + rdt_mon_features |= (1 << QOS_L3_MBM_LOCAL_EVENT_ID); + + if (!rdt_mon_features) + return false; + + return !rdt_get_mon_l3_config(&rdt_resources_all[RDT_RESOURCE_L3]); +} + +static __init void rdt_quirks(void) +{ + switch (boot_cpu_data.x86_model) { + case INTEL_FAM6_HASWELL_X: + if (!rdt_options[RDT_FLAG_L3_CAT].force_off) + cache_alloc_hsw_probe(); + break; + case INTEL_FAM6_SKYLAKE_X: + if (boot_cpu_data.x86_mask <= 4) + set_rdt_options("!cmt,!mbmtotal,!mbmlocal,!l3cat"); + } +} + +static __init bool get_rdt_resources(void) +{ + rdt_quirks(); + rdt_alloc_capable = get_rdt_alloc_resources(); + rdt_mon_capable = get_rdt_mon_resources(); + + return (rdt_mon_capable || rdt_alloc_capable); +} + static int __init intel_rdt_late_init(void) { struct rdt_resource *r; @@ -556,9 +806,12 @@ return ret; } - for_each_capable_rdt_resource(r) + for_each_alloc_capable_rdt_resource(r) pr_info("Intel RDT %s allocation detected\n", r->name); + for_each_mon_capable_rdt_resource(r) + pr_info("Intel RDT %s monitoring detected\n", r->name); + return 0; } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel_rdt.h +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel_rdt.h @@ -0,0 +1,440 @@ +#ifndef _ASM_X86_INTEL_RDT_H +#define _ASM_X86_INTEL_RDT_H + +#include +#include +#include + +#define IA32_L3_QOS_CFG 0xc81 +#define IA32_L3_CBM_BASE 0xc90 +#define IA32_L2_CBM_BASE 0xd10 +#define IA32_MBA_THRTL_BASE 0xd50 + +#define L3_QOS_CDP_ENABLE 0x01ULL + +/* + * Event IDs are used to program IA32_QM_EVTSEL before reading event + * counter from IA32_QM_CTR + */ +#define QOS_L3_OCCUP_EVENT_ID 0x01 +#define QOS_L3_MBM_TOTAL_EVENT_ID 0x02 +#define QOS_L3_MBM_LOCAL_EVENT_ID 0x03 + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +#define MBM_CNTR_WIDTH 24 +#define MBM_OVERFLOW_INTERVAL 1000 + +#define RMID_VAL_ERROR BIT_ULL(63) +#define RMID_VAL_UNAVAIL BIT_ULL(62) + +DECLARE_STATIC_KEY_FALSE(rdt_enable_key); + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + */ +struct mon_evt { + u32 evtid; + char *name; + struct list_head list; +}; + +/** + * struct mon_data_bits - Monitoring details for each event file + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file + * @domid: The domain to which the event file belongs + */ +union mon_data_bits { + void *priv; + struct { + unsigned int rid : 10; + unsigned int evtid : 8; + unsigned int domid : 14; + } u; +}; + +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_domain *d; + int evtid; + bool first; + u64 val; +}; + +extern unsigned int intel_cqm_threshold; +extern bool rdt_alloc_capable; +extern bool rdt_mon_capable; +extern unsigned int rdt_mon_features; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. 
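+ *
+ * A mongroup always hangs off a parent control group, e.g. with an
+ * illustrative layout:
+ *
+ *	/sys/fs/resctrl/grp1			ctrl_mon group: own closid + rmid
+ *	/sys/fs/resctrl/grp1/mon_groups/m1	mon group: parent's closid,
+ *						own rmid
+ *
+ * Tasks moved into "m1" keep grp1's allocation but are counted
+ * separately for monitoring.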
+ * @mon_data_kn: kernfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. + * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) +#define RFTYPE_BASE BIT(1) +#define RF_CTRLSHIFT 4 +#define RF_MONSHIFT 5 +#define RFTYPE_CTRL BIT(RF_CTRLSHIFT) +#define RFTYPE_MON BIT(RF_MONSHIFT) +#define RFTYPE_RES_CACHE BIT(8) +#define RFTYPE_RES_MB BIT(9) +#define RF_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) +#define RF_MON_INFO (RFTYPE_INFO | RFTYPE_MON) +#define RF_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width, max_data_width; + +int __init rdtgroup_init(void); + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RF_* or RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len.
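+ *
+ * As a usage example, a read-only info file that should only appear for
+ * alloc-capable cache resources is declared in res_common_files[] as:
+ *
+ *	{
+ *		.name		= "cbm_mask",
+ *		.mode		= 0444,
+ *		.kf_ops		= &rdtgroup_kf_single_ops,
+ *		.seq_show	= rdt_default_ctrl_show,
+ *		.fflags		= RF_CTRL_INFO | RFTYPE_RES_CACHE,
+ *	},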
+ */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @chunks: Total data moved (multiply by rdt_group.mon_scale to get bytes) + * @prev_msr Value of IA32_QM_CTR for this RMID last time we read it + */ +struct mbm_state { + u64 chunks; + u64 prev_msr; +}; + +/** + * struct rdt_domain - group of cpus sharing an RDT resource + * @list: all instances of this resource + * @id: unique id for this instance + * @cpu_mask: which cpus share this resource + * @rmid_busy_llc: + * bitmap of which limbo RMIDs are above threshold + * @mbm_total: saved state for MBM total bandwidth + * @mbm_local: saved state for MBM local bandwidth + * @mbm_over: worker to periodically read MBM h/w counters + * @cqm_limbo: worker to periodically read CQM h/w counters + * @mbm_work_cpu: + * worker cpu for MBM h/w counters + * @cqm_work_cpu: + * worker cpu for CQM h/w counters + * @ctrl_val: array of cache or mem ctrl values (indexed by CLOSID) + * @new_ctrl: new ctrl value to be loaded + * @have_new_ctrl: did user provide new_ctrl for this domain + */ +struct rdt_domain { + struct list_head list; + int id; + struct cpumask cpu_mask; + unsigned long *rmid_busy_llc; + struct mbm_state *mbm_total; + struct mbm_state *mbm_local; + struct delayed_work mbm_over; + struct delayed_work cqm_limbo; + int mbm_work_cpu; + int cqm_work_cpu; + u32 *ctrl_val; + u32 new_ctrl; + bool have_new_ctrl; +}; + +/** + * struct msr_param - set a range of MSRs from a domain + * @res: The resource to use + * @low: Beginning index from base MSR + * @high: End index + */ +struct msr_param { + struct rdt_resource *res; + int low; + int high; +}; + +/** + * struct rdt_cache - Cache allocation related data + * @cbm_len: Length of the cache bit mask + * @min_cbm_bits: Minimum number of consecutive bits to be set + * @cbm_idx_mult: Multiplier of CBM index + * @cbm_idx_offset: Offset of CBM index. CBM index is computed by: + * closid * cbm_idx_multi + cbm_idx_offset + * in a cache bit mask + * @shareable_bits: Bitmask of shareable resource with other + * executing entities + */ +struct rdt_cache { + unsigned int cbm_len; + unsigned int min_cbm_bits; + unsigned int cbm_idx_mult; + unsigned int cbm_idx_offset; + unsigned int shareable_bits; +}; + +/** + * struct rdt_membw - Memory bandwidth allocation related data + * @max_delay: Max throttle delay. Delay is the hardware + * representation for memory bandwidth. 
+ * @min_bw: Minimum memory bandwidth percentage user can request + * @bw_gran: Granularity at which the memory bandwidth is allocated + * @delay_linear: True if memory B/W delay is in linear scale + * @mb_map: Mapping of memory B/W percentage to memory B/W delay + */ +struct rdt_membw { + u32 max_delay; + u32 min_bw; + u32 bw_gran; + u32 delay_linear; + u32 *mb_map; +}; + +static inline bool is_llc_occupancy_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_OCCUP_EVENT_ID)); +} + +static inline bool is_mbm_total_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_TOTAL_EVENT_ID)); +} + +static inline bool is_mbm_local_enabled(void) +{ + return (rdt_mon_features & (1 << QOS_L3_MBM_LOCAL_EVENT_ID)); +} + +static inline bool is_mbm_enabled(void) +{ + return (is_mbm_total_enabled() || is_mbm_local_enabled()); +} + +static inline bool is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/** + * struct rdt_resource - attributes of an RDT resource + * @rid: The index of the resource + * @alloc_enabled: Is allocation enabled on this machine + * @mon_enabled: Is monitoring enabled for this feature + * @alloc_capable: Is allocation available on this machine + * @mon_capable: Is monitor feature available on this machine + * @name: Name to use in "schemata" file + * @num_closid: Number of CLOSIDs available + * @cache_level: Which cache level defines scope of this resource + * @default_ctrl: Specifies default cache cbm or memory B/W percent. + * @msr_base: Base MSR address for CBMs + * @msr_update: Function pointer to update QOS MSRs + * @data_width: Character width of data when displaying + * @domains: All domains for this resource + * @cache: Cache allocation related data + * @format_str: Per resource format string to show domain value + * @parse_ctrlval: Per resource function pointer to parse control values + * @evt_list: List of monitoring events + * @num_rmid: Number of RMIDs available + * @mon_scale: cqm counter * mon_scale = occupancy in bytes + * @fflags: flags to choose base and info files + */ +struct rdt_resource { + int rid; + bool alloc_enabled; + bool mon_enabled; + bool alloc_capable; + bool mon_capable; + char *name; + int num_closid; + int cache_level; + u32 default_ctrl; + unsigned int msr_base; + void (*msr_update) (struct rdt_domain *d, struct msr_param *m, + struct rdt_resource *r); + int data_width; + struct list_head domains; + struct rdt_cache cache; + struct rdt_membw membw; + const char *format_str; + int (*parse_ctrlval) (char *buf, struct rdt_resource *r, + struct rdt_domain *d); + struct list_head evt_list; + int num_rmid; + unsigned int mon_scale; + unsigned long fflags; +}; + +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d); +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d); + +extern struct mutex rdtgroup_mutex; + +extern struct rdt_resource rdt_resources_all[]; +extern struct rdtgroup rdtgroup_default; +DECLARE_STATIC_KEY_FALSE(rdt_alloc_enable_key); + +int __init rdtgroup_init(void); + +enum { + RDT_RESOURCE_L3, + RDT_RESOURCE_L3DATA, + RDT_RESOURCE_L3CODE, + RDT_RESOURCE_L2, + RDT_RESOURCE_MBA, + + /* Must be the last */ + RDT_NUM_RESOURCES, +}; + +#define for_each_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_capable || r->mon_capable) + +#define for_each_alloc_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if 
(r->alloc_capable) + +#define for_each_mon_capable_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_capable) + +#define for_each_alloc_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->alloc_enabled) + +#define for_each_mon_enabled_rdt_resource(r) \ + for (r = rdt_resources_all; r < rdt_resources_all + RDT_NUM_RESOURCES;\ + r++) \ + if (r->mon_enabled) + +/* CPUID.(EAX=10H, ECX=ResID=1).EAX */ +union cpuid_0x10_1_eax { + struct { + unsigned int cbm_len:5; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID=3).EAX */ +union cpuid_0x10_3_eax { + struct { + unsigned int max_delay:12; + } split; + unsigned int full; +}; + +/* CPUID.(EAX=10H, ECX=ResID).EDX */ +union cpuid_0x10_x_edx { + struct { + unsigned int cos_max:16; + } split; + unsigned int full; +}; + +void rdt_ctrl_update(void *arg); +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); +void rdtgroup_kn_unlock(struct kernfs_node *kn); +struct rdt_domain *rdt_find_domain(struct rdt_resource *r, int id, + struct list_head **pos); +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); +struct rdt_domain *get_domain_from_cpu(int cpu, struct rdt_resource *r); +int alloc_rmid(void); +void free_rmid(u32 rmid); +int rdt_get_mon_l3_config(struct rdt_resource *r); +void mon_event_count(void *info); +int rdtgroup_mondata_show(struct seq_file *m, void *arg); +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + unsigned int dom_id); +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d); +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first); +void mbm_setup_overflow_handler(struct rdt_domain *dom, + unsigned long delay_ms); +void mbm_handle_overflow(struct work_struct *work); +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms); +void cqm_handle_limbo(struct work_struct *work); +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d); +void __check_limbo(struct rdt_domain *d, bool force_free); + +#endif /* _ASM_X86_INTEL_RDT_H */ --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel_rdt_ctrlmondata.c @@ -0,0 +1,341 @@ +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu + * Tony Luck + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include "intel_rdt.h" + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. 
The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long bw; + int ret; + + /* + * Only linear delay values are supported for current Intel SKUs. + */ + if (!r->membw.delay_linear) + return false; + + ret = kstrtoul(buf, 10, &bw); + if (ret) + return false; + + if (bw < r->membw.min_bw || bw > r->default_ctrl) + return false; + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +int parse_bw(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!bw_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. The SDM says: + * Please note that all (and only) contiguous '1' combinations + * are allowed (e.g. FFFFH, 0FF0H, 003CH, etc.). + * Additionally Haswell requires at least two bits set. + */ +static bool cbm_validate(char *buf, unsigned long *data, struct rdt_resource *r) +{ + unsigned long first_bit, zero_bit, val; + unsigned int cbm_len = r->cache.cbm_len; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) + return false; + + if (val == 0 || val > r->default_ctrl) + return false; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + if (find_next_bit(&val, cbm_len, zero_bit) < cbm_len) + return false; + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) + return false; + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +int parse_cbm(char *buf, struct rdt_resource *r, struct rdt_domain *d) +{ + unsigned long data; + + if (d->have_new_ctrl) + return -EINVAL; + + if (!cbm_validate(buf, &data, r)) + return -EINVAL; + d->new_ctrl = data; + d->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. + */ +static int parse_line(char *line, struct rdt_resource *r) +{ + char *dom = NULL, *id; + struct rdt_domain *d; + unsigned long dom_id; + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) + return -EINVAL; + dom = strim(dom); + list_for_each_entry(d, &r->domains, list) { + if (d->id == dom_id) { + if (r->parse_ctrlval(dom, r, d)) + return -EINVAL; + goto next; + } + } + return -EINVAL; +} + +static int update_domains(struct rdt_resource *r, int closid) +{ + struct msr_param msr_param; + cpumask_var_t cpu_mask; + struct rdt_domain *d; + int cpu; + + if (!zalloc_cpumask_var(&cpu_mask, GFP_KERNEL)) + return -ENOMEM; + + msr_param.low = closid; + msr_param.high = msr_param.low + 1; + msr_param.res = r; + + list_for_each_entry(d, &r->domains, list) { + if (d->have_new_ctrl && d->new_ctrl != d->ctrl_val[closid]) { + cpumask_set_cpu(cpumask_any(&d->cpu_mask), cpu_mask); + d->ctrl_val[closid] = d->new_ctrl; + } + } + if (cpumask_empty(cpu_mask)) + goto done; + cpu = get_cpu(); + /* Update CBM on this cpu if it's in cpu_mask. */ + if (cpumask_test_cpu(cpu, cpu_mask)) + rdt_ctrl_update(&msr_param); + /* Update CBM on other cpus.
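+ *
+ * Only one CPU per affected domain was put into cpu_mask above, since
+ * the control MSRs are shared by all CPUs of a domain. A schemata line
+ * such as
+ *
+ *	L3:0=fffff;1=0ff00
+ *
+ * therefore costs at most one IPI per L3 cache domain it touches.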
*/ + smp_call_function_many(cpu_mask, rdt_ctrl_update, &msr_param, 1); + put_cpu(); + +done: + free_cpumask_var(cpu_mask); + + return 0; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, int closid) +{ + struct rdt_resource *r; + + for_each_alloc_enabled_rdt_resource(r) { + if (!strcmp(resname, r->name) && closid < r->num_closid) + return parse_line(tok, r); + } + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + struct rdt_domain *dom; + struct rdt_resource *r; + char *tok, *resname; + int closid, ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + closid = rdtgrp->closid; + + for_each_alloc_enabled_rdt_resource(r) { + list_for_each_entry(dom, &r->domains, list) + dom->have_new_ctrl = false; + } + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, closid); + if (ret) + goto out; + } + + for_each_alloc_enabled_rdt_resource(r) { + ret = update_domains(r, closid); + if (ret) + goto out; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct rdt_resource *r, int closid) +{ + struct rdt_domain *dom; + bool sep = false; + + seq_printf(s, "%*s:", max_name_width, r->name); + list_for_each_entry(dom, &r->domains, list) { + if (sep) + seq_puts(s, ";"); + seq_printf(s, r->format_str, dom->id, max_data_width, + dom->ctrl_val[closid]); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + int ret = 0; + u32 closid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + closid = rdtgrp->closid; + for_each_alloc_enabled_rdt_resource(r) { + if (closid < r->num_closid) + show_doms(s, r, closid); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} + +void mon_event_read(struct rmid_read *rr, struct rdt_domain *d, + struct rdtgroup *rdtgrp, int evtid, int first) +{ + /* + * setup the parameters to send to the IPI to read the data. 
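+ *
+ * The counter value comes back in rr->val; the mon_data show path below
+ * then scales it into bytes, roughly:
+ *
+ *	mon_event_read(&rr, d, rdtgrp, evtid, false);
+ *	if (!(rr.val & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)))
+ *		seq_printf(m, "%llu\n", rr.val * r->mon_scale);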
+ */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->d = d; + rr->val = 0; + rr->first = first; + + smp_call_function_any(&d->cpu_mask, mon_event_count, rr, 1); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + u32 resid, evtid, domid; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + union mon_data_bits md; + struct rdt_domain *d; + struct rmid_read rr; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + md.priv = of->kn->priv; + resid = md.u.rid; + domid = md.u.domid; + evtid = md.u.evtid; + + r = &rdt_resources_all[resid]; + d = rdt_find_domain(r, domid, NULL); + if (!d) { + ret = -ENOENT; + goto out; + } + + mon_event_read(&rr, d, rdtgrp, evtid, false); + + if (rr.val & RMID_VAL_ERROR) + seq_puts(m, "Error\n"); + else if (rr.val & RMID_VAL_UNAVAIL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val * r->mon_scale); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel_rdt_monitor.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel_rdt_monitor.c @@ -0,0 +1,499 @@ +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and data structures originally from Peter Zijlstra and Matt Fleming. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * More information about RDT can be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#include +#include +#include +#include "intel_rdt.h" + +#define MSR_IA32_QM_CTR 0x0c8e +#define MSR_IA32_QM_EVTSEL 0x0c8d + +struct rmid_entry { + u32 rmid; + int busy; + struct list_head list; +}; + +/** + * @rmid_free_lru A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/** + * @rmid_limbo_count count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have an occupancy value > intel_cqm_threshold. User can change + * the threshold occupancy value. + */ +unsigned int rmid_limbo_count; + +/** + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * Global boolean for rdt_monitor which is true if any + * resource monitoring is enabled. + */ +bool rdt_mon_capable; + +/* + * Global to indicate which monitoring events are enabled. + */ +unsigned int rdt_mon_features; + +/* + * This is the threshold cache occupancy at which we will consider an + * RMID available for re-allocation.
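+ *
+ * Worked example, using the numbers from rdt_get_mon_l3_config() below:
+ * for a 35MB LLC and 56 RMIDs,
+ *
+ *	35840 KB * 1024 / 56 = 655360 bytes per RMID
+ *
+ * i.e. ~1.8% of the LLC; this byte count is then divided by mon_scale
+ * to convert it into hardware counter units.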
+ */ +unsigned int intel_cqm_threshold; + +static inline struct rmid_entry *__rmid_entry(u32 rmid) +{ + struct rmid_entry *entry; + + entry = &rmid_ptrs[rmid]; + WARN_ON(entry->rmid != rmid); + + return entry; +} + +static u64 __rmid_read(u32 rmid, u32 eventid) +{ + u64 val; + + /* + * As per the SDM, when IA32_QM_EVTSEL.EvtID (bits 7:0) is configured + * with a valid event code for supported resource type and the bits + * IA32_QM_EVTSEL.RMID (bits 41:32) are configured with valid RMID, + * IA32_QM_CTR.data (bits 61:0) reports the monitored data. + * IA32_QM_CTR.Error (bit 63) and IA32_QM_CTR.Unavailable (bit 62) + * are error bits. + */ + wrmsr(MSR_IA32_QM_EVTSEL, eventid, rmid); + rdmsrl(MSR_IA32_QM_CTR, val); + + return val; +} + +static bool rmid_dirty(struct rmid_entry *entry) +{ + u64 val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + + return val >= intel_cqm_threshold; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold, clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID. + */ +void __check_limbo(struct rdt_domain *d, bool force_free) +{ + struct rmid_entry *entry; + struct rdt_resource *r; + u32 crmid = 1, nrmid; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + nrmid = find_next_bit(d->rmid_busy_llc, r->num_rmid, crmid); + if (nrmid >= r->num_rmid) + break; + + entry = __rmid_entry(nrmid); + if (force_free || !rmid_dirty(entry)) { + clear_bit(entry->rmid, d->rmid_busy_llc); + if (!--entry->busy) { + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + } + } + crmid = nrmid + 1; + } +} + +bool has_busy_rmid(struct rdt_resource *r, struct rdt_domain *d) +{ + return find_first_bit(d->rmid_busy_llc, r->num_rmid) != r->num_rmid; +} + +/* + * As of now the RMIDs allocation is global. + * However we keep track of the packages on which the RMIDs + * are used, to optimize the limbo list management. + */ +int alloc_rmid(void) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? -EBUSY : -ENOSPC; + + entry = list_first_entry(&rmid_free_lru, + struct rmid_entry, list); + list_del(&entry->list); + + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r; + struct rdt_domain *d; + int cpu; + u64 val; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + entry->busy = 0; + cpu = get_cpu(); + list_for_each_entry(d, &r->domains, list) { + if (cpumask_test_cpu(cpu, &d->cpu_mask)) { + val = __rmid_read(entry->rmid, QOS_L3_OCCUP_EVENT_ID); + if (val <= intel_cqm_threshold) + continue; + } + + /* + * For the first limbo RMID in the domain, + * set up the limbo worker.
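+ *
+ * Summarising the lifecycle implemented in this file:
+ *
+ *	alloc_rmid() --> in use --> free_rmid()
+ *	    occupancy <= threshold on all domains: back on rmid_free_lru
+ *	    occupancy >  threshold on some domain: bit set in that domain's
+ *	        rmid_busy_llc; cqm_handle_limbo() re-reads the occupancy
+ *	        every CQM_LIMBOCHECK_INTERVAL ms until the RMID is clean
+ *	        everywhere, then returns it to rmid_free_lru.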
+ */ + if (!has_busy_rmid(r, d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL); + set_bit(entry->rmid, d->rmid_busy_llc); + entry->busy++; + } + put_cpu(); + + if (entry->busy) + rmid_limbo_count++; + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +void free_rmid(u32 rmid) +{ + struct rmid_entry *entry; + + if (!rmid) + return; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = __rmid_entry(rmid); + + if (is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static int __mon_event_count(u32 rmid, struct rmid_read *rr) +{ + u64 chunks, shift, tval; + struct mbm_state *m; + + tval = __rmid_read(rmid, rr->evtid); + if (tval & (RMID_VAL_ERROR | RMID_VAL_UNAVAIL)) { + rr->val = tval; + return -EINVAL; + } + switch (rr->evtid) { + case QOS_L3_OCCUP_EVENT_ID: + rr->val += tval; + return 0; + case QOS_L3_MBM_TOTAL_EVENT_ID: + m = &rr->d->mbm_total[rmid]; + break; + case QOS_L3_MBM_LOCAL_EVENT_ID: + m = &rr->d->mbm_local[rmid]; + break; + default: + /* + * Code would never reach here because + * an invalid event id would fail the __rmid_read. + */ + return -EINVAL; + } + + if (rr->first) { + m->prev_msr = tval; + m->chunks = 0; + return 0; + } + + shift = 64 - MBM_CNTR_WIDTH; + chunks = (tval << shift) - (m->prev_msr << shift); + chunks >>= shift; + m->chunks += chunks; + m->prev_msr = tval; + + rr->val += m->chunks; + return 0; +} + +/* + * This is called via IPI to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + + rdtgrp = rr->rgrp; + + if (__mon_event_count(rdtgrp->mon.rmid, rr)) + return; + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->mon.rmid, rr)) + return; + } + } +} + +static void mbm_update(struct rdt_domain *d, int rmid) +{ + struct rmid_read rr; + + rr.first = false; + rr.d = d; + + /* + * This is protected from concurrent reads from user + * as both the user and we hold the global mutex. + */ + if (is_mbm_total_enabled()) { + rr.evtid = QOS_L3_MBM_TOTAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } + if (is_mbm_local_enabled()) { + rr.evtid = QOS_L3_MBM_LOCAL_EVENT_ID; + __mon_event_count(rmid, &rr); + } +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to the free list whose occupancy < threshold_occupancy.
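+ *
+ * A note on the MBM counters read by the overflow worker (see
+ * __mon_event_count() above): the hardware counter is only
+ * MBM_CNTR_WIDTH (24) bits wide, so the delta of two reads is computed
+ * modulo 2^24 by shifting both values up to bit 63 first:
+ *
+ *	shift = 64 - MBM_CNTR_WIDTH;
+ *	chunks = (tval << shift) - (m->prev_msr << shift);
+ *	chunks >>= shift;
+ *
+ * e.g. prev_msr = 0xfffffe and tval = 0x000004 yields 6 chunks rather
+ * than a huge bogus delta, so a single counter wrap between reads is
+ * absorbed.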
+ */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + int cpu = smp_processor_id(); + struct rdt_resource *r; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + d = get_domain_from_cpu(cpu, r); + + if (!d) { + pr_warn_once("Failure to get domain for limbo worker\n"); + goto out_unlock; + } + + __check_limbo(d, false); + + if (has_busy_rmid(r, d)) + schedule_delayed_work_on(cpu, &d->cqm_limbo, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void cqm_setup_limbo_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + struct rdt_resource *r; + int cpu; + + r = &rdt_resources_all[RDT_RESOURCE_L3]; + + cpu = cpumask_any(&dom->cpu_mask); + dom->cqm_work_cpu = cpu; + + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + int cpu = smp_processor_id(); + struct list_head *head; + struct rdt_domain *d; + + mutex_lock(&rdtgroup_mutex); + + if (!static_branch_likely(&rdt_enable_key)) + goto out_unlock; + + d = get_domain_from_cpu(cpu, &rdt_resources_all[RDT_RESOURCE_L3]); + if (!d) + goto out_unlock; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(d, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(d, crgrp->mon.rmid); + } + + schedule_delayed_work_on(cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +void mbm_setup_overflow_handler(struct rdt_domain *dom, unsigned long delay_ms) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + if (!static_branch_likely(&rdt_enable_key)) + return; + cpu = cpumask_any(&dom->cpu_mask); + dom->mbm_work_cpu = cpu; + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + struct rmid_entry *entry = NULL; + int i, nr_rmids; + + nr_rmids = r->num_rmid; + rmid_ptrs = kcalloc(nr_rmids, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) + return -ENOMEM; + + for (i = 0; i < nr_rmids; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + entry->rmid = i; + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RMID 0 is special and is always allocated. It's used for all + * tasks that are not monitored. + */ + entry = __rmid_entry(0); + list_del(&entry->list); + + return 0; +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. 
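+ *
+ * With all three events available the list built here is what userspace
+ * later reads back from info/L3_MON/mon_features:
+ *
+ *	llc_occupancy
+ *	mbm_total_bytes
+ *	mbm_local_bytes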
+ */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +int rdt_get_mon_l3_config(struct rdt_resource *r) +{ + int ret; + + r->mon_scale = boot_cpu_data.x86_cache_occ_scale; + r->num_rmid = boot_cpu_data.x86_cache_max_rmid + 1; + + /* + * A reasonable upper limit on the max threshold is the number + * of lines tagged per RMID if all RMIDs have the same number of + * lines tagged in the LLC. + * + * For a 35MB LLC and 56 RMIDs, this is ~1.8% of the LLC. + */ + intel_cqm_threshold = boot_cpu_data.x86_cache_size * 1024 / r->num_rmid; + + /* h/w works in units of "boot_cpu_data.x86_cache_occ_scale" */ + intel_cqm_threshold /= r->mon_scale; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + r->mon_capable = true; + r->mon_enabled = true; + + return 0; +} --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -32,17 +32,25 @@ #include -#include -#include +#include +#include "intel_rdt.h" DEFINE_STATIC_KEY_FALSE(rdt_enable_key); -struct kernfs_root *rdt_root; +DEFINE_STATIC_KEY_FALSE(rdt_mon_enable_key); +DEFINE_STATIC_KEY_FALSE(rdt_alloc_enable_key); +static struct kernfs_root *rdt_root; struct rdtgroup rdtgroup_default; LIST_HEAD(rdt_all_groups); /* Kernel fs node for "info" directory under root */ static struct kernfs_node *kn_info; +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + /* * Trivial allocator for CLOSIDs. Since h/w only supports a small number, * we can keep a bitmap of free CLOSIDs in a single integer. @@ -66,7 +74,7 @@ int rdt_min_closid = 32; /* Compute rdt_min_closid across all resources */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) rdt_min_closid = min(rdt_min_closid, r->num_closid); closid_free_map = BIT_MASK(rdt_min_closid) - 1; @@ -75,9 +83,9 @@ closid_free_map &= ~1; } -int closid_alloc(void) +static int closid_alloc(void) { - int closid = ffs(closid_free_map); + u32 closid = ffs(closid_free_map); if (closid == 0) return -ENOSPC; @@ -125,28 +133,6 @@ return 0; } -static int rdtgroup_add_files(struct kernfs_node *kn, struct rftype *rfts, - int len) -{ - struct rftype *rft; - int ret; - - lockdep_assert_held(&rdtgroup_mutex); - - for (rft = rfts; rft < rfts + len; rft++) { - ret = rdtgroup_add_file(kn, rft); - if (ret) - goto error; - } - - return 0; -error: - pr_warn("Failed to add %s, err=%d\n", rft->name, ret); - while (--rft >= rfts) - kernfs_remove_by_name(kn, rft->name); - return ret; -} - static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) { struct kernfs_open_file *of = m->private; @@ -174,6 +160,11 @@ .seq_show = rdtgroup_seqfile_show, }; +static struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + static bool is_cpu_list(struct kernfs_open_file *of) { struct rftype *rft = of->kn->priv; @@ -203,13 +194,18 @@ /* * This is safe against intel_rdt_sched_in() called from __switch_to() * because __switch_to() is executed with interrupts disabled. 
A local call - * from rdt_update_closid() is proteced against __switch_to() because + * from update_closid_rmid() is protected against __switch_to() because * preemption is disabled. */ -static void rdt_update_cpu_closid(void *closid) +static void update_cpu_closid_rmid(void *info) { - if (closid) - this_cpu_write(cpu_closid, *(int *)closid); + struct rdtgroup *r = info; + + if (r) { + this_cpu_write(pqr_state.default_closid, r->closid); + this_cpu_write(pqr_state.default_rmid, r->mon.rmid); + } + /* * We cannot unconditionally write the MSR because the current * executing task might have its own closid selected. Just reuse * the context switch code. @@ -221,28 +217,128 @@ /* * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, * - * Per task closids must have been set up before calling this function. - * - * The per cpu closids are updated with the smp function call, when @closid - * is not NULL. If @closid is NULL then all affected percpu closids must - * have been set up before calling this function. + * Per task closids/rmids must have been set up before calling this function. */ static void -rdt_update_closid(const struct cpumask *cpu_mask, int *closid) +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) { int cpu = get_cpu(); if (cpumask_test_cpu(cpu, cpu_mask)) - rdt_update_cpu_closid(closid); - smp_call_function_many(cpu_mask, rdt_update_cpu_closid, closid, 1); + update_cpu_closid_rmid(r); + smp_call_function_many(cpu_mask, update_cpu_closid_rmid, r, 1); put_cpu(); } +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (cpumask_weight(tmpmask)) + return -EINVAL; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well */ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (cpumask_weight(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) + return -EINVAL; + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask,
&rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (cpumask_weight(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (cpumask_weight(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. + */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, char *buf, size_t nbytes, loff_t off) { - cpumask_var_t tmpmask, newmask; - struct rdtgroup *rdtgrp, *r; + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; int ret; if (!buf) @@ -254,6 +350,11 @@ free_cpumask_var(tmpmask); return -ENOMEM; } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } rdtgrp = rdtgroup_kn_lock_live(of->kn); if (!rdtgrp) { @@ -276,41 +377,18 @@ goto unlock; } - /* Check whether cpus are dropped from this group */ - cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); - if (cpumask_weight(tmpmask)) { - /* Can't drop from default group */ - if (rdtgrp == &rdtgroup_default) { - ret = -EINVAL; - goto unlock; - } - /* Give any dropped cpus to rdtgroup_default */ - cpumask_or(&rdtgroup_default.cpu_mask, - &rdtgroup_default.cpu_mask, tmpmask); - rdt_update_closid(tmpmask, &rdtgroup_default.closid); - } - - /* - * If we added cpus, remove them from previous group that owned them - * and update per-cpu closid - */ - cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); - if (cpumask_weight(tmpmask)) { - list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { - if (r == rdtgrp) - continue; - cpumask_andnot(&r->cpu_mask, &r->cpu_mask, tmpmask); - } - rdt_update_closid(tmpmask, &rdtgrp->closid); - } - - /* Done pushing/pulling - update this group with new mask */ - cpumask_copy(&rdtgrp->cpu_mask, newmask); + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; unlock: rdtgroup_kn_unlock(of->kn); free_cpumask_var(tmpmask); free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); return ret ?: nbytes; } @@ -336,6 +414,7 @@ if (atomic_dec_and_test(&rdtgrp->waitcount) && (rdtgrp->flags & RDT_DELETED)) { current->closid = 0; + current->rmid = 0; kfree(rdtgrp); } @@ -374,7 +453,20 @@ atomic_dec(&rdtgrp->waitcount); kfree(callback); } else { - tsk->closid = rdtgrp->closid; + /* + * For ctrl_mon groups move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. 
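+ *
+ * For example (illustrative paths), with task 1234 currently in grp1:
+ *
+ *	echo 1234 > /sys/fs/resctrl/grp1/mon_groups/m1/tasks	succeeds
+ *	echo 1234 > /sys/fs/resctrl/grp2/mon_groups/m2/tasks	fails with
+ *								-EINVAL
+ *
+ * because the task's closid belongs to grp1, not grp2.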
+ */ + if (rdtgrp->type == RDTCTRL_GROUP) { + tsk->closid = rdtgrp->closid; + tsk->rmid = rdtgrp->mon.rmid; + } else if (rdtgrp->type == RDTMON_GROUP) { + if (rdtgrp->mon.parent->closid == tsk->closid) + tsk->rmid = rdtgrp->mon.rmid; + else + ret = -EINVAL; + } } return ret; } @@ -454,7 +546,8 @@ rcu_read_lock(); for_each_process_thread(p, t) { - if (t->closid == r->closid) + if ((r->type == RDTCTRL_GROUP && t->closid == r->closid) || + (r->type == RDTMON_GROUP && t->rmid == r->mon.rmid)) seq_printf(s, "%d\n", t->pid); } rcu_read_unlock(); @@ -476,39 +569,6 @@ return ret; } -/* Files in each rdtgroup */ -static struct rftype rdtgroup_base_files[] = { - { - .name = "cpus", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - }, - { - .name = "cpus_list", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_cpus_write, - .seq_show = rdtgroup_cpus_show, - .flags = RFTYPE_FLAGS_CPUS_LIST, - }, - { - .name = "tasks", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_tasks_write, - .seq_show = rdtgroup_tasks_show, - }, - { - .name = "schemata", - .mode = 0644, - .kf_ops = &rdtgroup_kf_single_ops, - .write = rdtgroup_schemata_write, - .seq_show = rdtgroup_schemata_show, - }, -}; - static int rdt_num_closids_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -536,6 +596,15 @@ return 0; } +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + static int rdt_min_bw_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -545,6 +614,28 @@ return 0; } +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) + seq_printf(seq, "%s\n", mevt->name); + + return 0; +} + static int rdt_bw_gran_show(struct kernfs_open_file *of, struct seq_file *seq, void *v) { @@ -563,100 +654,220 @@ return 0; } +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = of->kn->parent->priv; + + seq_printf(seq, "%u\n", intel_cqm_threshold * r->mon_scale); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdt_resource *r = of->kn->parent->priv; + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > (boot_cpu_data.x86_cache_size * 1024)) + return -EINVAL; + + intel_cqm_threshold = bytes / r->mon_scale; + + return nbytes; +} + /* rdtgroup information files for one cache resource. 
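+ *
+ * Each directory populates itself from the common table below by flag
+ * match: an entry is added only when
+ *
+ *	(fflags & rft->fflags) == rft->fflags
+ *
+ * in rdtgroup_add_files(). For example the info/L3 directory passes
+ * RF_CTRL_INFO | RFTYPE_RES_CACHE and so picks up "num_closids",
+ * "cbm_mask", "min_cbm_bits" and "shareable_bits".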
*/ -static struct rftype res_cache_info_files[] = { +static struct rftype res_common_files[] = { { .name = "num_closids", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_num_closids_show, + .fflags = RF_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RF_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RF_MON_INFO, }, { .name = "cbm_mask", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_default_ctrl_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_cbm_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_cbm_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, -}; - -/* rdtgroup information files for memory bandwidth. */ -static struct rftype res_mba_info_files[] = { { - .name = "num_closids", + .name = "shareable_bits", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, - .seq_show = rdt_num_closids_show, + .seq_show = rdt_shareable_bits_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_CACHE, }, { .name = "min_bandwidth", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_min_bw_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "bandwidth_gran", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_bw_gran_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, }, { .name = "delay_linear", .mode = 0444, .kf_ops = &rdtgroup_kf_single_ops, .seq_show = rdt_delay_linear_show, + .fflags = RF_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RF_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RF_CTRL_BASE, }, }; -void rdt_get_mba_infofile(struct rdt_resource *r) +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) { - r->info_files = res_mba_info_files; - r->nr_info_files = ARRAY_SIZE(res_mba_info_files); -} + struct rftype *rfts, *rft; + int ret, len; -void rdt_get_cache_infofile(struct rdt_resource *r) -{ - r->info_files = res_cache_info_files; - r->nr_info_files = ARRAY_SIZE(res_cache_info_files); + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + for (rft = rfts; rft < rfts + len; rft++) { + if ((fflags & rft->fflags) == rft->fflags) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; } -static int rdtgroup_create_info_dir(struct kernfs_node 
*parent_kn) +static int rdtgroup_mkdir_info_resdir(struct rdt_resource *r, char *name, + unsigned long fflags) { struct kernfs_node *kn_subdir; - struct rftype *res_info_files; - struct rdt_resource *r; - int ret, len; + int ret; - /* create the directory */ - kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); - if (IS_ERR(kn_info)) - return PTR_ERR(kn_info); + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, r); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + kernfs_get(kn_subdir); + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; +} + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct rdt_resource *r; + unsigned long fflags; + char name[32]; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); kernfs_get(kn_info); - for_each_enabled_rdt_resource(r) { - kn_subdir = kernfs_create_dir(kn_info, r->name, - kn_info->mode, r); - if (IS_ERR(kn_subdir)) { - ret = PTR_ERR(kn_subdir); - goto out_destroy; - } - kernfs_get(kn_subdir); - ret = rdtgroup_kn_set_ugid(kn_subdir); + for_each_alloc_enabled_rdt_resource(r) { + fflags = r->fflags | RF_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(r, r->name, fflags); if (ret) goto out_destroy; + } - res_info_files = r->info_files; - len = r->nr_info_files; - - ret = rdtgroup_add_files(kn_subdir, res_info_files, len); + for_each_mon_enabled_rdt_resource(r) { + fflags = r->fflags | RF_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); if (ret) goto out_destroy; - kernfs_activate(kn_subdir); } /* @@ -678,6 +889,39 @@ return ret; } +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that @rdtgrp->kn is always accessible. 
+ */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} static void l3_qos_cfg_update(void *arg) { bool *enable = arg; @@ -718,14 +962,15 @@ struct rdt_resource *r_l3 = &rdt_resources_all[RDT_RESOURCE_L3]; int ret; - if (!r_l3->capable || !r_l3data->capable || !r_l3code->capable) + if (!r_l3->alloc_capable || !r_l3data->alloc_capable || + !r_l3code->alloc_capable) return -EINVAL; ret = set_l3_qos_cfg(r_l3, true); if (!ret) { - r_l3->enabled = false; - r_l3data->enabled = true; - r_l3code->enabled = true; + r_l3->alloc_enabled = false; + r_l3data->alloc_enabled = true; + r_l3code->alloc_enabled = true; } return ret; } @@ -734,11 +979,11 @@ { struct rdt_resource *r = &rdt_resources_all[RDT_RESOURCE_L3]; - r->enabled = r->capable; + r->alloc_enabled = r->alloc_capable; - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) { - rdt_resources_all[RDT_RESOURCE_L3DATA].enabled = false; - rdt_resources_all[RDT_RESOURCE_L3CODE].enabled = false; + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) { + rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled = false; + rdt_resources_all[RDT_RESOURCE_L3CODE].alloc_enabled = false; set_l3_qos_cfg(r, false); } } @@ -823,10 +1068,16 @@ } } +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + static struct dentry *rdt_mount(struct file_system_type *fs_type, int flags, const char *unused_dev_name, void *data) { + struct rdt_domain *dom; + struct rdt_resource *r; struct dentry *dentry; int ret; @@ -853,15 +1104,54 @@ goto out_cdp; } + if (rdt_mon_capable) { + ret = mongroup_create_dir(rdtgroup_default.kn, + NULL, "mon_groups", + &kn_mongrp); + if (ret) { + dentry = ERR_PTR(ret); + goto out_info; + } + kernfs_get(kn_mongrp); + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret) { + dentry = ERR_PTR(ret); + goto out_mongrp; + } + kernfs_get(kn_mondata); + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_destroy; + goto out_mondata; + + if (rdt_alloc_capable) + static_branch_enable(&rdt_alloc_enable_key); + if (rdt_mon_capable) + static_branch_enable(&rdt_mon_enable_key); + + if (rdt_alloc_capable || rdt_mon_capable) + static_branch_enable(&rdt_enable_key); + + if (is_mbm_enabled()) { + r = &rdt_resources_all[RDT_RESOURCE_L3]; + list_for_each_entry(dom, &r->domains, list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL); + } - static_branch_enable(&rdt_enable_key); goto out; -out_destroy: +out_mondata: + if (rdt_mon_capable) + kernfs_remove(kn_mondata); +out_mongrp: + if (rdt_mon_capable) + kernfs_remove(kn_mongrp); +out_info: kernfs_remove(kn_info); out_cdp: cdp_disable(); @@ -909,6 +1199,18 @@ return 0; } +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_alloc_capable && + (r->type == RDTCTRL_GROUP) && (t->closid == r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (rdt_mon_capable && + (r->type == RDTMON_GROUP) && (t->rmid == r->mon.rmid)); +} + /* * Move tasks from one to the other group. If @from is NULL, then all tasks * in the systems are moved unconditionally (used for teardown). 
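The two helpers just above encode the central ownership rule: control groups own tasks by CLOSID, monitor groups own them by RMID. A self-contained sketch of that rule with simplified stand-in types (the real task_struct and rdtgroup carry far more state, and the kernel additionally gates each test on rdt_alloc_capable/rdt_mon_capable):

#include <stdbool.h>
#include <stdio.h>

enum grp_type { CTRL_GROUP, MON_GROUP };

struct toy_task  { unsigned int closid, rmid; };
struct toy_group { enum grp_type type; unsigned int closid, rmid; };

/* mirrors is_closid_match()/is_rmid_match(), minus the capability checks */
static bool task_in_group(const struct toy_task *t, const struct toy_group *g)
{
        if (g->type == CTRL_GROUP)
                return t->closid == g->closid;
        return t->rmid == g->rmid;
}

int main(void)
{
        struct toy_task t = { .closid = 1, .rmid = 7 };
        struct toy_group ctrl = { CTRL_GROUP, 1, 5 };
        struct toy_group mon  = { MON_GROUP,  2, 7 };

        printf("in ctrl group: %d, in mon group: %d\n",
               task_in_group(&t, &ctrl), task_in_group(&t, &mon));
        return 0;
}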
@@ -924,8 +1226,11 @@ read_lock(&tasklist_lock); for_each_process_thread(p, t) { - if (!from || t->closid == from->closid) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { t->closid = to->closid; + t->rmid = to->mon.rmid; + #ifdef CONFIG_SMP /* * This is safe on x86 w/o barriers as the ordering @@ -944,6 +1249,19 @@ read_unlock(&tasklist_lock); } +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + kfree(sentry); + } +} + /* * Forcibly remove all of subdirectories under root. */ @@ -955,6 +1273,9 @@ rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + /* Remove each rdtgroup other than root */ if (rdtgrp == &rdtgroup_default) continue; @@ -967,16 +1288,20 @@ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + free_rmid(rdtgrp->mon.rmid); + kernfs_remove(rdtgrp->kn); list_del(&rdtgrp->rdtgroup_list); kfree(rdtgrp); } /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ get_online_cpus(); - rdt_update_closid(cpu_online_mask, &rdtgroup_default.closid); + update_closid_rmid(cpu_online_mask, &rdtgroup_default); put_online_cpus(); kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); } static void rdt_kill_sb(struct super_block *sb) @@ -986,10 +1311,12 @@ mutex_lock(&rdtgroup_mutex); /*Put everything back to default values. */ - for_each_enabled_rdt_resource(r) + for_each_alloc_enabled_rdt_resource(r) reset_all_ctrls(r); cdp_disable(); rmdir_all_sub(); + static_branch_disable(&rdt_alloc_enable_key); + static_branch_disable(&rdt_mon_enable_key); static_branch_disable(&rdt_enable_key); kernfs_kill_sb(sb); mutex_unlock(&rdtgroup_mutex); @@ -1001,46 +1328,223 @@ .kill_sb = rdt_kill_sb, }; -static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, - umode_t mode) +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) { - struct rdtgroup *parent, *rdtgrp; struct kernfs_node *kn; - int ret, closid; + int ret = 0; - /* Only allow mkdir in the root directory */ - if (parent_kn != rdtgroup_default.kn) - return -EPERM; + kn = __kernfs_create_file(parent_kn, name, 0444, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); - /* Do not accept '\n' to avoid unparsable situation. */ - if (strchr(name, '\n')) - return -EINVAL; + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } - parent = rdtgroup_kn_lock_live(parent_kn); - if (!parent) { - ret = -ENODEV; - goto out_unlock; + return ret; +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups with given domain id. 
+ */ +void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, unsigned int dom_id) +{ + struct rdtgroup *prgrp, *crgrp; + char name[32]; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + sprintf(name, "mon_%s_%02d", r->name, dom_id); + kernfs_remove_by_name(prgrp->mon.mon_data_kn, name); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + kernfs_remove_by_name(crgrp->mon.mon_data_kn, name); } +} - ret = closid_alloc(); - if (ret < 0) +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + union mon_data_bits priv; + struct kernfs_node *kn; + struct mon_evt *mevt; + struct rmid_read rr; + char name[32]; + int ret; + + sprintf(name, "mon_%s_%02d", r->name, d->id); + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + /* + * This extra ref will be put in kernfs_remove() and guarantees + * that kn is always accessible. + */ + kernfs_get(kn); + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + if (WARN_ON(list_empty(&r->evt_list))) { + ret = -EPERM; + goto out_destroy; + } + + priv.u.rid = r->rid; + priv.u.domid = d->id; + list_for_each_entry(mevt, &r->evt_list, list) { + priv.u.evtid = mevt->evtid; + ret = mon_addfile(kn, mevt->name, priv.priv); + if (ret) + goto out_destroy; + + if (is_mbm_event(mevt->evtid)) + mon_event_read(&rr, d, prgrp, mevt->evtid, true); + } + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. + */ +void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + if (!r->mon_enabled) + return; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_domain *dom; + int ret; + + list_for_each_entry(dom, &r->domains, list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain, named + * in the format mon_<domain_name>_<domain_id>. For example, a mon_data + * with an L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, NULL, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain.
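The event files created here are plain text, one value per file. Assuming the filesystem is mounted at the conventional /sys/fs/resctrl (the mount point is the administrator's choice) and an L3 resource with domain 0, a minimal reader looks like this:

#include <stdio.h>

int main(void)
{
        /* path assumes a /sys/fs/resctrl mount and an L3 domain 0 */
        const char *path = "/sys/fs/resctrl/mon_data/mon_L3_00/llc_occupancy";
        unsigned long long occupancy;
        FILE *f = fopen(path, "r");

        if (!f) {
                perror(path);
                return 1;
        }
        if (fscanf(f, "%llu", &occupancy) != 1) {
                fprintf(stderr, "unexpected contents in %s\n", path);
                fclose(f);
                return 1;
        }
        fclose(f);
        printf("L3 occupancy, domain 0: %llu bytes\n", occupancy);
        return 0;
}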
Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_enabled_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + struct kernfs_node *kn; + uint files = 0; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(prgrp_kn); + if (!prdtgrp) { + ret = -ENODEV; goto out_unlock; - closid = ret; + } /* allocate the rdtgroup. */ rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); if (!rdtgrp) { ret = -ENOSPC; - goto out_closid_free; + goto out_unlock; } - rdtgrp->closid = closid; - list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); /* kernfs creates the directory for rdtgrp */ - kn = kernfs_create_dir(parent->kn, name, mode, rdtgrp); + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); if (IS_ERR(kn)) { ret = PTR_ERR(kn); - goto out_cancel_ref; + goto out_free_rgrp; } rdtgrp->kn = kn; @@ -1056,43 +1560,211 @@ if (ret) goto out_destroy; - ret = rdtgroup_add_files(kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + files = RFTYPE_BASE | BIT(RF_CTRLSHIFT + rtype); + ret = rdtgroup_add_files(kn, files); if (ret) goto out_destroy; + if (rdt_mon_capable) { + ret = alloc_rmid(); + if (ret < 0) + goto out_destroy; + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) + goto out_idfree; + } kernfs_activate(kn); - ret = 0; - goto out_unlock; + /* + * The caller unlocks the prgrp_kn upon success. + */ + return 0; +out_idfree: + free_rmid(rdtgrp->mon.rmid); out_destroy: kernfs_remove(rdtgrp->kn); -out_cancel_ref: - list_del(&rdtgrp->rdtgroup_list); +out_free_rgrp: kfree(rdtgrp); -out_closid_free: - closid_free(closid); out_unlock: - rdtgroup_kn_unlock(parent_kn); + rdtgroup_kn_unlock(prgrp_kn); return ret; } -static int rdtgroup_rmdir(struct kernfs_node *kn) +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + free_rmid(rgrp->mon.rmid); + kfree(rgrp); +} + +/* + * Create a monitor group under the "mon_groups" directory of a control + * and monitor group (ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, + umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTMON_GROUP, + &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. They can be used + * to allocate and monitor resources.
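From user space, both creation paths described above are driven with nothing more than mkdir(2). A sketch, again assuming a /sys/fs/resctrl mount; the names g1 and m1 are arbitrary:

#include <errno.h>
#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
        /* routed to rdtgroup_mkdir_ctrl_mon(): parent is the resctrl root */
        if (mkdir("/sys/fs/resctrl/g1", 0755) && errno != EEXIST) {
                perror("mkdir g1");
                return 1;
        }

        /* routed to rdtgroup_mkdir_mon(): parent is a "mon_groups" directory */
        if (mkdir("/sys/fs/resctrl/g1/mon_groups/m1", 0755) && errno != EEXIST) {
                perror("mkdir m1");
                return 1;
        }

        puts("created ctrl_mon group g1 and monitor group m1");
        return 0;
}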
+ */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + struct kernfs_node *prgrp_kn, + const char *name, umode_t mode) { - int ret, cpu, closid = rdtgroup_default.closid; struct rdtgroup *rdtgrp; - cpumask_var_t tmpmask; + struct kernfs_node *kn; + u32 closid; + int ret; - if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) - return -ENOMEM; + ret = mkdir_rdt_prepare(parent_kn, prgrp_kn, name, mode, RDTCTRL_GROUP, + &rdtgrp); + if (ret) + return ret; - rdtgrp = rdtgroup_kn_lock_live(kn); - if (!rdtgrp) { - ret = -EPERM; - goto out; + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) + goto out_common_fail; + closid = ret; + + rdtgrp->closid = closid; + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (rdt_mon_capable) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, NULL, "mon_groups", NULL); + if (ret) + goto out_id_free; } + goto out_unlock; + +out_id_free: + closid_free(closid); + list_del(&rdtgrp->rdtgroup_list); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(prgrp_kn); + return ret; +} + +/* + * We allow creating mon groups only within a directory called "mon_groups", + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure the "mon_groups" directory always has a ctrl_mon group + * as its parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(kn->name, "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid an unparsable situation. */ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory. + */ + if (rdt_alloc_capable && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, parent_kn, name, mode); + + /* + * If RDT monitoring is supported and the parent directory is a valid + * "mon_groups" directory, add a monitoring subdirectory. + */ + if (rdt_mon_capable && is_mon_groups(parent_kn, name)) + return rdtgroup_mkdir_mon(parent_kn, parent_kn->parent, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* Update per cpu rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) + per_cpu(pqr_state.default_rmid, cpu) = prdtgrp->mon.rmid; + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them.
+ */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + /* + * one extra hold on this, will drop when we kfree(rdtgrp) + * in rdtgroup_kn_unlock() + */ + kernfs_get(kn); + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct kernfs_node *kn, struct rdtgroup *rdtgrp, + cpumask_var_t tmpmask) +{ + int cpu; + /* Give any tasks back to the default group */ rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); @@ -1100,18 +1772,28 @@ cpumask_or(&rdtgroup_default.cpu_mask, &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); - /* Update per cpu closid of the moved CPUs first */ - for_each_cpu(cpu, &rdtgrp->cpu_mask) - per_cpu(cpu_closid, cpu) = closid; + /* Update per cpu closid and rmid of the moved CPUs first */ + for_each_cpu(cpu, &rdtgrp->cpu_mask) { + per_cpu(pqr_state.default_closid, cpu) = rdtgroup_default.closid; + per_cpu(pqr_state.default_rmid, cpu) = rdtgroup_default.mon.rmid; + } + /* * Update the MSR on moved CPUs and CPUs which have moved * task running on them. */ cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); - rdt_update_closid(tmpmask, NULL); + update_closid_rmid(tmpmask, NULL); rdtgrp->flags = RDT_DELETED; closid_free(rdtgrp->closid); + free_rmid(rdtgrp->mon.rmid); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + list_del(&rdtgrp->rdtgroup_list); /* @@ -1120,7 +1802,41 @@ */ kernfs_get(kn); kernfs_remove(rdtgrp->kn); - ret = 0; + + return 0; +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn = kn->parent; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn) + ret = rdtgroup_rmdir_ctrl(kn, rdtgrp, tmpmask); + else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, kn->name)) + ret = rdtgroup_rmdir_mon(kn, rdtgrp, tmpmask); + else + ret = -EPERM; + out: rdtgroup_kn_unlock(kn); free_cpumask_var(tmpmask); @@ -1129,7 +1845,7 @@ static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) { - if (rdt_resources_all[RDT_RESOURCE_L3DATA].enabled) + if (rdt_resources_all[RDT_RESOURCE_L3DATA].alloc_enabled) seq_puts(seq, ",cdp"); return 0; } @@ -1153,10 +1869,13 @@ mutex_lock(&rdtgroup_mutex); rdtgroup_default.closid = 0; + rdtgroup_default.mon.rmid = 0; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); - ret = rdtgroup_add_files(rdt_root->kn, rdtgroup_base_files, - ARRAY_SIZE(rdtgroup_base_files)); + ret = rdtgroup_add_files(rdt_root->kn, RF_CTRL_BASE); if (ret) { kernfs_destroy_root(rdt_root); goto out; --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mcheck/dev-mcelog.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mcheck/dev-mcelog.c @@ -24,14 +24,6 @@ static char mce_helper[128]; static char *mce_helper_argv[2] = { mce_helper, NULL }; -#define mce_log_get_idx_check(p) \ -({ \ - RCU_LOCKDEP_WARN(!rcu_read_lock_sched_held() && \ - !lockdep_is_held(&mce_chrdev_read_mutex), \ - "suspicious mce_log_get_idx_check() usage"); \ - smp_load_acquire(&(p)); \ -}) - /* * Lockless MCE logging infrastructure. * This avoids deadlocks on printk locks without having to break locks. Also @@ -53,43 +45,32 @@ void *data) { struct mce *mce = (struct mce *)data; - unsigned int next, entry; + unsigned int entry; + + mutex_lock(&mce_chrdev_read_mutex); - wmb(); - for (;;) { - entry = mce_log_get_idx_check(mcelog.next); - for (;;) { - - /* - * When the buffer fills up discard new entries. - * Assume that the earlier errors are the more - * interesting ones: - */ - if (entry >= MCE_LOG_LEN) { - set_bit(MCE_OVERFLOW, - (unsigned long *)&mcelog.flags); - return NOTIFY_OK; - } - /* Old left over entry. Skip: */ - if (mcelog.entry[entry].finished) { - entry++; - continue; - } - break; - } - smp_rmb(); - next = entry + 1; - if (cmpxchg(&mcelog.next, entry, next) == entry) - break; + entry = mcelog.next; + + /* + * When the buffer fills up discard new entries. Assume that the + * earlier errors are the more interesting ones: + */ + if (entry >= MCE_LOG_LEN) { + set_bit(MCE_OVERFLOW, (unsigned long *)&mcelog.flags); + goto unlock; } + + mcelog.next = entry + 1; + memcpy(mcelog.entry + entry, mce, sizeof(struct mce)); - wmb(); mcelog.entry[entry].finished = 1; - wmb(); /* wake processes polling /dev/mcelog */ wake_up_interruptible(&mce_chrdev_wait); +unlock: + mutex_unlock(&mce_chrdev_read_mutex); + return NOTIFY_OK; } @@ -177,13 +158,6 @@ return 0; } -static void collect_tscs(void *data) -{ - unsigned long *cpu_tsc = (unsigned long *)data; - - cpu_tsc[smp_processor_id()] = rdtsc(); -} - static int mce_apei_read_done; /* Collect MCE record of previous boot in persistent storage via APEI ERST. 
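The rewritten mce_log() notifier earlier in this file drops the lockless cmpxchg loop in favor of the simplest possible discipline: one mutex, an append index, and an overflow flag once the fixed-size buffer fills, keeping the older (and presumably more interesting) records. A user-space model of that discipline, with pthreads standing in for the kernel mutex and made-up sizes:

#include <pthread.h>
#include <stdio.h>

#define LOG_LEN 32                      /* stand-in for MCE_LOG_LEN */

struct record { char text[64]; };

static struct record log_buf[LOG_LEN];
static unsigned int log_next;
static int log_overflowed;
static pthread_mutex_t log_lock = PTHREAD_MUTEX_INITIALIZER;

static void log_append(const char *text)
{
        pthread_mutex_lock(&log_lock);
        if (log_next >= LOG_LEN) {
                /* full: keep the old records, drop the new one */
                log_overflowed = 1;
        } else {
                snprintf(log_buf[log_next].text, sizeof(log_buf[log_next].text),
                         "%s", text);
                log_next++;
        }
        pthread_mutex_unlock(&log_lock);
}

int main(void)
{
        for (int i = 0; i < 40; i++)
                log_append("sample record");
        printf("kept %u records, overflow=%d\n", log_next, log_overflowed);
        return 0;
}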
*/ @@ -231,14 +205,9 @@ size_t usize, loff_t *off) { char __user *buf = ubuf; - unsigned long *cpu_tsc; - unsigned prev, next; + unsigned next; int i, err; - cpu_tsc = kmalloc(nr_cpu_ids * sizeof(long), GFP_KERNEL); - if (!cpu_tsc) - return -ENOMEM; - mutex_lock(&mce_chrdev_read_mutex); if (!mce_apei_read_done) { @@ -247,65 +216,29 @@ goto out; } - next = mce_log_get_idx_check(mcelog.next); - /* Only supports full reads right now */ err = -EINVAL; if (*off != 0 || usize < MCE_LOG_LEN*sizeof(struct mce)) goto out; + next = mcelog.next; err = 0; - prev = 0; - do { - for (i = prev; i < next; i++) { - unsigned long start = jiffies; - struct mce *m = &mcelog.entry[i]; - - while (!m->finished) { - if (time_after_eq(jiffies, start + 2)) { - memset(m, 0, sizeof(*m)); - goto timeout; - } - cpu_relax(); - } - smp_rmb(); - err |= copy_to_user(buf, m, sizeof(*m)); - buf += sizeof(*m); -timeout: - ; - } - - memset(mcelog.entry + prev, 0, - (next - prev) * sizeof(struct mce)); - prev = next; - next = cmpxchg(&mcelog.next, prev, 0); - } while (next != prev); - synchronize_sched(); - - /* - * Collect entries that were still getting written before the - * synchronize. - */ - on_each_cpu(collect_tscs, cpu_tsc, 1); - - for (i = next; i < MCE_LOG_LEN; i++) { + for (i = 0; i < next; i++) { struct mce *m = &mcelog.entry[i]; - if (m->finished && m->tsc < cpu_tsc[m->cpu]) { - err |= copy_to_user(buf, m, sizeof(*m)); - smp_rmb(); - buf += sizeof(*m); - memset(m, 0, sizeof(*m)); - } + err |= copy_to_user(buf, m, sizeof(*m)); + buf += sizeof(*m); } + memset(mcelog.entry, 0, next * sizeof(struct mce)); + mcelog.next = 0; + if (err) err = -EFAULT; out: mutex_unlock(&mce_chrdev_read_mutex); - kfree(cpu_tsc); return err ? err : buf - ubuf; } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mcheck/mce-severity.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mcheck/mce-severity.c @@ -245,6 +245,9 @@ if (m->status & MCI_STATUS_UC) { + if (ctx == IN_KERNEL) + return MCE_PANIC_SEVERITY; + /* * On older systems where overflow_recov flag is not present, we * should simply panic if an error overflow occurs. If @@ -255,10 +258,6 @@ if (mce_flags.smca) return mce_severity_amd_smca(m, ctx); - /* software can try to contain */ - if (!(m->mcgstatus & MCG_STATUS_RIPV) && (ctx == IN_KERNEL)) - return MCE_PANIC_SEVERITY; - /* kill current process */ return MCE_AR_SEVERITY; } else { --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mcheck/mce.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mcheck/mce.c @@ -51,6 +51,7 @@ #include #include #include +#include #include "mce-internal.h" @@ -1051,6 +1052,48 @@ return ret; } +#if defined(arch_unmap_kpfn) && defined(CONFIG_MEMORY_FAILURE) + +void arch_unmap_kpfn(unsigned long pfn) +{ + unsigned long decoy_addr; + + /* + * Unmap this page from the kernel 1:1 mappings to make sure + * we don't log more errors because of speculative access to + * the page. + * We would like to just call: + * set_memory_np((unsigned long)pfn_to_kaddr(pfn), 1); + * but doing that would radically increase the odds of a + * speculative access to the poison page because we'd have + * the virtual address of the kernel 1:1 mapping sitting + * around in registers. + * Instead we get tricky. We create a non-canonical address + * that looks just like the one we want, but has bit 63 flipped. + * This relies on set_memory_np() not checking whether we passed + * a legal address. + */ +/* + * Build time check to see if we have a spare virtual bit.
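The decoy-address construction above is easier to follow with concrete numbers. A standalone sketch assuming 4 KiB pages and the classic pre-KASLR x86-64 direct-map base; only the bit-63 flip is the point, the base value is illustrative:

#include <stdint.h>
#include <stdio.h>

int main(void)
{
        const uint64_t page_shift  = 12;                     /* 4 KiB pages */
        const uint64_t page_offset = 0xffff880000000000ULL;  /* illustrative */
        uint64_t pfn = 0x123456;

        /* the normal 1:1 (direct map) address of the poisoned page */
        uint64_t kaddr = (pfn << page_shift) + page_offset;

        /* same low bits, but bit 63 flipped: non-canonical on x86-64 */
        uint64_t decoy = (pfn << page_shift) + (page_offset ^ (1ULL << 63));

        printf("direct map: 0x%016llx\n", (unsigned long long)kaddr);
        printf("decoy:      0x%016llx\n", (unsigned long long)decoy);
        return 0;
}

Flipping bit 63 produces an address with the same low bits that no canonical kernel pointer can ever equal, so (per the comment above) set_memory_np() still finds the right page-table entry while the real 1:1 address never has to sit in a register.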
Don't want + * to leave this until run time because most developers don't have a + * system that can exercise this code path. This will only become a + * problem if/when we move beyond 5-level page tables. + * + * Hard code "9" here because cpp doesn't grok ilog2(PTRS_PER_PGD) + */ +#if PGDIR_SHIFT + 9 < 63 + decoy_addr = (pfn << PAGE_SHIFT) + (PAGE_OFFSET ^ BIT(63)); +#else +#error "no unused virtual bit available" +#endif + + if (set_memory_np(decoy_addr, 1)) + pr_warn("Could not invalidate pfn=0x%lx from 1:1 map\n", pfn); + +} +#endif + /* * The actual machine check handler. This only handles real * exceptions when something got corrupted coming in through int 18. --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mcheck/mce_amd.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mcheck/mce_amd.c @@ -26,7 +26,6 @@ #include #include #include -#include #define NR_BLOCKS 5 #define THRESHOLD_MAX 0xFFF @@ -201,8 +200,8 @@ wrmsr(smca_config, low, high); } - /* Collect bank_info using CPU 0 for now. */ - if (cpu) + /* Return early if this bank was already initialized. */ + if (smca_banks[bank].hwid) return; if (rdmsr_safe_on_cpu(cpu, MSR_AMD64_SMCA_MCx_IPID(bank), &low, &high)) { @@ -216,11 +215,6 @@ for (i = 0; i < ARRAY_SIZE(smca_hwid_mcatypes); i++) { s_hwid = &smca_hwid_mcatypes[i]; if (hwid_mcatype == s_hwid->hwid_mcatype) { - - WARN(smca_banks[bank].hwid, - "Bank %s already initialized!\n", - smca_get_name(s_hwid->bank_type)); - smca_banks[bank].hwid = s_hwid; smca_banks[bank].id = low; smca_banks[bank].sysfs_id = s_hwid->count++; @@ -792,9 +786,7 @@ asmlinkage __visible void __irq_entry smp_trace_deferred_error_interrupt(void) { entering_irq(); - trace_deferred_error_apic_entry(DEFERRED_ERROR_VECTOR); __smp_deferred_error_interrupt(); - trace_deferred_error_apic_exit(DEFERRED_ERROR_VECTOR); exiting_ack_irq(); } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mcheck/therm_throt.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mcheck/therm_throt.c @@ -28,7 +28,6 @@ #include #include #include -#include /* How long to wait between reporting thermal events */ #define CHECK_INTERVAL (300 * HZ) @@ -408,9 +407,7 @@ smp_trace_thermal_interrupt(struct pt_regs *regs) { entering_irq(); - trace_thermal_apic_entry(THERMAL_APIC_VECTOR); __smp_thermal_interrupt(); - trace_thermal_apic_exit(THERMAL_APIC_VECTOR); exiting_ack_irq(); } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mcheck/threshold.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mcheck/threshold.c @@ -7,7 +7,6 @@ #include #include #include -#include static void default_threshold_interrupt(void) { @@ -33,8 +32,6 @@ asmlinkage __visible void __irq_entry smp_trace_threshold_interrupt(void) { entering_irq(); - trace_threshold_apic_entry(THRESHOLD_APIC_VECTOR); __smp_threshold_interrupt(); - trace_threshold_apic_exit(THRESHOLD_APIC_VECTOR); exiting_ack_irq(); } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/microcode/amd.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/microcode/amd.c @@ -467,6 +467,7 @@ #define F14H_MPB_MAX_SIZE 1824 #define F15H_MPB_MAX_SIZE 4096 #define F16H_MPB_MAX_SIZE 3458 +#define F17H_MPB_MAX_SIZE 3200 switch (family) { case 0x14: @@ -478,6 +479,9 @@ case 0x16: max_size = F16H_MPB_MAX_SIZE; break; + case 0x17: + max_size = F17H_MPB_MAX_SIZE; + break; default: max_size = F1XH_MPB_MAX_SIZE; break; --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/microcode/core.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/microcode/core.c @@ -122,9 +122,6 @@ bool *res = &dis_ucode_ldr; #endif - if (!have_cpuid_p()) - return *res; - /* * CPUID(1).ECX[31]: 
reserved for hypervisor use. This is still not * completely accurate as xen pv guests don't see that CPUID bit set but @@ -166,24 +163,36 @@ void __init load_ucode_bsp(void) { unsigned int cpuid_1_eax; + bool intel = true; - if (check_loader_disabled_bsp()) + if (!have_cpuid_p()) return; cpuid_1_eax = native_cpuid_eax(1); switch (x86_cpuid_vendor()) { case X86_VENDOR_INTEL: - if (x86_family(cpuid_1_eax) >= 6) - load_ucode_intel_bsp(); + if (x86_family(cpuid_1_eax) < 6) + return; break; + case X86_VENDOR_AMD: - if (x86_family(cpuid_1_eax) >= 0x10) - load_ucode_amd_bsp(cpuid_1_eax); + if (x86_family(cpuid_1_eax) < 0x10) + return; + intel = false; break; + default: - break; + return; } + + if (check_loader_disabled_bsp()) + return; + + if (intel) + load_ucode_intel_bsp(); + else + load_ucode_amd_bsp(cpuid_1_eax); } static bool check_loader_disabled_ap(void) @@ -526,6 +535,26 @@ } if (!ret) perf_check_microcode(); + + if (boot_cpu_has(X86_FEATURE_SPEC_CTRL)) { + printk_once(KERN_INFO "FEATURE SPEC_CTRL Present\n"); + mutex_lock(&spec_ctrl_mutex); + set_ibrs_supported(); + set_ibpb_supported(); + if (ibrs_inuse) + sysctl_ibrs_enabled = 1; + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; + mutex_unlock(&spec_ctrl_mutex); + } else if (boot_cpu_has(X86_FEATURE_IBPB)) { + printk_once(KERN_INFO "FEATURE IBPB Present\n"); + mutex_lock(&spec_ctrl_mutex); + set_ibpb_supported(); + if (ibpb_inuse) + sysctl_ibpb_enabled = 1; + mutex_unlock(&spec_ctrl_mutex); + } + mutex_unlock(&microcode_mutex); put_online_cpus(); --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/microcode/intel.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/microcode/intel.c @@ -34,6 +34,7 @@ #include #include +#include #include #include #include @@ -563,15 +564,6 @@ } #else -/* - * Flush global tlb. We only do this in x86_64 where paging has been enabled - * already and PGE should be enabled as well. - */ -static inline void flush_tlb_early(void) -{ - __native_flush_tlb_global_irq_disabled(); -} - static inline void print_ucode(struct ucode_cpu_info *uci) { struct microcode_intel *mc; @@ -600,10 +592,6 @@ if (rev != mc->hdr.rev) return -1; -#ifdef CONFIG_X86_64 - /* Flush global tlb. This is precaution.
*/ - flush_tlb_early(); -#endif uci->cpu_sig.rev = rev; if (early) @@ -917,6 +905,18 @@ return 0; } +static bool is_blacklisted(unsigned int cpu) +{ + struct cpuinfo_x86 *c = &cpu_data(cpu); + + if (c->x86 == 6 && c->x86_model == INTEL_FAM6_BROADWELL_X) { + pr_err_once("late loading on model 79 is disabled.\n"); + return true; + } + + return false; +} + static enum ucode_state request_microcode_fw(int cpu, struct device *device, bool refresh_fw) { @@ -925,6 +925,9 @@ const struct firmware *firmware; enum ucode_state ret; + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + sprintf(name, "intel-ucode/%02x-%02x-%02x", c->x86, c->x86_model, c->x86_mask); @@ -949,6 +952,9 @@ static enum ucode_state request_microcode_user(int cpu, const void __user *buf, size_t size) { + if (is_blacklisted(cpu)) + return UCODE_NFOUND; + return generic_load_microcode(cpu, (void *)buf, size, &get_ucode_user); } --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/mshyperv.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/mshyperv.c @@ -184,9 +184,15 @@ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. */ @@ -219,7 +225,7 @@ rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -249,12 +255,13 @@ * Setup the hook to get control post apic initialization. */ x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); #endif } -const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", +const __initconst struct hypervisor_x86 x86_hyper_ms_hyperv = { + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, - .init_platform = ms_hyperv_init_platform, + .type = X86_HYPER_MS_HYPERV, + .init.init_platform = ms_hyperv_init_platform, }; -EXPORT_SYMBOL(x86_hyper_ms_hyperv); --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/proc.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/proc.c @@ -2,6 +2,9 @@ #include #include #include +#include + +#include "cpu.h" /* * Get CPU information for use by the procfs. 
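The /proc/cpuinfo hunk that follows replaces the constant cpu_khz in the "cpu MHz" field with a three-step preference: a measured APERF/MPERF frequency, then the cpufreq driver's estimate, then the boot-time calibration. The selection is a plain first-non-zero chain; schematically, with stand-in functions for the three sources:

#include <stdio.h>

/* stand-ins for aperfmperf_get_khz(), cpufreq_quick_get() and cpu_khz */
static unsigned int aperfmperf_khz(void) { return 0; }       /* unsupported */
static unsigned int cpufreq_khz(void)    { return 2900000; }
static unsigned int boot_cpu_khz(void)   { return 3000000; }

int main(void)
{
        unsigned int freq = aperfmperf_khz();   /* preferred: measured */

        if (!freq)
                freq = cpufreq_khz();           /* fallback: cpufreq estimate */
        if (!freq)
                freq = boot_cpu_khz();          /* last resort: boot constant */

        printf("cpu MHz\t\t: %u.%03u\n", freq / 1000, freq % 1000);
        return 0;
}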
@@ -75,9 +78,16 @@ if (c->microcode) seq_printf(m, "microcode\t: 0x%x\n", c->microcode); - if (cpu_has(c, X86_FEATURE_TSC)) + if (cpu_has(c, X86_FEATURE_TSC)) { + unsigned int freq = aperfmperf_get_khz(cpu); + + if (!freq) + freq = cpufreq_quick_get(cpu); + if (!freq) + freq = cpu_khz; seq_printf(m, "cpu MHz\t\t: %u.%03u\n", - cpu_khz / 1000, (cpu_khz % 1000)); + freq / 1000, (freq % 1000)); + } /* Cache size */ if (c->x86_cache_size >= 0) --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/scattered.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/scattered.c @@ -24,6 +24,7 @@ { X86_FEATURE_INTEL_PT, CPUID_EBX, 25, 0x00000007, 0 }, { X86_FEATURE_AVX512_4VNNIW, CPUID_EDX, 2, 0x00000007, 0 }, { X86_FEATURE_AVX512_4FMAPS, CPUID_EDX, 3, 0x00000007, 0 }, + { X86_FEATURE_SPEC_CTRL, CPUID_EDX, 26, 0x00000007, 0 }, { X86_FEATURE_CAT_L3, CPUID_EBX, 1, 0x00000010, 0 }, { X86_FEATURE_CAT_L2, CPUID_EBX, 2, 0x00000010, 0 }, { X86_FEATURE_CDP_L3, CPUID_ECX, 2, 0x00000010, 1 }, --- linux-azure-4.13.0.orig/arch/x86/kernel/cpu/vmware.c +++ linux-azure-4.13.0/arch/x86/kernel/cpu/vmware.c @@ -205,10 +205,10 @@ (eax & (1 << VMWARE_PORT_CMD_LEGACY_X2APIC)) != 0; } -const __refconst struct hypervisor_x86 x86_hyper_vmware = { +const __initconst struct hypervisor_x86 x86_hyper_vmware = { .name = "VMware", .detect = vmware_platform, - .init_platform = vmware_platform_setup, - .x2apic_available = vmware_legacy_x2apic_available, + .type = X86_HYPER_VMWARE, + .init.init_platform = vmware_platform_setup, + .init.x2apic_available = vmware_legacy_x2apic_available, }; -EXPORT_SYMBOL(x86_hyper_vmware); --- linux-azure-4.13.0.orig/arch/x86/kernel/doublefault.c +++ linux-azure-4.13.0/arch/x86/kernel/doublefault.c @@ -49,25 +49,23 @@ cpu_relax(); } -struct tss_struct doublefault_tss __cacheline_aligned = { - .x86_tss = { - .sp0 = STACK_START, - .ss0 = __KERNEL_DS, - .ldt = 0, - .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, +struct x86_hw_tss doublefault_tss __cacheline_aligned = { + .sp0 = STACK_START, + .ss0 = __KERNEL_DS, + .ldt = 0, + .io_bitmap_base = INVALID_IO_BITMAP_OFFSET, - .ip = (unsigned long) doublefault_fn, - /* 0x2 bit is always set */ - .flags = X86_EFLAGS_SF | 0x2, - .sp = STACK_START, - .es = __USER_DS, - .cs = __KERNEL_CS, - .ss = __KERNEL_DS, - .ds = __USER_DS, - .fs = __KERNEL_PERCPU, + .ip = (unsigned long) doublefault_fn, + /* 0x2 bit is always set */ + .flags = X86_EFLAGS_SF | 0x2, + .sp = STACK_START, + .es = __USER_DS, + .cs = __KERNEL_CS, + .ss = __KERNEL_DS, + .ds = __USER_DS, + .fs = __KERNEL_PERCPU, - .__cr3 = __pa_nodebug(swapper_pg_dir), - } + .__cr3 = __pa_nodebug(swapper_pg_dir), }; /* dummy for do_double_fault() call */ --- linux-azure-4.13.0.orig/arch/x86/kernel/dumpstack.c +++ linux-azure-4.13.0/arch/x86/kernel/dumpstack.c @@ -18,6 +18,7 @@ #include #include +#include #include #include @@ -43,6 +44,24 @@ return true; } +bool in_entry_stack(unsigned long *stack, struct stack_info *info) +{ + struct entry_stack *ss = cpu_entry_stack(smp_processor_id()); + + void *begin = ss; + void *end = ss + 1; + + if ((void *)stack < begin || (void *)stack >= end) + return false; + + info->type = STACK_TYPE_ENTRY; + info->begin = begin; + info->end = end; + info->next_sp = NULL; + + return true; +} + static void printk_stack_address(unsigned long address, int reliable, char *log_lvl) { @@ -50,6 +69,39 @@ printk("%s %s%pB\n", log_lvl, reliable ? "" : "? 
", (void *)address); } +void show_iret_regs(struct pt_regs *regs) +{ + printk(KERN_DEFAULT "RIP: %04x:%pS\n", (int)regs->cs, (void *)regs->ip); + printk(KERN_DEFAULT "RSP: %04x:%016lx EFLAGS: %08lx", (int)regs->ss, + regs->sp, regs->flags); +} + +static void show_regs_if_on_stack(struct stack_info *info, struct pt_regs *regs, + bool partial) +{ + /* + * These on_stack() checks aren't strictly necessary: the unwind code + * has already validated the 'regs' pointer. The checks are done for + * ordering reasons: if the registers are on the next stack, we don't + * want to print them out yet. Otherwise they'll be shown as part of + * the wrong stack. Later, when show_trace_log_lvl() switches to the + * next stack, this function will be called again with the same regs so + * they can be printed in the right context. + */ + if (!partial && on_stack(info, regs, sizeof(*regs))) { + __show_regs(regs, 0); + + } else if (partial && on_stack(info, (void *)regs + IRET_FRAME_OFFSET, + IRET_FRAME_SIZE)) { + /* + * When an interrupt or exception occurs in entry code, the + * full pt_regs might not have been saved yet. In that case + * just print the iret frame. + */ + show_iret_regs(regs); + } +} + void show_trace_log_lvl(struct task_struct *task, struct pt_regs *regs, unsigned long *stack, char *log_lvl) { @@ -57,11 +109,13 @@ struct stack_info stack_info = {0}; unsigned long visit_mask = 0; int graph_idx = 0; + bool partial; printk("%sCall Trace:\n", log_lvl); unwind_start(&state, task, regs, stack); stack = stack ? : get_stack_pointer(task, regs); + regs = unwind_get_entry_regs(&state, &partial); /* * Iterate through the stacks, starting with the current stack pointer. @@ -71,29 +125,36 @@ * - task stack * - interrupt stack * - HW exception stacks (double fault, nmi, debug, mce) + * - entry stack * - * x86-32 can have up to three stacks: + * x86-32 can have up to four stacks: * - task stack * - softirq stack * - hardirq stack + * - entry stack */ - for (regs = NULL; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { + for ( ; stack; stack = PTR_ALIGN(stack_info.next_sp, sizeof(long))) { const char *stack_name; - /* - * If we overflowed the task stack into a guard page, jump back - * to the bottom of the usable stack. - */ - if (task_stack_page(task) - (void *)stack < PAGE_SIZE) - stack = task_stack_page(task); - - if (get_stack_info(stack, task, &stack_info, &visit_mask)) - break; + if (get_stack_info(stack, task, &stack_info, &visit_mask)) { + /* + * We weren't on a valid stack. It's possible that + * we overflowed a valid stack into a guard page. + * See if the next page up is valid so that we can + * generate some kind of backtrace if this happens. + */ + stack = (unsigned long *)PAGE_ALIGN((unsigned long)stack); + if (get_stack_info(stack, task, &stack_info, &visit_mask)) + break; + } stack_name = stack_type_name(stack_info.type); if (stack_name) printk("%s <%s>\n", log_lvl, stack_name); + if (regs) + show_regs_if_on_stack(&stack_info, regs, partial); + /* * Scan the stack, printing any text addresses we find. At the * same time, follow proper stack frames with the unwinder. @@ -116,7 +177,7 @@ /* * Don't print regs->ip again if it was already printed - * by __show_regs() below. + * by show_regs_if_on_stack(). 
*/ if (regs && stack == &regs->ip) { unwind_next_frame(&state); @@ -152,9 +213,9 @@ unwind_next_frame(&state); /* if the frame has entry regs, print them */ - regs = unwind_get_entry_regs(&state); + regs = unwind_get_entry_regs(&state, &partial); if (regs) - __show_regs(regs, 0); + show_regs_if_on_stack(&stack_info, regs, partial); } if (stack_name) @@ -250,11 +311,13 @@ unsigned long sp; #endif printk(KERN_DEFAULT - "%s: %04lx [#%d]%s%s%s%s\n", str, err & 0xffff, ++die_counter, + "%s: %04lx [#%d]%s%s%s%s%s\n", str, err & 0xffff, ++die_counter, IS_ENABLED(CONFIG_PREEMPT) ? " PREEMPT" : "", IS_ENABLED(CONFIG_SMP) ? " SMP" : "", debug_pagealloc_enabled() ? " DEBUG_PAGEALLOC" : "", - IS_ENABLED(CONFIG_KASAN) ? " KASAN" : ""); + IS_ENABLED(CONFIG_KASAN) ? " KASAN" : "", + IS_ENABLED(CONFIG_PAGE_TABLE_ISOLATION) ? + (boot_cpu_has(X86_FEATURE_PTI) ? " PTI" : " NOPTI") : ""); if (notify_die(DIE_OOPS, str, regs, err, current->thread.trap_nr, SIGSEGV) == NOTIFY_STOP) --- linux-azure-4.13.0.orig/arch/x86/kernel/dumpstack_32.c +++ linux-azure-4.13.0/arch/x86/kernel/dumpstack_32.c @@ -25,6 +25,9 @@ if (type == STACK_TYPE_SOFTIRQ) return "SOFTIRQ"; + if (type == STACK_TYPE_ENTRY) + return "ENTRY_TRAMPOLINE"; + return NULL; } @@ -92,6 +95,9 @@ if (task != current) goto unknown; + if (in_entry_stack(stack, info)) + goto recursion_check; + if (in_hardirq_stack(stack, info)) goto recursion_check; --- linux-azure-4.13.0.orig/arch/x86/kernel/dumpstack_64.c +++ linux-azure-4.13.0/arch/x86/kernel/dumpstack_64.c @@ -36,6 +36,15 @@ if (type == STACK_TYPE_IRQ) return "IRQ"; + if (type == STACK_TYPE_ENTRY) { + /* + * On 64-bit, we have a generic entry stack that we + * use for all the kernel entry points, including + * SYSENTER. + */ + return "ENTRY_TRAMPOLINE"; + } + if (type >= STACK_TYPE_EXCEPTION && type <= STACK_TYPE_EXCEPTION_LAST) return exception_stack_names[type - STACK_TYPE_EXCEPTION]; @@ -114,6 +123,9 @@ if (in_irq_stack(stack, info)) goto recursion_check; + if (in_entry_stack(stack, info)) + goto recursion_check; + goto unknown; recursion_check: --- linux-azure-4.13.0.orig/arch/x86/kernel/fpu/init.c +++ linux-azure-4.13.0/arch/x86/kernel/fpu/init.c @@ -249,6 +249,10 @@ */ static void __init fpu__init_parse_early_param(void) { + char arg[32]; + char *argptr = arg; + int bit; + if (cmdline_find_option_bool(boot_command_line, "no387")) setup_clear_cpu_cap(X86_FEATURE_FPU); @@ -266,6 +270,13 @@ if (cmdline_find_option_bool(boot_command_line, "noxsaves")) setup_clear_cpu_cap(X86_FEATURE_XSAVES); + + if (cmdline_find_option(boot_command_line, "clearcpuid", arg, + sizeof(arg)) && + get_option(&argptr, &bit) && + bit >= 0 && + bit < NCAPINTS * 32) + setup_clear_cpu_cap(bit); } /* --- linux-azure-4.13.0.orig/arch/x86/kernel/fpu/regset.c +++ linux-azure-4.13.0/arch/x86/kernel/fpu/regset.c @@ -131,11 +131,16 @@ fpu__activate_fpstate_write(fpu); - if (boot_cpu_has(X86_FEATURE_XSAVES)) + if (boot_cpu_has(X86_FEATURE_XSAVES)) { ret = copyin_to_xsaves(kbuf, ubuf, xsave); - else + } else { ret = user_regset_copyin(&pos, &count, &kbuf, &ubuf, xsave, 0, -1); + /* xcomp_bv must be 0 when using uncompacted format */ + if (!ret && xsave->header.xcomp_bv) + ret = -EINVAL; + } + /* * In case of failure, mark all states as init: */ --- linux-azure-4.13.0.orig/arch/x86/kernel/fpu/signal.c +++ linux-azure-4.13.0/arch/x86/kernel/fpu/signal.c @@ -329,6 +329,10 @@ } else { err = __copy_from_user(&fpu->state.xsave, buf_fx, state_size); + + /* xcomp_bv must be 0 when using uncompacted format */ + if (!err && state_size > 
offsetof(struct xregs_state, header) && fpu->state.xsave.header.xcomp_bv) + err = -EINVAL; } if (err || __copy_from_user(&env, buf, sizeof(env))) { --- linux-azure-4.13.0.orig/arch/x86/kernel/fpu/xstate.c +++ linux-azure-4.13.0/arch/x86/kernel/fpu/xstate.c @@ -15,6 +15,7 @@ #include #include +#include /* * Although we spell it out in here, the Processor Trace @@ -36,6 +37,19 @@ "unknown xstate feature" , }; +static short xsave_cpuid_features[] __initdata = { + X86_FEATURE_FPU, + X86_FEATURE_XMM, + X86_FEATURE_AVX, + X86_FEATURE_MPX, + X86_FEATURE_MPX, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_AVX512F, + X86_FEATURE_INTEL_PT, + X86_FEATURE_PKU, +}; + /* * Mask of xstate features supported by the CPU and the kernel: */ @@ -59,26 +73,6 @@ void fpu__xstate_clear_all_cpu_caps(void) { setup_clear_cpu_cap(X86_FEATURE_XSAVE); - setup_clear_cpu_cap(X86_FEATURE_XSAVEOPT); - setup_clear_cpu_cap(X86_FEATURE_XSAVEC); - setup_clear_cpu_cap(X86_FEATURE_XSAVES); - setup_clear_cpu_cap(X86_FEATURE_AVX); - setup_clear_cpu_cap(X86_FEATURE_AVX2); - setup_clear_cpu_cap(X86_FEATURE_AVX512F); - setup_clear_cpu_cap(X86_FEATURE_AVX512IFMA); - setup_clear_cpu_cap(X86_FEATURE_AVX512PF); - setup_clear_cpu_cap(X86_FEATURE_AVX512ER); - setup_clear_cpu_cap(X86_FEATURE_AVX512CD); - setup_clear_cpu_cap(X86_FEATURE_AVX512DQ); - setup_clear_cpu_cap(X86_FEATURE_AVX512BW); - setup_clear_cpu_cap(X86_FEATURE_AVX512VL); - setup_clear_cpu_cap(X86_FEATURE_MPX); - setup_clear_cpu_cap(X86_FEATURE_XGETBV1); - setup_clear_cpu_cap(X86_FEATURE_AVX512VBMI); - setup_clear_cpu_cap(X86_FEATURE_PKU); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4VNNIW); - setup_clear_cpu_cap(X86_FEATURE_AVX512_4FMAPS); - setup_clear_cpu_cap(X86_FEATURE_AVX512_VPOPCNTDQ); } /* @@ -702,6 +696,7 @@ unsigned int eax, ebx, ecx, edx; static int on_boot_cpu __initdata = 1; int err; + int i; WARN_ON_FPU(!on_boot_cpu); on_boot_cpu = 0; @@ -735,6 +730,14 @@ goto out_disable; } + /* + * Clear XSAVE features that are disabled in the normal CPUID. + */ + for (i = 0; i < ARRAY_SIZE(xsave_cpuid_features); i++) { + if (!boot_cpu_has(xsave_cpuid_features[i])) + xfeatures_mask &= ~BIT(i); + } + xfeatures_mask &= fpu__get_supported_xfeatures_mask(); /* Enable xstate instructions to be able to continue with initialization: */ --- linux-azure-4.13.0.orig/arch/x86/kernel/ftrace_32.S +++ linux-azure-4.13.0/arch/x86/kernel/ftrace_32.S @@ -7,6 +7,7 @@ #include #include #include +#include #ifdef CC_USING_FENTRY # define function_hook __fentry__ @@ -196,7 +197,8 @@ movl 0x4(%ebp), %edx subl $MCOUNT_INSN_SIZE, %eax - call *ftrace_trace_function + movl ftrace_trace_function, %ecx + CALL_NOSPEC %ecx popl %edx popl %ecx @@ -240,5 +242,5 @@ movl %eax, %ecx popl %edx popl %eax - jmp *%ecx + JMP_NOSPEC %ecx #endif --- linux-azure-4.13.0.orig/arch/x86/kernel/ftrace_64.S +++ linux-azure-4.13.0/arch/x86/kernel/ftrace_64.S @@ -6,7 +6,7 @@ #include #include #include - +#include .code64 .section .entry.text, "ax" @@ -285,8 +285,8 @@ * ip and parent ip are used and the list function is called when * function tracing is enabled. 
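CALL_NOSPEC and JMP_NOSPEC, used in the ftrace conversions here, hand-code for assembly what newer compilers can do for C: routing every indirect branch through a retpoline thunk. C callers need no source change, only a build flag; for example, with gcc where -mindirect-branch=thunk is supported:

/* build: gcc -O2 -mindirect-branch=thunk indirect.c */
#include <stdio.h>

static void hello(void) { puts("hello"); }

/* volatile keeps the compiler from devirtualizing the call */
static void (*volatile fn)(void) = hello;

int main(void)
{
        fn();   /* emitted as a call through an __x86_indirect_thunk */
        return 0;
}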
*/ - call *ftrace_trace_function - + movq ftrace_trace_function, %r8 + CALL_NOSPEC %r8 restore_mcount_regs jmp fgraph_trace @@ -328,5 +328,5 @@ movq 8(%rsp), %rdx movq (%rsp), %rax addq $24, %rsp - jmp *%rdi + JMP_NOSPEC %rdi #endif --- linux-azure-4.13.0.orig/arch/x86/kernel/head_32.S +++ linux-azure-4.13.0/arch/x86/kernel/head_32.S @@ -25,6 +25,7 @@ #include #include #include +#include /* Physical address */ #define pa(X) ((X) - __PAGE_OFFSET) @@ -152,6 +153,7 @@ movl pa(subarch_entries)(,%eax,4), %eax subl $__PAGE_OFFSET, %eax + ANNOTATE_RETPOLINE_SAFE jmp *%eax .Lbad_subarch: @@ -213,9 +215,6 @@ #endif .Ldefault_entry: -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $(CR0_STATE & ~X86_CR0_PG),%eax movl %eax,%cr0 @@ -306,6 +305,7 @@ movl setup_once_ref,%eax andl %eax,%eax jz 1f # Did we do this already? + ANNOTATE_RETPOLINE_SAFE call *%eax 1: @@ -435,7 +435,7 @@ # 24(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 pushl $0 # Dummy error code, to make stack frame uniform .endif pushl $i # 20(%esp) Vector number --- linux-azure-4.13.0.orig/arch/x86/kernel/head_64.S +++ linux-azure-4.13.0/arch/x86/kernel/head_64.S @@ -22,6 +22,7 @@ #include #include "../entry/calling.h" #include +#include #ifdef CONFIG_PARAVIRT #include @@ -37,11 +38,12 @@ * */ -#define p4d_index(x) (((x) >> P4D_SHIFT) & (PTRS_PER_P4D-1)) #define pud_index(x) (((x) >> PUD_SHIFT) & (PTRS_PER_PUD-1)) +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) PGD_PAGE_OFFSET = pgd_index(__PAGE_OFFSET_BASE) PGD_START_KERNEL = pgd_index(__START_KERNEL_map) +#endif L3_START_KERNEL = pud_index(__START_KERNEL_map) .text @@ -49,6 +51,7 @@ .code64 .globl startup_64 startup_64: + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded an identity mapped page table @@ -81,6 +84,7 @@ movq $(early_top_pgt - __START_KERNEL_map), %rax jmp 1f ENTRY(secondary_startup_64) + UNWIND_HINT_EMPTY /* * At this point the CPU runs in 64bit mode CS.L = 1 CS.D = 0, * and someone has loaded a mapped page table. 
@@ -114,8 +118,10 @@ /* Ensure I am executing from virtual addresses */ movq $1f, %rax + ANNOTATE_RETPOLINE_SAFE jmp *%rax 1: + UNWIND_HINT_EMPTY /* Check if nx is implemented */ movl $0x80000001, %eax @@ -133,9 +139,6 @@ 1: wrmsr /* Make changes effective */ /* Setup cr0 */ -#define CR0_STATE (X86_CR0_PE | X86_CR0_MP | X86_CR0_ET | \ - X86_CR0_NE | X86_CR0_WP | X86_CR0_AM | \ - X86_CR0_PG) movl $CR0_STATE, %eax /* Make changes effective */ movq %rax, %cr0 @@ -218,7 +221,7 @@ pushq %rax # target address in negative space lretq .Lafter_lret: -ENDPROC(secondary_startup_64) +END(secondary_startup_64) #include "verify_cpu.S" @@ -230,6 +233,7 @@ */ ENTRY(start_cpu0) movq initial_stack(%rip), %rsp + UNWIND_HINT_EMPTY jmp .Ljump_to_C_code ENDPROC(start_cpu0) #endif @@ -249,26 +253,24 @@ .quad init_thread_union + THREAD_SIZE - SIZEOF_PTREGS __FINITDATA -bad_address: - jmp bad_address - __INIT ENTRY(early_idt_handler_array) - # 104(%rsp) %rflags - # 96(%rsp) %cs - # 88(%rsp) %rip - # 80(%rsp) error code i = 0 .rept NUM_EXCEPTION_VECTORS - .ifeq (EXCEPTION_ERRCODE_MASK >> i) & 1 - pushq $0 # Dummy error code, to make stack frame uniform + .if ((EXCEPTION_ERRCODE_MASK >> i) & 1) == 0 + UNWIND_HINT_IRET_REGS + pushq $0 # Dummy error code, to make stack frame uniform + .else + UNWIND_HINT_IRET_REGS offset=8 .endif pushq $i # 72(%rsp) Vector number jmp early_idt_handler_common + UNWIND_HINT_IRET_REGS i = i + 1 .fill early_idt_handler_array + i*EARLY_IDT_HANDLER_SIZE - ., 1, 0xcc .endr -ENDPROC(early_idt_handler_array) + UNWIND_HINT_IRET_REGS offset=16 +END(early_idt_handler_array) early_idt_handler_common: /* @@ -296,6 +298,7 @@ pushq %r13 /* pt_regs->r13 */ pushq %r14 /* pt_regs->r14 */ pushq %r15 /* pt_regs->r15 */ + UNWIND_HINT_REGS cmpq $14,%rsi /* Page fault? */ jnz 10f @@ -310,8 +313,8 @@ 20: decl early_recursion_flag(%rip) - jmp restore_regs_and_iret -ENDPROC(early_idt_handler_common) + jmp restore_regs_and_return_to_kernel +END(early_idt_handler_common) __INITDATA @@ -323,6 +326,27 @@ .balign PAGE_SIZE; \ GLOBAL(name) +#ifdef CONFIG_PAGE_TABLE_ISOLATION +/* + * Each PGD needs to be 8k long and 8k aligned. We do not + * ever go out to userspace with these, so we do not + * strictly *need* the second page, but this allows us to + * have a single set_pgd() implementation that does not + * need to worry about whether it has 4k or 8k to work + * with. 
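With NEXT_PGD_PAGE() making each top-level table 8k long and 8k aligned, the kernel and user copies of a PGD slot sit exactly one page apart, so moving between them can be a single address-bit flip rather than a table walk. A stand-in sketch of that pairing (toy types; the kernel's helpers operate on real pgd_t pointers):

#include <stdint.h>
#include <stdio.h>

#define TOY_PAGE_SIZE 4096UL

typedef uint64_t toy_pgd;       /* stand-in for pgd_t */

/* hop between the kernel and user copies of the same slot */
static toy_pgd *other_pgd_copy(toy_pgd *p)
{
        return (toy_pgd *)((uintptr_t)p ^ TOY_PAGE_SIZE);
}

int main(void)
{
        /* an 8k-long, 8k-aligned pair, as NEXT_PGD_PAGE() lays out */
        static toy_pgd pair[2 * TOY_PAGE_SIZE / sizeof(toy_pgd)]
                __attribute__((aligned(2 * TOY_PAGE_SIZE)));

        toy_pgd *kernel_slot = &pair[5];
        toy_pgd *user_slot = other_pgd_copy(kernel_slot);

        printf("kernel slot %p <-> user slot %p\n",
               (void *)kernel_slot, (void *)user_slot);
        return 0;
}

The 8k alignment is what makes the XOR trick safe: it guarantees the pair never straddles an alignment boundary, so flipping the page bit always lands in the sibling table.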
+ * + * This ensures PGDs are 8k long: + */ +#define PTI_USER_PGD_FILL 512 +/* This ensures they are 8k-aligned: */ +#define NEXT_PGD_PAGE(name) \ + .balign 2 * PAGE_SIZE; \ +GLOBAL(name) +#else +#define NEXT_PGD_PAGE(name) NEXT_PAGE(name) +#define PTI_USER_PGD_FILL 0 +#endif + /* Automate the creation of 1 to 1 mapping pmd entries */ #define PMDS(START, PERM, COUNT) \ i = 0 ; \ @@ -332,30 +356,29 @@ .endr __INITDATA -NEXT_PAGE(early_top_pgt) +NEXT_PGD_PAGE(early_top_pgt) .fill 511,8,0 #ifdef CONFIG_X86_5LEVEL .quad level4_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE #else .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE #endif + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(early_dynamic_pgts) .fill 512*EARLY_DYNAMIC_PAGE_TABLES,8,0 .data -#ifndef CONFIG_XEN -NEXT_PAGE(init_top_pgt) - .fill 512,8,0 -#else -NEXT_PAGE(init_top_pgt) +#if defined(CONFIG_XEN_PV) || defined(CONFIG_XEN_PVH) +NEXT_PGD_PAGE(init_top_pgt) .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_top_pgt + PGD_PAGE_OFFSET*8, 0 .quad level3_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE .org init_top_pgt + PGD_START_KERNEL*8, 0 /* (2^48-(2*1024*1024*1024))/(2^39) = 511 */ .quad level3_kernel_pgt - __START_KERNEL_map + _PAGE_TABLE + .fill PTI_USER_PGD_FILL,8,0 NEXT_PAGE(level3_ident_pgt) .quad level2_ident_pgt - __START_KERNEL_map + _KERNPG_TABLE @@ -365,6 +388,10 @@ * Don't set NX because code runs from these pages. */ PMDS(0, __PAGE_KERNEL_IDENT_LARGE_EXEC, PTRS_PER_PMD) +#else +NEXT_PGD_PAGE(init_top_pgt) + .fill 512,8,0 + .fill PTI_USER_PGD_FILL,8,0 #endif #ifdef CONFIG_X86_5LEVEL @@ -418,7 +445,7 @@ EXPORT_SYMBOL(phys_base) #include "../../x86/xen/xen-head.S" - + __PAGE_ALIGNED_BSS NEXT_PAGE(empty_zero_page) .skip PAGE_SIZE --- linux-azure-4.13.0.orig/arch/x86/kernel/ioport.c +++ linux-azure-4.13.0/arch/x86/kernel/ioport.c @@ -30,7 +30,7 @@ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || kernel_is_locked_down())) return -EPERM; /* @@ -66,7 +66,7 @@ * because the ->io_bitmap_max value must match the bitmap * contents: */ - tss = &per_cpu(cpu_tss, get_cpu()); + tss = &per_cpu(cpu_tss_rw, get_cpu()); if (turn_on) bitmap_clear(t->io_bitmap_ptr, from, num); @@ -120,7 +120,7 @@ return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || kernel_is_locked_down()) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | --- linux-azure-4.13.0.orig/arch/x86/kernel/irq.c +++ linux-azure-4.13.0/arch/x86/kernel/irq.c @@ -19,7 +19,6 @@ #include #define CREATE_TRACE_POINTS -#include DEFINE_PER_CPU_SHARED_ALIGNED(irq_cpustat_t, irq_stat); EXPORT_PER_CPU_SYMBOL(irq_stat); @@ -222,18 +221,6 @@ /* high bit used in ret_from_ code */ unsigned vector = ~regs->orig_ax; - /* - * NB: Unlike exception entries, IRQ entries do not reliably - * handle context tracking in the low-level entry code. This is - * because syscall entries execute briefly with IRQs on before - * updating context tracking state, so we can take an IRQ from - * kernel mode with CONTEXT_USER. The low-level entry code only - * updates the context if we came from user mode, so we won't - * switch to CONTEXT_KERNEL. We'll fix that once the syscall - * code is cleaned up enough that we can cleanly defer enabling - * IRQs. - */ - entering_irq(); /* entering_irq() tells RCU that we're not quiescent. Check it. 
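The ioport.c change above means CAP_SYS_RAWIO is no longer sufficient for ioperm(2)/iopl(2) once the kernel is locked down; user space simply sees EPERM. For example, on x86 with glibc:

#include <stdio.h>
#include <sys/io.h>

int main(void)
{
        /* three ports at 0x378; needs CAP_SYS_RAWIO and no kernel lockdown */
        if (ioperm(0x378, 3, 1)) {
                perror("ioperm");       /* EPERM if unprivileged or locked down */
                return 1;
        }
        puts("port access granted");
        return 0;
}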
*/ @@ -339,9 +326,7 @@ struct pt_regs *old_regs = set_irq_regs(regs); entering_ack_irq(); - trace_x86_platform_ipi_entry(X86_PLATFORM_IPI_VECTOR); __smp_x86_platform_ipi(); - trace_x86_platform_ipi_exit(X86_PLATFORM_IPI_VECTOR); exiting_irq(); set_irq_regs(old_regs); } --- linux-azure-4.13.0.orig/arch/x86/kernel/irq_32.c +++ linux-azure-4.13.0/arch/x86/kernel/irq_32.c @@ -19,6 +19,7 @@ #include #include +#include #ifdef CONFIG_DEBUG_STACKOVERFLOW @@ -54,11 +55,11 @@ static void call_on_stack(void *func, void *stack) { asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=b" (stack) : "0" (stack), - "D"(func) + [thunk_target] "D"(func) : "memory", "cc", "edx", "ecx", "eax"); } @@ -94,11 +95,11 @@ call_on_stack(print_stack_overflow, isp); asm volatile("xchgl %%ebx,%%esp \n" - "call *%%edi \n" + CALL_NOSPEC "movl %%ebx,%%esp \n" : "=a" (arg1), "=b" (isp) : "0" (desc), "1" (isp), - "D" (desc->handle_irq) + [thunk_target] "D" (desc->handle_irq) : "memory", "cc", "ecx"); return 1; } --- linux-azure-4.13.0.orig/arch/x86/kernel/irq_64.c +++ linux-azure-4.13.0/arch/x86/kernel/irq_64.c @@ -56,10 +56,10 @@ if (regs->sp >= estack_top && regs->sp <= estack_bottom) return; - WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx)\n", + WARN_ONCE(1, "do_IRQ(): %s has overflown the kernel stack (cur:%Lx,sp:%lx,irq stk top-bottom:%Lx-%Lx,exception stk top-bottom:%Lx-%Lx,ip:%pF)\n", current->comm, curbase, regs->sp, irq_stack_top, irq_stack_bottom, - estack_top, estack_bottom); + estack_top, estack_bottom, (void *)regs->ip); if (sysctl_panic_on_stackoverflow) panic("low stack detected by irq handler - check messages\n"); --- linux-azure-4.13.0.orig/arch/x86/kernel/irq_work.c +++ linux-azure-4.13.0/arch/x86/kernel/irq_work.c @@ -8,7 +8,6 @@ #include #include #include -#include #include static inline void __smp_irq_work_interrupt(void) @@ -27,9 +26,7 @@ __visible void __irq_entry smp_trace_irq_work_interrupt(struct pt_regs *regs) { ipi_entering_ack_irq(); - trace_irq_work_entry(IRQ_WORK_VECTOR); __smp_irq_work_interrupt(); - trace_irq_work_exit(IRQ_WORK_VECTOR); exiting_irq(); } --- linux-azure-4.13.0.orig/arch/x86/kernel/kexec-bzimage64.c +++ linux-azure-4.13.0/arch/x86/kernel/kexec-bzimage64.c @@ -179,6 +179,7 @@ if (efi_enabled(EFI_OLD_MEMMAP)) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; --- linux-azure-4.13.0.orig/arch/x86/kernel/kprobes/common.h +++ linux-azure-4.13.0/arch/x86/kernel/kprobes/common.h @@ -3,6 +3,15 @@ /* Kprobes and Optprobes common header */ +#include + +#ifdef CONFIG_FRAME_POINTER +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" \ + " mov %" _ASM_SP ", %" _ASM_BP "\n" +#else +# define SAVE_RBP_STRING " push %" _ASM_BP "\n" +#endif + #ifdef CONFIG_X86_64 #define SAVE_REGS_STRING \ /* Skip cs, ip, orig_ax. 
*/ \ @@ -17,7 +26,7 @@ " pushq %r10\n" \ " pushq %r11\n" \ " pushq %rbx\n" \ - " pushq %rbp\n" \ + SAVE_RBP_STRING \ " pushq %r12\n" \ " pushq %r13\n" \ " pushq %r14\n" \ @@ -48,7 +57,7 @@ " pushl %es\n" \ " pushl %ds\n" \ " pushl %eax\n" \ - " pushl %ebp\n" \ + SAVE_RBP_STRING \ " pushl %edi\n" \ " pushl %esi\n" \ " pushl %edx\n" \ --- linux-azure-4.13.0.orig/arch/x86/kernel/kvm.c +++ linux-azure-4.13.0/arch/x86/kernel/kvm.c @@ -117,7 +117,11 @@ return NULL; } -void kvm_async_pf_task_wait(u32 token) +/* + * @interrupt_kernel: Is this called from a routine which interrupts the kernel + * (other than user space)? + */ +void kvm_async_pf_task_wait(u32 token, int interrupt_kernel) { u32 key = hash_32(token, KVM_TASK_SLEEP_HASHBITS); struct kvm_task_sleep_head *b = &async_pf_sleepers[key]; @@ -140,7 +144,10 @@ n.token = token; n.cpu = smp_processor_id(); - n.halted = is_idle_task(current) || preempt_count() > 1; + n.halted = is_idle_task(current) || + (IS_ENABLED(CONFIG_PREEMPT_COUNT) + ? preempt_count() > 1 || rcu_preempt_depth() + : interrupt_kernel); init_swait_queue_head(&n.wq); hlist_add_head(&n.link, &b->list); raw_spin_unlock(&b->lock); @@ -263,12 +270,12 @@ switch (kvm_read_and_reset_pf_reason()) { default: - trace_do_page_fault(regs, error_code); + do_page_fault(regs, error_code); break; case KVM_PV_REASON_PAGE_NOT_PRESENT: /* page is swapped out by the host. */ prev_state = exception_enter(); - kvm_async_pf_task_wait((u32)read_cr2()); + kvm_async_pf_task_wait((u32)read_cr2(), !user_mode(regs)); exception_exit(prev_state); break; case KVM_PV_REASON_PAGE_READY: @@ -537,12 +544,12 @@ return kvm_cpuid_base(); } -const struct hypervisor_x86 x86_hyper_kvm __refconst = { +const __initconst struct hypervisor_x86 x86_hyper_kvm = { .name = "KVM", .detect = kvm_detect, - .x2apic_available = kvm_para_available, + .type = X86_HYPER_KVM, + .init.x2apic_available = kvm_para_available, }; -EXPORT_SYMBOL_GPL(x86_hyper_kvm); static __init int activate_jump_labels(void) {
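/*
 * Reviewer note -- illustrative sketch, not part of the patch. The
 * kvm_async_pf_task_wait() hunk above files each waiter into a small
 * hash table of buckets keyed by hash_32(token, KVM_TASK_SLEEP_HASHBITS).
 * The multiplicative hash below mirrors the kernel's hash_32() (constant
 * GOLDEN_RATIO_32); the bucket-bit count and tokens here are made up for
 * the demo.
 */
#include <stdint.h>
#include <stdio.h>

#define SLEEP_HASHBITS 8 /* assumed; stands in for KVM_TASK_SLEEP_HASHBITS */

static uint32_t demo_hash_32(uint32_t val, unsigned int bits)
{
	/* Multiplicative hashing: multiply, then keep the top 'bits' bits. */
	return (val * 0x61C88647u) >> (32 - bits);
}

int main(void)
{
	uint32_t tokens[] = { 1, 2, 0xdeadbeef, 0xcafebabe };
	unsigned int i;

	for (i = 0; i < sizeof(tokens) / sizeof(tokens[0]); i++)
		printf("token %#10x -> bucket %u\n", tokens[i],
		       demo_hash_32(tokens[i], SLEEP_HASHBITS));
	return 0;
}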
--- linux-azure-4.13.0.orig/arch/x86/kernel/ldt.c +++ linux-azure-4.13.0/arch/x86/kernel/ldt.c @@ -4,6 +4,11 @@ * Copyright (C) 2002 Andi Kleen * * This handles calls from both 32bit and 64bit mode. + * + * Lock order: + * context.ldt_usr_sem + * mmap_sem + * context.lock */ #include @@ -12,26 +17,47 @@ #include #include #include +#include #include #include #include #include +#include #include #include #include -/* context.lock is held for us, so we don't need any locking. */ +static void refresh_ldt_segments(void) +{ +#ifdef CONFIG_X86_64 + unsigned short sel; + + /* + * Make sure that the cached DS and ES descriptors match the updated + * LDT. + */ + savesegment(ds, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(ds, sel); + + savesegment(es, sel); + if ((sel & SEGMENT_TI_MASK) == SEGMENT_LDT) + loadsegment(es, sel); +#endif +} + +/* context.lock is held by the task which issued the smp function call */ static void flush_ldt(void *__mm) { struct mm_struct *mm = __mm; - mm_context_t *pc; if (this_cpu_read(cpu_tlbstate.loaded_mm) != mm) return; - pc = &mm->context; - set_ldt(pc->ldt->entries, pc->ldt->nr_entries); + load_mm_ldt(mm); + + refresh_ldt_segments(); } /* The caller must call finalize_ldt_struct on the result. LDT starts zeroed. */ @@ -66,25 +92,143 @@ return NULL; } + /* The new LDT isn't aliased for PTI yet. */ + new_ldt->slot = -1; + new_ldt->nr_entries = num_entries; return new_ldt; } +/* + * If PTI is enabled, this maps the LDT into the kernelmode and + * usermode tables for the given mm. + * + * There is no corresponding unmap function. Even if the LDT is freed, we + * leave the PTEs around until the slot is reused or the mm is destroyed. + * This is harmless: the LDT is always in ordinary memory, and no one will + * access the freed slot. + * + * If we wanted to unmap freed LDTs, we'd also need to do a flush to make + * it useful, and the flush would slow down modify_ldt(). + */ +static int +map_ldt_struct(struct mm_struct *mm, struct ldt_struct *ldt, int slot) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + bool is_vmalloc, had_top_level_entry; + unsigned long va; + spinlock_t *ptl; + pgd_t *pgd; + int i; + + if (!static_cpu_has(X86_FEATURE_PTI)) + return 0; + + /* + * Any given ldt_struct should have map_ldt_struct() called at most + * once. + */ + WARN_ON(ldt->slot != -1); + + /* + * Did we already have the top level entry allocated? We can't + * use pgd_none() for this because it doesn't do anything on + * 4-level page table kernels. + */ + pgd = pgd_offset(mm, LDT_BASE_ADDR); + had_top_level_entry = (pgd->pgd != 0); + + is_vmalloc = is_vmalloc_addr(ldt->entries); + + for (i = 0; i * PAGE_SIZE < ldt->nr_entries * LDT_ENTRY_SIZE; i++) { + unsigned long offset = i << PAGE_SHIFT; + const void *src = (char *)ldt->entries + offset; + unsigned long pfn; + pte_t pte, *ptep; + + va = (unsigned long)ldt_slot_va(slot) + offset; + pfn = is_vmalloc ? vmalloc_to_pfn(src) : + page_to_pfn(virt_to_page(src)); + /* + * Treat the PTI LDT range as a *userspace* range. + * get_locked_pte() will allocate all needed pagetables + * and account for them in this mm. + */ + ptep = get_locked_pte(mm, va, &ptl); + if (!ptep) + return -ENOMEM; + /* + * Map it RO so the easy-to-find address is not a primary + * target via some kernel interface which misses a + * permission check. + */ + pte = pfn_pte(pfn, __pgprot(__PAGE_KERNEL_RO & ~_PAGE_GLOBAL)); + set_pte_at(mm, va, ptep, pte); + pte_unmap_unlock(ptep, ptl); + } + + if (mm->context.ldt) { + /* + * We already had an LDT. The top-level entry should already + * have been allocated and synchronized with the usermode + * tables. + */ + WARN_ON(!had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) + WARN_ON(!kernel_to_user_pgdp(pgd)->pgd); + } else { + /* + * This is the first time we're mapping an LDT for this process. + * Sync the pgd to the usermode tables. + */ + WARN_ON(had_top_level_entry); + if (static_cpu_has(X86_FEATURE_PTI)) { + WARN_ON(kernel_to_user_pgdp(pgd)->pgd); + set_pgd(kernel_to_user_pgdp(pgd), *pgd); + } + } + + va = (unsigned long)ldt_slot_va(slot); + flush_tlb_mm_range(mm, va, va + LDT_SLOT_STRIDE, 0); + + ldt->slot = slot; +#endif + return 0; +} + +static void free_ldt_pgtables(struct mm_struct *mm) +{ +#ifdef CONFIG_PAGE_TABLE_ISOLATION + struct mmu_gather tlb; + unsigned long start = LDT_BASE_ADDR; + unsigned long end = start + (1UL << PGDIR_SHIFT); + + if (!static_cpu_has(X86_FEATURE_PTI)) + return; + + tlb_gather_mmu(&tlb, mm, start, end); + free_pgd_range(&tlb, start, end, start, end); + tlb_finish_mmu(&tlb, start, end); +#endif +} + /* After calling this, the LDT is immutable. */ static void finalize_ldt_struct(struct ldt_struct *ldt) { paravirt_alloc_ldt(ldt->entries, ldt->nr_entries); } -/* context.lock is held */ -static void install_ldt(struct mm_struct *current_mm, - struct ldt_struct *ldt) +static void install_ldt(struct mm_struct *mm, struct ldt_struct *ldt) { - /* Synchronizes with lockless_dereference in load_mm_ldt. */ - smp_store_release(&current_mm->context.ldt, ldt); + mutex_lock(&mm->context.lock); + + /* Synchronizes with READ_ONCE in load_mm_ldt. */ + smp_store_release(&mm->context.ldt, ldt); - /* Activate the LDT for all CPUs using current_mm. */ - on_each_cpu_mask(mm_cpumask(current_mm), flush_ldt, current_mm, true); + /* Activate the LDT for all CPUs using current's mm. */ + on_each_cpu_mask(mm_cpumask(mm), flush_ldt, mm, true); + + mutex_unlock(&mm->context.lock); } static void free_ldt_struct(struct ldt_struct *ldt)
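/*
 * Reviewer note -- illustrative sketch, not part of the patch. install_ldt()
 * above publishes the new ldt_struct with smp_store_release() so a reader
 * doing READ_ONCE() in load_mm_ldt() never sees a pointer to a
 * half-initialized object. A user-space analogue with C11 atomics; acquire
 * is shown on the read side as the portable stand-in for READ_ONCE plus
 * address-dependency ordering. All names here are hypothetical.
 */
#include <stdatomic.h>
#include <stdio.h>

struct demo_ldt {
	int nr_entries;
	int entries[4];
};

static _Atomic(struct demo_ldt *) cur_ldt;

static void demo_install(struct demo_ldt *l)
{
	/* All prior initialization of *l is ordered before this publish. */
	atomic_store_explicit(&cur_ldt, l, memory_order_release);
}

static struct demo_ldt *demo_load(void)
{
	/* Pairs with the release store in demo_install(). */
	return atomic_load_explicit(&cur_ldt, memory_order_acquire);
}

int main(void)
{
	static struct demo_ldt l = { .nr_entries = 1, .entries = { 42 } };
	struct demo_ldt *p;

	demo_install(&l);
	p = demo_load();
	if (p)
		printf("%d entries, first descriptor %d\n",
		       p->nr_entries, p->entries[0]);
	return 0;
}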
@@ -101,27 +245,20 @@ } /* - * we do not have to muck with descriptors here, that is - * done in switch_mm() as needed. + * Called on fork from arch_dup_mmap(). Just copy the current LDT state; + * the new task is not running, so nothing can be installed. */ -int init_new_context_ldt(struct task_struct *tsk, struct mm_struct *mm) +int ldt_dup_context(struct mm_struct *old_mm, struct mm_struct *mm) { struct ldt_struct *new_ldt; - struct mm_struct *old_mm; int retval = 0; - mutex_init(&mm->context.lock); - old_mm = current->mm; - if (!old_mm) { - mm->context.ldt = NULL; + if (!old_mm) return 0; - } mutex_lock(&old_mm->context.lock); - if (!old_mm->context.ldt) { - mm->context.ldt = NULL; + if (!old_mm->context.ldt) goto out_unlock; - } new_ldt = alloc_ldt_struct(old_mm->context.ldt->nr_entries); if (!new_ldt) { @@ -133,6 +270,12 @@ new_ldt->nr_entries * LDT_ENTRY_SIZE); finalize_ldt_struct(new_ldt); + retval = map_ldt_struct(mm, new_ldt, 0); + if (retval) { + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } mm->context.ldt = new_ldt; out_unlock: @@ -151,13 +294,18 @@ mm->context.ldt = NULL; } +void ldt_arch_exit_mmap(struct mm_struct *mm) +{ + free_ldt_pgtables(mm); +} + static int read_ldt(void __user *ptr, unsigned long bytecount) { struct mm_struct *mm = current->mm; unsigned long entries_size; int retval; - mutex_lock(&mm->context.lock); + down_read(&mm->context.ldt_usr_sem); if (!mm->context.ldt) { retval = 0; @@ -186,7 +334,7 @@ retval = bytecount; out_unlock: - mutex_unlock(&mm->context.lock); + up_read(&mm->context.ldt_usr_sem); return retval; } @@ -246,7 +394,8 @@ ldt.avl = 0; } - mutex_lock(&mm->context.lock); + if (down_write_killable(&mm->context.ldt_usr_sem)) + return -EINTR; old_ldt = mm->context.ldt; old_nr_entries = old_ldt ? old_ldt->nr_entries : 0;
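/*
 * Reviewer note -- illustrative sketch, not part of the patch. The hunks
 * above move read_ldt() and write_ldt() from mm->context.lock (a mutex) to
 * the new ldt_usr_sem rwsem: readers take it shared, the writer takes it
 * exclusive via down_write_killable(). The same shape in user space with a
 * POSIX rwlock (POSIX has no "killable" acquire, so the checked wrlock is
 * only a loose stand-in). Build with -pthread; names are hypothetical.
 */
#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t ldt_usr_lock = PTHREAD_RWLOCK_INITIALIZER;
static int nr_entries; /* stands in for mm->context.ldt state */

static int demo_read_ldt(void)
{
	int n;

	pthread_rwlock_rdlock(&ldt_usr_lock);	/* like down_read() */
	n = nr_entries;
	pthread_rwlock_unlock(&ldt_usr_lock);	/* like up_read() */
	return n;
}

static int demo_write_ldt(int n)
{
	if (pthread_rwlock_wrlock(&ldt_usr_lock))	/* like down_write_killable() */
		return -1;
	nr_entries = n;
	pthread_rwlock_unlock(&ldt_usr_lock);	/* like up_write() */
	return 0;
}

int main(void)
{
	demo_write_ldt(3);
	printf("nr_entries = %d\n", demo_read_ldt());
	return 0;
}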
@@ -263,18 +412,37 @@ new_ldt->entries[ldt_info.entry_number] = ldt; finalize_ldt_struct(new_ldt); + /* + * If we are using PTI, map the new LDT into the userspace pagetables. + * If there is already an LDT, use the other slot so that other CPUs + * will continue to use the old LDT until install_ldt() switches + * them over to the new LDT. + */ + error = map_ldt_struct(mm, new_ldt, old_ldt ? !old_ldt->slot : 0); + if (error) { + /* + * This can only fail for the first LDT setup. If an LDT is + * already installed then the PTE page is already + * populated. Mop up a half populated page table. + */ + if (!WARN_ON_ONCE(old_ldt)) + free_ldt_pgtables(mm); + free_ldt_struct(new_ldt); + goto out_unlock; + } + install_ldt(mm, new_ldt); free_ldt_struct(old_ldt); error = 0; out_unlock: - mutex_unlock(&mm->context.lock); + up_write(&mm->context.ldt_usr_sem); out: return error; } -asmlinkage int sys_modify_ldt(int func, void __user *ptr, - unsigned long bytecount) +SYSCALL_DEFINE3(modify_ldt, int , func , void __user * , ptr , + unsigned long , bytecount) { int ret = -ENOSYS; @@ -292,5 +460,14 @@ ret = write_ldt(ptr, bytecount, 0); break; } - return ret; + /* + * The SYSCALL_DEFINE() macros give us an 'unsigned long' + * return type, but the ABI for sys_modify_ldt() expects + * 'int'. This cast gives us an int-sized value in %rax + * for the return code. The 'unsigned' is necessary so + * the compiler does not try to sign-extend the negative + * return codes into the high half of the register when + * taking the value from int->long. + */ + return (unsigned int)ret; } --- linux-azure-4.13.0.orig/arch/x86/kernel/module.c +++ linux-azure-4.13.0/arch/x86/kernel/module.c @@ -35,6 +35,7 @@ #include #include #include +#include #if 0 #define DEBUGP(fmt, ...) \ @@ -213,7 +214,7 @@ struct module *me) { const Elf_Shdr *s, *text = NULL, *alt = NULL, *locks = NULL, - *para = NULL; + *para = NULL, *orc = NULL, *orc_ip = NULL; char *secstrings = (void *)hdr + sechdrs[hdr->e_shstrndx].sh_offset; for (s = sechdrs; s < sechdrs + hdr->e_shnum; s++) { @@ -225,6 +226,10 @@ locks = s; if (!strcmp(".parainstructions", secstrings + s->sh_name)) para = s; + if (!strcmp(".orc_unwind", secstrings + s->sh_name)) + orc = s; + if (!strcmp(".orc_unwind_ip", secstrings + s->sh_name)) + orc_ip = s; } if (alt) { @@ -248,6 +253,10 @@ /* make jump label nops */ jump_label_apply_nops(me); + if (orc && orc_ip) + unwind_module_init(me, (void *)orc_ip->sh_addr, orc_ip->sh_size, + (void *)orc->sh_addr, orc->sh_size); + return 0; } --- linux-azure-4.13.0.orig/arch/x86/kernel/msr.c +++ linux-azure-4.13.0/arch/x86/kernel/msr.c @@ -84,6 +84,9 @@ int err = 0; ssize_t bytes = 0; + if (kernel_is_locked_down()) + return -EPERM; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -131,6 +134,10 @@ err = -EBADF; break; } + if (kernel_is_locked_down()) { + err = -EPERM; + break; + } if (copy_from_user(&regs, uregs, sizeof regs)) { err = -EFAULT; break;
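/*
 * Reviewer note -- illustrative sketch, not part of the patch. The
 * modify_ldt() hunk above returns (unsigned int)ret through the
 * 'long'-returning SYSCALL_DEFINE wrapper so that negative errno values
 * are zero-extended rather than sign-extended into the high half of %rax.
 * The difference, demonstrated in plain C on an LP64 target:
 */
#include <stdio.h>

static long ret_sign_extended(int err)
{
	return err;			/* -22 becomes 0xffffffffffffffea */
}

static long ret_int_sized(int err)
{
	return (unsigned int)err;	/* -22 becomes 0x00000000ffffffea */
}

int main(void)
{
	int err = -22; /* -EINVAL */

	printf("sign-extended: %#lx\n", ret_sign_extended(err));
	printf("int-sized:     %#lx\n", ret_int_sized(err));
	return 0;
}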
--- linux-azure-4.13.0.orig/arch/x86/kernel/paravirt.c +++ linux-azure-4.13.0/arch/x86/kernel/paravirt.c @@ -319,9 +319,6 @@ .irq_enable = __PV_IS_CALLEE_SAVE(native_irq_enable), .safe_halt = native_safe_halt, .halt = native_halt, -#ifdef CONFIG_X86_64 - .adjust_exception_frame = paravirt_nop, -#endif }; __visible struct pv_cpu_ops pv_cpu_ops = { @@ -330,7 +327,6 @@ .set_debugreg = native_set_debugreg, .read_cr0 = native_read_cr0, .write_cr0 = native_write_cr0, - .read_cr4 = native_read_cr4, .write_cr4 = native_write_cr4, #ifdef CONFIG_X86_64 .read_cr8 = native_read_cr8, @@ -346,7 +342,6 @@ .set_ldt = native_set_ldt, .load_gdt = native_load_gdt, .load_idt = native_load_idt, - .store_idt = native_store_idt, .store_tr = native_store_tr, .load_tls = native_load_tls, #ifdef CONFIG_X86_64 @@ -414,8 +409,6 @@ .set_pte = native_set_pte, .set_pte_at = native_set_pte_at, .set_pmd = native_set_pmd, - .set_pmd_at = native_set_pmd_at, - .pte_update = paravirt_nop, .ptep_modify_prot_start = __ptep_modify_prot_start, .ptep_modify_prot_commit = __ptep_modify_prot_commit, @@ -427,7 +420,6 @@ .pmd_clear = native_pmd_clear, #endif .set_pud = native_set_pud, - .set_pud_at = native_set_pud_at, .pmd_val = PTE_IDENT, .make_pmd = PTE_IDENT, --- linux-azure-4.13.0.orig/arch/x86/kernel/paravirt_patch_64.c +++ linux-azure-4.13.0/arch/x86/kernel/paravirt_patch_64.c @@ -9,7 +9,6 @@ DEF_NATIVE(pv_mmu_ops, read_cr2, "movq %cr2, %rax"); DEF_NATIVE(pv_mmu_ops, read_cr3, "movq %cr3, %rax"); DEF_NATIVE(pv_mmu_ops, write_cr3, "movq %rdi, %cr3"); -DEF_NATIVE(pv_mmu_ops, flush_tlb_single, "invlpg (%rdi)"); DEF_NATIVE(pv_cpu_ops, wbinvd, "wbinvd"); DEF_NATIVE(pv_cpu_ops, usergs_sysret64, "swapgs; sysretq"); @@ -59,7 +58,6 @@ PATCH_SITE(pv_mmu_ops, read_cr2); PATCH_SITE(pv_mmu_ops, read_cr3); PATCH_SITE(pv_mmu_ops, write_cr3); - PATCH_SITE(pv_mmu_ops, flush_tlb_single); PATCH_SITE(pv_cpu_ops, wbinvd); #if defined(CONFIG_PARAVIRT_SPINLOCKS) case PARAVIRT_PATCH(pv_lock_ops.queued_spin_unlock): --- linux-azure-4.13.0.orig/arch/x86/kernel/process.c +++ linux-azure-4.13.0/arch/x86/kernel/process.c @@ -46,9 +46,25 @@ * section. Since TSS's are completely CPU-local, we want them * on exact cacheline boundaries, to eliminate cacheline ping-pong. */ -__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tss_struct, cpu_tss) = { +__visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = { .x86_tss = { - .sp0 = TOP_OF_INIT_STACK, + /* + * .sp0 is only used when entering ring 0 from a lower + * privilege level. Since the init task never runs anything + * but ring 0 code, there is no need for a valid value here. + * Poison it. + */ + .sp0 = (1UL << (BITS_PER_LONG-1)) + 1, + +#ifdef CONFIG_X86_64 + /* + * .sp1 is cpu_current_top_of_stack. The init task never + * runs user code, but cpu_current_top_of_stack should still + * be well defined before the first context switch. + */ + .sp1 = TOP_OF_INIT_STACK, +#endif + #ifdef CONFIG_X86_32 .ss0 = __KERNEL_DS, .ss1 = __KERNEL_CS, @@ -64,11 +80,8 @@ */ .io_bitmap = { [0 ... IO_BITMAP_LONGS] = ~0 }, #endif -#ifdef CONFIG_X86_32 - .SYSENTER_stack_canary = STACK_END_MAGIC, -#endif }; -EXPORT_PER_CPU_SYMBOL(cpu_tss); +EXPORT_PER_CPU_SYMBOL(cpu_tss_rw); DEFINE_PER_CPU(bool, __tss_limit_invalid); EXPORT_PER_CPU_SYMBOL_GPL(__tss_limit_invalid); @@ -97,7 +110,7 @@ struct fpu *fpu = &t->fpu; if (bp) { - struct tss_struct *tss = &per_cpu(cpu_tss, get_cpu()); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, get_cpu()); t->io_bitmap_ptr = NULL; clear_thread_flag(TIF_IO_BITMAP); @@ -434,11 +447,19 @@ mb(); /* quirk */ } + if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, 0); + __monitor((void *)&current_thread_info()->flags, 0, 0); - if (!need_resched()) + if (!need_resched()) { __sti_mwait(0, 0); - else + if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + } else { + if (ibrs_inuse) + native_wrmsrl(MSR_IA32_SPEC_CTRL, FEATURE_ENABLE_IBRS); + local_irq_enable(); + } trace_cpu_idle_rcuidle(PWR_EVENT_EXIT, smp_processor_id()); } else { local_irq_enable(); --- linux-azure-4.13.0.orig/arch/x86/kernel/process_32.c +++ linux-azure-4.13.0/arch/x86/kernel/process_32.c @@ -56,7 +56,7 @@ #include #include #include -#include +#include #include void __show_regs(struct pt_regs *regs, int all) @@ -234,7 +234,7 @@ struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); /* never put a printk in __switch_to... printk() calls wake_up*() indirectly */ @@ -284,9 +284,11 @@ /* * Reload esp0 and cpu_current_top_of_stack. This changes - * current_thread_info().
Refresh the SYSENTER configuration in + * case prev or next is vm86. */ - load_sp0(tss, next); + update_sp0(next_p); + refresh_sysenter_cs(next); this_cpu_write(cpu_current_top_of_stack, (unsigned long)task_stack_page(next_p) + THREAD_SIZE); --- linux-azure-4.13.0.orig/arch/x86/kernel/process_64.c +++ linux-azure-4.13.0/arch/x86/kernel/process_64.c @@ -52,7 +52,7 @@ #include #include #include -#include +#include #include #ifdef CONFIG_IA32_EMULATION /* Not included via unistd.h */ @@ -69,10 +69,8 @@ unsigned int fsindex, gsindex; unsigned int ds, cs, es; - printk(KERN_DEFAULT "RIP: %04lx:%pS\n", regs->cs & 0xffff, - (void *)regs->ip); - printk(KERN_DEFAULT "RSP: %04lx:%016lx EFLAGS: %08lx", regs->ss, - regs->sp, regs->flags); + show_iret_regs(regs); + if (regs->orig_ax != -1) pr_cont(" ORIG_RAX: %016lx\n", regs->orig_ax); else @@ -89,6 +87,9 @@ printk(KERN_DEFAULT "R13: %016lx R14: %016lx R15: %016lx\n", regs->r13, regs->r14, regs->r15); + if (!all) + return; + asm("movl %%ds,%0" : "=r" (ds)); asm("movl %%cs,%0" : "=r" (cs)); asm("movl %%es,%0" : "=r" (es)); @@ -99,9 +100,6 @@ rdmsrl(MSR_GS_BASE, gs); rdmsrl(MSR_KERNEL_GS_BASE, shadowgs); - if (!all) - return; - cr0 = read_cr0(); cr2 = read_cr2(); cr3 = __read_cr3(); @@ -149,6 +147,123 @@ } } +enum which_selector { + FS, + GS +}; + +/* + * Saves the FS or GS base for an outgoing thread if FSGSBASE extensions are + * not available. The goal is to be reasonably fast on non-FSGSBASE systems. + * It's forcibly inlined because it'll generate better code and this function + * is hot. + */ +static __always_inline void save_base_legacy(struct task_struct *prev_p, + unsigned short selector, + enum which_selector which) +{ + if (likely(selector == 0)) { + /* + * On Intel (without X86_BUG_NULL_SEG), the segment base could + * be the pre-existing saved base or it could be zero. On AMD + * (with X86_BUG_NULL_SEG), the segment base could be almost + * anything. + * + * This branch is very hot (it's hit twice on almost every + * context switch between 64-bit programs), and avoiding + * the RDMSR helps a lot, so we just assume that whatever + * value is already saved is correct. This matches historical + * Linux behavior, so it won't break existing applications. + * + * To avoid leaking state, on non-X86_BUG_NULL_SEG CPUs, if we + * report that the base is zero, it needs to actually be zero: + * see the corresponding logic in load_seg_legacy. + */ + } else { + /* + * If the selector is 1, 2, or 3, then the base is zero on + * !X86_BUG_NULL_SEG CPUs and could be anything on + * X86_BUG_NULL_SEG CPUs. In the latter case, Linux + * has never attempted to preserve the base across context + * switches. + * + * If selector > 3, then it refers to a real segment, and + * saving the base isn't necessary. 
+ */ + if (which == FS) + prev_p->thread.fsbase = 0; + else + prev_p->thread.gsbase = 0; + } +} + +static __always_inline void save_fsgs(struct task_struct *task) +{ + savesegment(fs, task->thread.fsindex); + savesegment(gs, task->thread.gsindex); + save_base_legacy(task, task->thread.fsindex, FS); + save_base_legacy(task, task->thread.gsindex, GS); +} + +static __always_inline void loadseg(enum which_selector which, + unsigned short sel) +{ + if (which == FS) + loadsegment(fs, sel); + else + load_gs_index(sel); +} + +static __always_inline void load_seg_legacy(unsigned short prev_index, + unsigned long prev_base, + unsigned short next_index, + unsigned long next_base, + enum which_selector which) +{ + if (likely(next_index <= 3)) { + /* + * The next task is using 64-bit TLS, is not using this + * segment at all, or is having fun with arcane CPU features. + */ + if (next_base == 0) { + /* + * Nasty case: on AMD CPUs, we need to forcibly zero + * the base. + */ + if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { + loadseg(which, __USER_DS); + loadseg(which, next_index); + } else { + /* + * We could try to exhaustively detect cases + * under which we can skip the segment load, + * but there's really only one case that matters + * for performance: if both the previous and + * next states are fully zeroed, we can skip + * the load. + * + * (This assumes that prev_base == 0 has no + * false positives. This is the case on + * Intel-style CPUs.) + */ + if (likely(prev_index | next_index | prev_base)) + loadseg(which, next_index); + } + } else { + if (prev_index != next_index) + loadseg(which, next_index); + wrmsrl(which == FS ? MSR_FS_BASE : MSR_KERNEL_GS_BASE, + next_base); + } + } else { + /* + * The next task is using a real segment. Loading the selector + * is sufficient. + */ + loadseg(which, next_index); + } +} + int copy_thread_tls(unsigned long clone_flags, unsigned long sp, unsigned long arg, struct task_struct *p, unsigned long tls) { @@ -158,7 +273,6 @@ struct inactive_task_frame *frame; struct task_struct *me = current; - p->thread.sp0 = (unsigned long)task_stack_page(p) + THREAD_SIZE; childregs = task_pt_regs(p); fork_frame = container_of(childregs, struct fork_frame, regs); frame = &fork_frame->frame; @@ -229,10 +343,19 @@ unsigned long new_sp, unsigned int _cs, unsigned int _ss, unsigned int _ds) { + WARN_ON_ONCE(regs != current_pt_regs()); + + if (static_cpu_has(X86_BUG_NULL_SEG)) { + /* Loading zero below won't clear the base. */ + loadsegment(fs, __USER_DS); + load_gs_index(__USER_DS); + } + loadsegment(fs, 0); loadsegment(es, _ds); loadsegment(ds, _ds); load_gs_index(0); + regs->ip = new_ip; regs->sp = new_sp; regs->cs = _cs; @@ -276,8 +399,10 @@ struct fpu *prev_fpu = &prev->fpu; struct fpu *next_fpu = &next->fpu; int cpu = smp_processor_id(); - struct tss_struct *tss = &per_cpu(cpu_tss, cpu); - unsigned prev_fsindex, prev_gsindex; + struct tss_struct *tss = &per_cpu(cpu_tss_rw, cpu); + + WARN_ON_ONCE(IS_ENABLED(CONFIG_DEBUG_ENTRY) && + this_cpu_read(irq_count) != -1); switch_fpu_prepare(prev_fpu, cpu); @@ -286,8 +411,7 @@ * * (e.g. xen_load_tls()) */ - savesegment(fs, prev_fsindex); - savesegment(gs, prev_gsindex); + save_fsgs(prev_p); /* * Load TLS before restoring any segments so that segment loads @@ -326,108 +450,10 @@ if (unlikely(next->ds | prev->ds)) loadsegment(ds, next->ds); - /* - * Switch FS and GS. - * - * These are even more complicated than DS and ES: they have - * 64-bit bases are that controlled by arch_prctl. 
The bases - * don't necessarily match the selectors, as user code can do - * any number of things to cause them to be inconsistent. - * - * We don't promise to preserve the bases if the selectors are - * nonzero. We also don't promise to preserve the base if the - * selector is zero and the base doesn't match whatever was - * most recently passed to ARCH_SET_FS/GS. (If/when the - * FSGSBASE instructions are enabled, we'll need to offer - * stronger guarantees.) - * - * As an invariant, - * (fsbase != 0 && fsindex != 0) || (gsbase != 0 && gsindex != 0) is - * impossible. - */ - if (next->fsindex) { - /* Loading a nonzero value into FS sets the index and base. */ - loadsegment(fs, next->fsindex); - } else { - if (next->fsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_fsindex) - loadsegment(fs, 0); - wrmsrl(MSR_FS_BASE, next->fsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - */ - loadsegment(fs, __USER_DS); - loadsegment(fs, 0); - } else { - /* - * If the previous index is zero and ARCH_SET_FS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->fsbase || prev_fsindex) - loadsegment(fs, 0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_fsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_fsindex) - prev->fsbase = 0; - prev->fsindex = prev_fsindex; - - if (next->gsindex) { - /* Loading a nonzero value into GS sets the index and base. */ - load_gs_index(next->gsindex); - } else { - if (next->gsbase) { - /* Next index is zero but next base is nonzero. */ - if (prev_gsindex) - load_gs_index(0); - wrmsrl(MSR_KERNEL_GS_BASE, next->gsbase); - } else { - /* Next base and index are both zero. */ - if (static_cpu_has_bug(X86_BUG_NULL_SEG)) { - /* - * We don't know the previous base and can't - * find out without RDMSR. Forcibly clear it. - * - * This contains a pointless SWAPGS pair. - * Fixing it would involve an explicit check - * for Xen or a new pvop. - */ - load_gs_index(__USER_DS); - load_gs_index(0); - } else { - /* - * If the previous index is zero and ARCH_SET_GS - * didn't change the base, then the base is - * also zero and we don't need to do anything. - */ - if (prev->gsbase || prev_gsindex) - load_gs_index(0); - } - } - } - /* - * Save the old state and preserve the invariant. - * NB: if prev_gsindex == 0, then we can't reliably learn the base - * without RDMSR because Intel user code can zero it without telling - * us and AMD user code can program any 32-bit value without telling - * us. - */ - if (prev_gsindex) - prev->gsbase = 0; - prev->gsindex = prev_gsindex; + load_seg_legacy(prev->fsindex, prev->fsbase, + next->fsindex, next->fsbase, FS); + load_seg_legacy(prev->gsindex, prev->gsbase, + next->gsindex, next->gsbase, GS); switch_fpu_finish(next_fpu, cpu); @@ -435,9 +461,10 @@ * Switch the PDA and FPU contexts. */ this_cpu_write(current_task, next_p); + this_cpu_write(cpu_current_top_of_stack, task_top_of_stack(next_p)); - /* Reload esp0 and ss1. This changes current_thread_info(). */ - load_sp0(tss, next); + /* Reload sp0. 
*/ + update_sp0(next_p); /* * Now maybe reload the debug registers and handle I/O bitmaps --- linux-azure-4.13.0.orig/arch/x86/kernel/reboot.c +++ linux-azure-4.13.0/arch/x86/kernel/reboot.c @@ -31,6 +31,7 @@ #include #include #include +#include /* * Power off function, if any @@ -111,11 +112,11 @@ /* Jump to the identity-mapped low memory code */ #ifdef CONFIG_X86_32 - asm volatile("jmpl *%0" : : + asm volatile(ANNOTATE_RETPOLINE_SAFE "jmpl *%0" : : "rm" (real_mode_header->machine_real_restart_asm), "a" (type)); #else - asm volatile("ljmpl *%0" : : + asm volatile(ANNOTATE_RETPOLINE_SAFE "ljmpl *%0" : : "m" (real_mode_header->machine_real_restart_asm), "D" (type)); #endif @@ -454,7 +455,46 @@ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, - + { /* Handle problems with rebooting on the Latitude E6520. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6520"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 790. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 790", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 790"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6220. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6220", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6220"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 390. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 390", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 390"), + }, + }, { } }; --- linux-azure-4.13.0.orig/arch/x86/kernel/relocate_kernel_32.S +++ linux-azure-4.13.0/arch/x86/kernel/relocate_kernel_32.S @@ -10,6 +10,7 @@ #include #include #include +#include /* * Must be relocatable PIC code callable as a C function @@ -167,6 +168,7 @@ movl CP_PA_SWAP_PAGE(%edi), %esp addl $PAGE_SIZE, %esp 2: + ANNOTATE_RETPOLINE_SAFE call *%edx /* get the re-entry point of the peer system */ --- linux-azure-4.13.0.orig/arch/x86/kernel/relocate_kernel_64.S +++ linux-azure-4.13.0/arch/x86/kernel/relocate_kernel_64.S @@ -11,6 +11,7 @@ #include #include #include +#include /* * Must be relocatable PIC code callable as a C function @@ -172,6 +173,7 @@ 1: popq %rdx leaq PAGE_SIZE(%r10), %rsp + ANNOTATE_RETPOLINE_SAFE call *%rdx /* get the re-entry point of the peer system */ --- linux-azure-4.13.0.orig/arch/x86/kernel/setup.c +++ linux-azure-4.13.0/arch/x86/kernel/setup.c @@ -69,6 +69,12 @@ #include #include #include +#include + +#include +#include +#include +#include #include #include
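/*
 * Reviewer note -- illustrative sketch, not part of the patch. The reboot.c
 * hunk above grows the DMI quirk table: each entry pairs vendor/product
 * match strings with a callback (set_pci_reboot), and dmi_check_system()
 * invokes the callback of a matching entry. The same table-driven dispatch
 * shape in plain C, with hypothetical names and data:
 */
#include <stdio.h>
#include <string.h>

struct demo_quirk {
	const char *vendor;
	const char *product;
	void (*callback)(const struct demo_quirk *q);
};

static void demo_set_pci_reboot(const struct demo_quirk *q)
{
	printf("forcing PCI reboot method for %s %s\n", q->vendor, q->product);
}

static const struct demo_quirk reboot_quirks[] = {
	{ "Dell Inc.", "Latitude E6520", demo_set_pci_reboot },
	{ "Dell Inc.", "OptiPlex 790",   demo_set_pci_reboot },
	{ NULL, NULL, NULL } /* terminator, like the trailing { } entry */
};

static void demo_apply_quirks(const char *vendor, const char *product)
{
	const struct demo_quirk *q;

	for (q = reboot_quirks; q->vendor; q++) {
		if (!strcmp(q->vendor, vendor) && !strcmp(q->product, product)) {
			q->callback(q);
			return; /* first matching entry wins */
		}
	}
}

int main(void)
{
	demo_apply_quirks("Dell Inc.", "OptiPlex 790");
	return 0;
}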