--- linux-azure-4.11.0.orig/Documentation/ABI/testing/debugfs-aufs +++ linux-azure-4.11.0/Documentation/ABI/testing/debugfs-aufs @@ -0,0 +1,50 @@ +What: /debug/aufs/si_/ +Date: March 2009 +Contact: J. R. Okajima +Description: + Under /debug/aufs, a directory named si_ is created + per aufs mount, where is a unique id generated + internally. + +What: /debug/aufs/si_/plink +Date: Apr 2013 +Contact: J. R. Okajima +Description: + It has three lines and shows the information about the + pseudo-link. The first line is a single number + representing a number of buckets. The second line is a + number of pseudo-links per buckets (separated by a + blank). The last line is a single number representing a + total number of psedo-links. + When the aufs mount option 'noplink' is specified, it + will show "1\n0\n0\n". + +What: /debug/aufs/si_/xib +Date: March 2009 +Contact: J. R. Okajima +Description: + It shows the consumed blocks by xib (External Inode Number + Bitmap), its block size and file size. + When the aufs mount option 'noxino' is specified, it + will be empty. About XINO files, see the aufs manual. + +What: /debug/aufs/si_/xino0, xino1 ... xinoN +Date: March 2009 +Contact: J. R. Okajima +Description: + It shows the consumed blocks by xino (External Inode Number + Translation Table), its link count, block size and file + size. + When the aufs mount option 'noxino' is specified, it + will be empty. About XINO files, see the aufs manual. + +What: /debug/aufs/si_/xigen +Date: March 2009 +Contact: J. R. Okajima +Description: + It shows the consumed blocks by xigen (External Inode + Generation Table), its block size and file size. + If CONFIG_AUFS_EXPORT is disabled, this entry will not + be created. + When the aufs mount option 'noxino' is specified, it + will be empty. About XINO files, see the aufs manual. --- linux-azure-4.11.0.orig/Documentation/ABI/testing/sysfs-aufs +++ linux-azure-4.11.0/Documentation/ABI/testing/sysfs-aufs @@ -0,0 +1,31 @@ +What: /sys/fs/aufs/si_/ +Date: March 2009 +Contact: J. R. Okajima +Description: + Under /sys/fs/aufs, a directory named si_ is created + per aufs mount, where is a unique id generated + internally. + +What: /sys/fs/aufs/si_/br0, br1 ... brN +Date: March 2009 +Contact: J. R. Okajima +Description: + It shows the abolute path of a member directory (which + is called branch) in aufs, and its permission. + +What: /sys/fs/aufs/si_/brid0, brid1 ... bridN +Date: July 2013 +Contact: J. R. Okajima +Description: + It shows the id of a member directory (which is called + branch) in aufs. + +What: /sys/fs/aufs/si_/xi_path +Date: March 2009 +Contact: J. R. Okajima +Description: + It shows the abolute path of XINO (External Inode Number + Bitmap, Translation Table and Generation Table) file + even if it is the default path. + When the aufs mount option 'noxino' is specified, it + will be empty. About XINO files, see the aufs manual. --- linux-azure-4.11.0.orig/Documentation/acpi/acpi-lid.txt +++ linux-azure-4.11.0/Documentation/acpi/acpi-lid.txt @@ -59,20 +59,28 @@ If the userspace hasn't been prepared to ignore the unreliable "opened" events and the unreliable initial state notification, Linux users can use the following kernel parameters to handle the possible issues: -A. button.lid_init_state=open: +A. button.lid_init_state=method: + When this option is specified, the ACPI button driver reports the + initial lid state using the returning value of the _LID control method + and whether the "opened"/"closed" events are paired fully relies on the + firmware implementation. + This option can be used to fix some platforms where the returning value + of the _LID control method is reliable but the initial lid state + notification is missing. + This option is the default behavior during the period the userspace + isn't ready to handle the buggy AML tables. +B. button.lid_init_state=open: When this option is specified, the ACPI button driver always reports the initial lid state as "opened" and whether the "opened"/"closed" events are paired fully relies on the firmware implementation. This may fix some platforms where the returning value of the _LID control method is not reliable and the initial lid state notification is missing. - This option is the default behavior during the period the userspace - isn't ready to handle the buggy AML tables. If the userspace has been prepared to ignore the unreliable "opened" events and the unreliable initial state notification, Linux users should always use the following kernel parameter: -B. button.lid_init_state=ignore: +C. button.lid_init_state=ignore: When this option is specified, the ACPI button driver never reports the initial lid state and there is a compensation mechanism implemented to ensure that the reliable "closed" notifications can always be delievered --- linux-azure-4.11.0.orig/Documentation/admin-guide/kernel-parameters.txt +++ linux-azure-4.11.0/Documentation/admin-guide/kernel-parameters.txt @@ -650,6 +650,10 @@ /proc//coredump_filter. See also Documentation/filesystems/proc.txt. + cpufreq_driver= [X86] Allow only the named cpu frequency scaling driver + to register. Example: cpufreq_driver=powernow-k8 + Format: { none | STRING } + cpuidle.off=1 [CPU_IDLE] disable the cpuidle sub-system @@ -1644,6 +1648,12 @@ nobypass [PPC/POWERNV] Disable IOMMU bypass, using IOMMU for PCI devices. + iommu.passthrough= + [ARM64] Configure DMA to bypass the IOMMU by default. + Format: { "0" | "1" } + 0 - Use IOMMU translation for DMA. + 1 - Bypass the IOMMU for DMA. + unset - Use IOMMU translation for DMA. io7= [HW] IO7 for Marvel based alpha systems See comment before marvel_specify_io7 in @@ -1814,6 +1824,18 @@ for all guests. Default is 1 (enabled) if in 64-bit or 32-bit PAE mode. + kvm-arm.vgic_v3_group0_trap= + [KVM,ARM] Trap guest accesses to GICv3 group-0 + system registers + + kvm-arm.vgic_v3_group1_trap= + [KVM,ARM] Trap guest accesses to GICv3 group-1 + system registers + + kvm-arm.vgic_v3_common_trap= + [KVM,ARM] Trap guest accesses to GICv3 common + system registers + kvm-intel.ept= [KVM,Intel] Disable extended page tables (virtualized MMU) support on capable Intel chips. Default is 1 (enabled) @@ -3779,6 +3801,13 @@ spia_pedr= spia_peddr= + stack_guard_gap= [MM] + override the default stack gap protection. The value + is in page units and it defines how many pages prior + to (for stacks growing down) resp. after (for stacks + growing up) the main stack are reserved for no other + mapping. Default value is 256 pages. + stacktrace [FTRACE] Enabled the stack tracer on boot up. --- linux-azure-4.11.0.orig/Documentation/arm64/silicon-errata.txt +++ linux-azure-4.11.0/Documentation/arm64/silicon-errata.txt @@ -54,6 +54,7 @@ | ARM | Cortex-A57 | #852523 | N/A | | ARM | Cortex-A57 | #834220 | ARM64_ERRATUM_834220 | | ARM | Cortex-A72 | #853709 | N/A | +| ARM | Cortex-A73 | #858921 | ARM64_ERRATUM_858921 | | ARM | MMU-500 | #841119,#826419 | N/A | | | | | | | Cavium | ThunderX ITS | #22375, #24313 | CAVIUM_ERRATUM_22375 | @@ -61,6 +62,7 @@ | Cavium | ThunderX GICv3 | #23154 | CAVIUM_ERRATUM_23154 | | Cavium | ThunderX Core | #27456 | CAVIUM_ERRATUM_27456 | | Cavium | ThunderX SMMUv2 | #27704 | N/A | +| Cavium | ThunderX Core | #30115 | CAVIUM_ERRATUM_30115 | | | | | | | Freescale/NXP | LS2080A/LS1043A | A-008585 | FSL_ERRATUM_A008585 | | | | | | --- linux-azure-4.11.0.orig/Documentation/arm64/tagged-pointers.txt +++ linux-azure-4.11.0/Documentation/arm64/tagged-pointers.txt @@ -11,24 +11,56 @@ The kernel configures the translation tables so that translations made via TTBR0 (i.e. userspace mappings) have the top byte (bits 63:56) of the virtual address ignored by the translation hardware. This frees up -this byte for application use, with the following caveats: +this byte for application use. - (1) The kernel requires that all user addresses passed to EL1 - are tagged with tag 0x00. This means that any syscall - parameters containing user virtual addresses *must* have - their top byte cleared before trapping to the kernel. - - (2) Non-zero tags are not preserved when delivering signals. - This means that signal handlers in applications making use - of tags cannot rely on the tag information for user virtual - addresses being maintained for fields inside siginfo_t. - One exception to this rule is for signals raised in response - to watchpoint debug exceptions, where the tag information - will be preserved. - - (3) Special care should be taken when using tagged pointers, - since it is likely that C compilers will not hazard two - virtual addresses differing only in the upper byte. + +Passing tagged addresses to the kernel +-------------------------------------- + +All interpretation of userspace memory addresses by the kernel assumes +an address tag of 0x00. + +This includes, but is not limited to, addresses found in: + + - pointer arguments to system calls, including pointers in structures + passed to system calls, + + - the stack pointer (sp), e.g. when interpreting it to deliver a + signal, + + - the frame pointer (x29) and frame records, e.g. when interpreting + them to generate a backtrace or call graph. + +Using non-zero address tags in any of these locations may result in an +error code being returned, a (fatal) signal being raised, or other modes +of failure. + +For these reasons, passing non-zero address tags to the kernel via +system calls is forbidden, and using a non-zero address tag for sp is +strongly discouraged. + +Programs maintaining a frame pointer and frame records that use non-zero +address tags may suffer impaired or inaccurate debug and profiling +visibility. + + +Preserving tags +--------------- + +Non-zero tags are not preserved when delivering signals. This means that +signal handlers in applications making use of tags cannot rely on the +tag information for user virtual addresses being maintained for fields +inside siginfo_t. One exception to this rule is for signals raised in +response to watchpoint debug exceptions, where the tag information will +be preserved. The architecture prevents the use of a tagged PC, so the upper byte will be set to a sign-extension of bit 55 on exception return. + + +Other considerations +-------------------- + +Special care should be taken when using tagged pointers, since it is +likely that C compilers will not hazard two virtual addresses differing +only in the upper byte. --- linux-azure-4.11.0.orig/Documentation/cgroups/namespace.txt +++ linux-azure-4.11.0/Documentation/cgroups/namespace.txt @@ -0,0 +1,142 @@ + CGroup Namespaces + +CGroup Namespace provides a mechanism to virtualize the view of the +/proc//cgroup file. The CLONE_NEWCGROUP clone-flag can be used with +clone() and unshare() syscalls to create a new cgroup namespace. +The process running inside the cgroup namespace will have its /proc//cgroup +output restricted to cgroupns-root. cgroupns-root is the cgroup of the process +at the time of creation of the cgroup namespace. + +Prior to CGroup Namespace, the /proc//cgroup file used to show complete +path of the cgroup of a process. In a container setup (where a set of cgroups +and namespaces are intended to isolate processes), the /proc//cgroup file +may leak potential system level information to the isolated processes. + +For Example: + $ cat /proc/self/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1 + +The path '/batchjobs/container_id1' can generally be considered as system-data +and its desirable to not expose it to the isolated process. + +CGroup Namespaces can be used to restrict visibility of this path. +For Example: + # Before creating cgroup namespace + $ ls -l /proc/self/ns/cgroup + lrwxrwxrwx 1 root root 0 2014-07-15 10:37 /proc/self/ns/cgroup -> cgroup:[4026531835] + $ cat /proc/self/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1 + + # unshare(CLONE_NEWCGROUP) and exec /bin/bash + $ ~/unshare -c + [ns]$ ls -l /proc/self/ns/cgroup + lrwxrwxrwx 1 root root 0 2014-07-15 10:35 /proc/self/ns/cgroup -> cgroup:[4026532183] + # From within new cgroupns, process sees that its in the root cgroup + [ns]$ cat /proc/self/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/ + + # From global cgroupns: + $ cat /proc//cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1 + + # Unshare cgroupns along with userns and mountns + # Following calls unshare(CLONE_NEWCGROUP|CLONE_NEWUSER|CLONE_NEWNS), then + # sets up uid/gid map and execs /bin/bash + $ ~/unshare -c -u -m + # Originally, we were in /batchjobs/container_id1 cgroup. Mount our own cgroup + # hierarchy. + [ns]$ mount -t cgroup cgroup /tmp/cgroup + [ns]$ ls -l /tmp/cgroup + total 0 + -r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.controllers + -r--r--r-- 1 root root 0 2014-10-13 09:32 cgroup.populated + -rw-r--r-- 1 root root 0 2014-10-13 09:25 cgroup.procs + -rw-r--r-- 1 root root 0 2014-10-13 09:32 cgroup.subtree_control + +The cgroupns-root (/batchjobs/container_id1 in above example) becomes the +filesystem root for the namespace specific cgroupfs mount. + +The virtualization of /proc/self/cgroup file combined with restricting +the view of cgroup hierarchy by namespace-private cgroupfs mount +should provide a completely isolated cgroup view inside the container. + +In its current form, the cgroup namespaces patcheset provides following +behavior: + +(1) The 'cgroupns-root' for a cgroup namespace is the cgroup in which + the process calling unshare is running. + For ex. if a process in /batchjobs/container_id1 cgroup calls unshare, + cgroup /batchjobs/container_id1 becomes the cgroupns-root. + For the init_cgroup_ns, this is the real root ('/') cgroup + (identified in code as cgrp_dfl_root.cgrp). + +(2) The cgroupns-root cgroup does not change even if the namespace + creator process later moves to a different cgroup. + $ ~/unshare -c # unshare cgroupns in some cgroup + [ns]$ cat /proc/self/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/ + [ns]$ mkdir sub_cgrp_1 + [ns]$ echo 0 > sub_cgrp_1/cgroup.procs + [ns]$ cat /proc/self/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1 + +(3) Each process gets its CGROUPNS specific view of /proc//cgroup +(a) Processes running inside the cgroup namespace will be able to see + cgroup paths (in /proc/self/cgroup) only inside their root cgroup + [ns]$ sleep 100000 & # From within unshared cgroupns + [1] 7353 + [ns]$ echo 7353 > sub_cgrp_1/cgroup.procs + [ns]$ cat /proc/7353/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1 + +(b) From global cgroupns, the real cgroup path will be visible: + $ cat /proc/7353/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/batchjobs/container_id1/sub_cgrp_1 + +(c) From a sibling cgroupns (cgroupns root-ed at a different cgroup), cgroup + path relative to its own cgroupns-root will be shown: + # ns2's cgroupns-root is at '/batchjobs/container_id2' + [ns2]$ cat /proc/7353/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/../container_id2/sub_cgrp_1 + + Note that the relative path always starts with '/' to indicate that its + relative to the cgroupns-root of the caller. + +(4) Processes inside a cgroupns can move in-and-out of the cgroupns-root + (if they have proper access to external cgroups). + # From inside cgroupns (with cgroupns-root at /batchjobs/container_id1), and + # assuming that the global hierarchy is still accessible inside cgroupns: + $ cat /proc/7353/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/sub_cgrp_1 + $ echo 7353 > batchjobs/container_id2/cgroup.procs + $ cat /proc/7353/cgroup + 0:cpuset,cpu,cpuacct,memory,devices,freezer,hugetlb:/../container_id2 + + Note that this kind of setup is not encouraged. A task inside cgroupns + should only be exposed to its own cgroupns hierarchy. Otherwise it makes + the virtualization of /proc//cgroup less useful. + +(5) Setns to another cgroup namespace is allowed when: + (a) the process has CAP_SYS_ADMIN in its current userns + (b) the process has CAP_SYS_ADMIN in the target cgroupns' userns + No implicit cgroup changes happen with attaching to another cgroupns. It + is expected that the somone moves the attaching process under the target + cgroupns-root. + +(6) When some thread from a multi-threaded process unshares its + cgroup-namespace, the new cgroupns gets applied to the entire process (all + the threads). For the unified-hierarchy this is expected as it only allows + process-level containerization. For the legacy hierarchies this may be + unexpected. So all the threads in the process will have the same cgroup. + +(7) The cgroup namespace is alive as long as there is atleast 1 + process inside it. When the last process exits, the cgroup + namespace is destroyed. The cgroupns-root and the actual cgroups + remain though. + +(8) Namespace specific cgroup hierarchy can be mounted by a process running + inside cgroupns: + $ mount -t cgroup -o __DEVEL__sane_behavior cgroup $MOUNT_POINT + + This will mount the unified cgroup hierarchy with cgroupns-root as the + filesystem root. The process needs CAP_SYS_ADMIN in its userns and mntns. --- linux-azure-4.11.0.orig/Documentation/devicetree/bindings/arm/hisilicon/hisilicon-low-pin-count.txt +++ linux-azure-4.11.0/Documentation/devicetree/bindings/arm/hisilicon/hisilicon-low-pin-count.txt @@ -0,0 +1,33 @@ +Hisilicon Hip06 low-pin-count device + Hisilicon Hip06 SoCs implement a Low Pin Count (LPC) controller, which + provides I/O access to some legacy ISA devices. + Hip06 is based on arm64 architecture where there is no I/O space. So, the + I/O ports here are not cpu addresses, and there is no 'ranges' property in + LPC device node. + +Required properties: +- compatible: value should be as follows: + (a) "hisilicon,hip06-lpc" + (b) "hisilicon,hip07-lpc" +- #address-cells: must be 2 which stick to the ISA/EISA binding doc. +- #size-cells: must be 1 which stick to the ISA/EISA binding doc. +- reg: base memory range where the LPC register set is mapped. + +Note: + The node name before '@' must be "isa" to represent the binding stick to the + ISA/EISA binding specification. + +Example: + +isa@a01b0000 { + compatible = "hisilicon,hip06-lpc"; + #address-cells = <2>; + #size-cells = <1>; + reg = <0x0 0xa01b0000 0x0 0x1000>; + + ipmi0: bt@e4 { + compatible = "ipmi-bt"; + device_type = "ipmi"; + reg = <0x01 0xe4 0x04>; + }; +}; --- linux-azure-4.11.0.orig/Documentation/devicetree/bindings/chosen.txt +++ linux-azure-4.11.0/Documentation/devicetree/bindings/chosen.txt @@ -52,3 +52,48 @@ book3e) by some versions of kexec-tools to tell the new kernel that it is being booted by kexec, as the booting environment may differ (e.g. a different secondary CPU release mechanism) + +linux,usable-memory-range +------------------------- + +This property (arm64 only) holds a base address and size, describing a +limited region in which memory may be considered available for use by +the kernel. Memory outside of this range is not available for use. + +This property describes a limitation: memory within this range is only +valid when also described through another mechanism that the kernel +would otherwise use to determine available memory (e.g. memory nodes +or the EFI memory map). Valid memory may be sparse within the range. +e.g. + +/ { + chosen { + linux,usable-memory-range = <0x9 0xf0000000 0x0 0x10000000>; + }; +}; + +The main usage is for crash dump kernel to identify its own usable +memory and exclude, at its boot time, any other memory areas that are +part of the panicked kernel's memory. + +While this property does not represent a real hardware, the address +and the size are expressed in #address-cells and #size-cells, +respectively, of the root node. + +linux,elfcorehdr +---------------- + +This property (currently used only on arm64) holds the memory range, +the address and the size, of the elf core header which mainly describes +the panicked kernel's memory layout as PT_LOAD segments of elf format. +e.g. + +/ { + chosen { + linux,elfcorehdr = <0x9 0xfffff000 0x0 0x800>; + }; +}; + +While this property does not represent a real hardware, the address +and the size are expressed in #address-cells and #size-cells, +respectively, of the root node. --- linux-azure-4.11.0.orig/Documentation/devicetree/bindings/mfd/axp20x.txt +++ linux-azure-4.11.0/Documentation/devicetree/bindings/mfd/axp20x.txt @@ -28,6 +28,9 @@ regulator to drive the OTG VBus, rather then as an input pin which signals whether the board is driving OTG VBus or not. +- x-powers,master-mode: Boolean (axp806 only). Set this when the PMIC is + wired for master mode. The default is slave mode. + - -supply: a phandle to the regulator supply node. May be omitted if inputs are unregulated, such as using the IPSOUT output from the PMIC. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/README +++ linux-azure-4.11.0/Documentation/filesystems/aufs/README @@ -0,0 +1,393 @@ + +Aufs4 -- advanced multi layered unification filesystem version 4.x +http://aufs.sf.net +Junjiro R. Okajima + + +0. Introduction +---------------------------------------- +In the early days, aufs was entirely re-designed and re-implemented +Unionfs Version 1.x series. Adding many original ideas, approaches, +improvements and implementations, it becomes totally different from +Unionfs while keeping the basic features. +Recently, Unionfs Version 2.x series begin taking some of the same +approaches to aufs1's. +Unionfs is being developed by Professor Erez Zadok at Stony Brook +University and his team. + +Aufs4 supports linux-4.0 and later, and for linux-3.x series try aufs3. +If you want older kernel version support, try aufs2-2.6.git or +aufs2-standalone.git repository, aufs1 from CVS on SourceForge. + +Note: it becomes clear that "Aufs was rejected. Let's give it up." + According to Christoph Hellwig, linux rejects all union-type + filesystems but UnionMount. + + +PS. Al Viro seems have a plan to merge aufs as well as overlayfs and + UnionMount, and he pointed out an issue around a directory mutex + lock and aufs addressed it. But it is still unsure whether aufs will + be merged (or any other union solution). + + + +1. Features +---------------------------------------- +- unite several directories into a single virtual filesystem. The member + directory is called as a branch. +- you can specify the permission flags to the branch, which are 'readonly', + 'readwrite' and 'whiteout-able.' +- by upper writable branch, internal copyup and whiteout, files/dirs on + readonly branch are modifiable logically. +- dynamic branch manipulation, add, del. +- etc... + +Also there are many enhancements in aufs, such as: +- test only the highest one for the directory permission (dirperm1) +- copyup on open (coo=) +- 'move' policy for copy-up between two writable branches, after + checking free space. +- xattr, acl +- readdir(3) in userspace. +- keep inode number by external inode number table +- keep the timestamps of file/dir in internal copyup operation +- seekable directory, supporting NFS readdir. +- whiteout is hardlinked in order to reduce the consumption of inodes + on branch +- do not copyup, nor create a whiteout when it is unnecessary +- revert a single systemcall when an error occurs in aufs +- remount interface instead of ioctl +- maintain /etc/mtab by an external command, /sbin/mount.aufs. +- loopback mounted filesystem as a branch +- kernel thread for removing the dir who has a plenty of whiteouts +- support copyup sparse file (a file which has a 'hole' in it) +- default permission flags for branches +- selectable permission flags for ro branch, whether whiteout can + exist or not +- export via NFS. +- support /fs/aufs and /aufs. +- support multiple writable branches, some policies to select one + among multiple writable branches. +- a new semantics for link(2) and rename(2) to support multiple + writable branches. +- no glibc changes are required. +- pseudo hardlink (hardlink over branches) +- allow a direct access manually to a file on branch, e.g. bypassing aufs. + including NFS or remote filesystem branch. +- userspace wrapper for pathconf(3)/fpathconf(3) with _PC_LINK_MAX. +- and more... + +Currently these features are dropped temporary from aufs4. +See design/08plan.txt in detail. +- nested mount, i.e. aufs as readonly no-whiteout branch of another aufs + (robr) +- statistics of aufs thread (/sys/fs/aufs/stat) + +Features or just an idea in the future (see also design/*.txt), +- reorder the branch index without del/re-add. +- permanent xino files for NFSD +- an option for refreshing the opened files after add/del branches +- light version, without branch manipulation. (unnecessary?) +- copyup in userspace +- inotify in userspace +- readv/writev + + +2. Download +---------------------------------------- +There are three GIT trees for aufs4, aufs4-linux.git, +aufs4-standalone.git, and aufs-util.git. Note that there is no "4" in +"aufs-util.git." +While the aufs-util is always necessary, you need either of aufs4-linux +or aufs4-standalone. + +The aufs4-linux tree includes the whole linux mainline GIT tree, +git://git.kernel.org/.../torvalds/linux.git. +And you cannot select CONFIG_AUFS_FS=m for this version, eg. you cannot +build aufs4 as an external kernel module. +Several extra patches are not included in this tree. Only +aufs4-standalone tree contains them. They are described in the later +section "Configuration and Compilation." + +On the other hand, the aufs4-standalone tree has only aufs source files +and necessary patches, and you can select CONFIG_AUFS_FS=m. +But you need to apply all aufs patches manually. + +You will find GIT branches whose name is in form of "aufs4.x" where "x" +represents the linux kernel version, "linux-4.x". For instance, +"aufs4.0" is for linux-4.0. For latest "linux-4.x-rcN", use +"aufs4.x-rcN" branch. + +o aufs4-linux tree +$ git clone --reference /your/linux/git/tree \ + git://github.com/sfjro/aufs4-linux.git aufs4-linux.git +- if you don't have linux GIT tree, then remove "--reference ..." +$ cd aufs4-linux.git +$ git checkout origin/aufs4.0 + +Or You may want to directly git-pull aufs into your linux GIT tree, and +leave the patch-work to GIT. +$ cd /your/linux/git/tree +$ git remote add aufs4 git://github.com/sfjro/aufs4-linux.git +$ git fetch aufs4 +$ git checkout -b my4.0 v4.0 +$ (add your local change...) +$ git pull aufs4 aufs4.0 +- now you have v4.0 + your_changes + aufs4.0 in you my4.0 branch. +- you may need to solve some conflicts between your_changes and + aufs4.0. in this case, git-rerere is recommended so that you can + solve the similar conflicts automatically when you upgrade to 4.1 or + later in the future. + +o aufs4-standalone tree +$ git clone git://github.com/sfjro/aufs4-standalone.git aufs4-standalone.git +$ cd aufs4-standalone.git +$ git checkout origin/aufs4.0 + +o aufs-util tree +$ git clone git://git.code.sf.net/p/aufs/aufs-util aufs-util.git +- note that the public aufs-util.git is on SourceForge instead of + GitHUB. +$ cd aufs-util.git +$ git checkout origin/aufs4.0 + +Note: The 4.x-rcN branch is to be used with `rc' kernel versions ONLY. +The minor version number, 'x' in '4.x', of aufs may not always +follow the minor version number of the kernel. +Because changes in the kernel that cause the use of a new +minor version number do not always require changes to aufs-util. + +Since aufs-util has its own minor version number, you may not be +able to find a GIT branch in aufs-util for your kernel's +exact minor version number. +In this case, you should git-checkout the branch for the +nearest lower number. + +For (an unreleased) example: +If you are using "linux-4.10" and the "aufs4.10" branch +does not exist in aufs-util repository, then "aufs4.9", "aufs4.8" +or something numerically smaller is the branch for your kernel. + +Also you can view all branches by + $ git branch -a + + +3. Configuration and Compilation +---------------------------------------- +Make sure you have git-checkout'ed the correct branch. + +For aufs4-linux tree, +- enable CONFIG_AUFS_FS. +- set other aufs configurations if necessary. + +For aufs4-standalone tree, +There are several ways to build. + +1. +- apply ./aufs4-kbuild.patch to your kernel source files. +- apply ./aufs4-base.patch too. +- apply ./aufs4-mmap.patch too. +- apply ./aufs4-standalone.patch too, if you have a plan to set + CONFIG_AUFS_FS=m. otherwise you don't need ./aufs4-standalone.patch. +- copy ./{Documentation,fs,include/uapi/linux/aufs_type.h} files to your + kernel source tree. Never copy $PWD/include/uapi/linux/Kbuild. +- enable CONFIG_AUFS_FS, you can select either + =m or =y. +- and build your kernel as usual. +- install the built kernel. + Note: Since linux-3.9, every filesystem module requires an alias + "fs-". You should make sure that "fs-aufs" is listed in your + modules.aliases file if you set CONFIG_AUFS_FS=m. +- install the header files too by "make headers_install" to the + directory where you specify. By default, it is $PWD/usr. + "make help" shows a brief note for headers_install. +- and reboot your system. + +2. +- module only (CONFIG_AUFS_FS=m). +- apply ./aufs4-base.patch to your kernel source files. +- apply ./aufs4-mmap.patch too. +- apply ./aufs4-standalone.patch too. +- build your kernel, don't forget "make headers_install", and reboot. +- edit ./config.mk and set other aufs configurations if necessary. + Note: You should read $PWD/fs/aufs/Kconfig carefully which describes + every aufs configurations. +- build the module by simple "make". + Note: Since linux-3.9, every filesystem module requires an alias + "fs-". You should make sure that "fs-aufs" is listed in your + modules.aliases file. +- you can specify ${KDIR} make variable which points to your kernel + source tree. +- install the files + + run "make install" to install the aufs module, or copy the built + $PWD/aufs.ko to /lib/modules/... and run depmod -a (or reboot simply). + + run "make install_headers" (instead of headers_install) to install + the modified aufs header file (you can specify DESTDIR which is + available in aufs standalone version's Makefile only), or copy + $PWD/usr/include/linux/aufs_type.h to /usr/include/linux or wherever + you like manually. By default, the target directory is $PWD/usr. +- no need to apply aufs4-kbuild.patch, nor copying source files to your + kernel source tree. + +Note: The header file aufs_type.h is necessary to build aufs-util + as well as "make headers_install" in the kernel source tree. + headers_install is subject to be forgotten, but it is essentially + necessary, not only for building aufs-util. + You may not meet problems without headers_install in some older + version though. + +And then, +- read README in aufs-util, build and install it +- note that your distribution may contain an obsoleted version of + aufs_type.h in /usr/include/linux or something. When you build aufs + utilities, make sure that your compiler refers the correct aufs header + file which is built by "make headers_install." +- if you want to use readdir(3) in userspace or pathconf(3) wrapper, + then run "make install_ulib" too. And refer to the aufs manual in + detail. + +There several other patches in aufs4-standalone.git. They are all +optional. When you meet some problems, they will help you. +- aufs4-loopback.patch + Supports a nested loopback mount in a branch-fs. This patch is + unnecessary until aufs produces a message like "you may want to try + another patch for loopback file". +- vfs-ino.patch + Modifies a system global kernel internal function get_next_ino() in + order to stop assigning 0 for an inode-number. Not directly related to + aufs, but recommended generally. +- tmpfs-idr.patch + Keeps the tmpfs inode number as the lowest value. Effective to reduce + the size of aufs XINO files for tmpfs branch. Also it prevents the + duplication of inode number, which is important for backup tools and + other utilities. When you find aufs XINO files for tmpfs branch + growing too much, try this patch. +- lockdep-debug.patch + Because aufs is not only an ordinary filesystem (callee of VFS), but + also a caller of VFS functions for branch filesystems, subclassing of + the internal locks for LOCKDEP is necessary. LOCKDEP is a debugging + feature of linux kernel. If you enable CONFIG_LOCKDEP, then you will + need to apply this debug patch to expand several constant values. + If don't know what LOCKDEP, then you don't have apply this patch. + + +4. Usage +---------------------------------------- +At first, make sure aufs-util are installed, and please read the aufs +manual, aufs.5 in aufs-util.git tree. +$ man -l aufs.5 + +And then, +$ mkdir /tmp/rw /tmp/aufs +# mount -t aufs -o br=/tmp/rw:${HOME} none /tmp/aufs + +Here is another example. The result is equivalent. +# mount -t aufs -o br=/tmp/rw=rw:${HOME}=ro none /tmp/aufs + Or +# mount -t aufs -o br:/tmp/rw none /tmp/aufs +# mount -o remount,append:${HOME} /tmp/aufs + +Then, you can see whole tree of your home dir through /tmp/aufs. If +you modify a file under /tmp/aufs, the one on your home directory is +not affected, instead the same named file will be newly created under +/tmp/rw. And all of your modification to a file will be applied to +the one under /tmp/rw. This is called the file based Copy on Write +(COW) method. +Aufs mount options are described in aufs.5. +If you run chroot or something and make your aufs as a root directory, +then you need to customize the shutdown script. See the aufs manual in +detail. + +Additionally, there are some sample usages of aufs which are a +diskless system with network booting, and LiveCD over NFS. +See sample dir in CVS tree on SourceForge. + + +5. Contact +---------------------------------------- +When you have any problems or strange behaviour in aufs, please let me +know with: +- /proc/mounts (instead of the output of mount(8)) +- /sys/module/aufs/* +- /sys/fs/aufs/* (if you have them) +- /debug/aufs/* (if you have them) +- linux kernel version + if your kernel is not plain, for example modified by distributor, + the url where i can download its source is necessary too. +- aufs version which was printed at loading the module or booting the + system, instead of the date you downloaded. +- configuration (define/undefine CONFIG_AUFS_xxx) +- kernel configuration or /proc/config.gz (if you have it) +- behaviour which you think to be incorrect +- actual operation, reproducible one is better +- mailto: aufs-users at lists.sourceforge.net + +Usually, I don't watch the Public Areas(Bugs, Support Requests, Patches, +and Feature Requests) on SourceForge. Please join and write to +aufs-users ML. + + +6. Acknowledgements +---------------------------------------- +Thanks to everyone who have tried and are using aufs, whoever +have reported a bug or any feedback. + +Especially donators: +Tomas Matejicek(slax.org) made a donation (much more than once). + Since Apr 2010, Tomas M (the author of Slax and Linux Live + scripts) is making "doubling" donations. + Unfortunately I cannot list all of the donators, but I really + appreciate. + It ends Aug 2010, but the ordinary donation URL is still available. + +Dai Itasaka made a donation (2007/8). +Chuck Smith made a donation (2008/4, 10 and 12). +Henk Schoneveld made a donation (2008/9). +Chih-Wei Huang, ASUS, CTC donated Eee PC 4G (2008/10). +Francois Dupoux made a donation (2008/11). +Bruno Cesar Ribas and Luis Carlos Erpen de Bona, C3SL serves public + aufs2 GIT tree (2009/2). +William Grant made a donation (2009/3). +Patrick Lane made a donation (2009/4). +The Mail Archive (mail-archive.com) made donations (2009/5). +Nippy Networks (Ed Wildgoose) made a donation (2009/7). +New Dream Network, LLC (www.dreamhost.com) made a donation (2009/11). +Pavel Pronskiy made a donation (2011/2). +Iridium and Inmarsat satellite phone retailer (www.mailasail.com), Nippy + Networks (Ed Wildgoose) made a donation for hardware (2011/3). +Max Lekomcev (DOM-TV project) made a donation (2011/7, 12, 2012/3, 6 and +11). +Sam Liddicott made a donation (2011/9). +Era Scarecrow made a donation (2013/4). +Bor Ratajc made a donation (2013/4). +Alessandro Gorreta made a donation (2013/4). +POIRETTE Marc made a donation (2013/4). +Alessandro Gorreta made a donation (2013/4). +lauri kasvandik made a donation (2013/5). +"pemasu from Finland" made a donation (2013/7). +The Parted Magic Project made a donation (2013/9 and 11). +Pavel Barta made a donation (2013/10). +Nikolay Pertsev made a donation (2014/5). +James B made a donation (2014/7 and 2015/7). +Stefano Di Biase made a donation (2014/8). +Daniel Epellei made a donation (2015/1). +OmegaPhil made a donation (2016/1). +Tomasz Szewczyk made a donation (2016/4). +James Burry made a donation (2016/12). + +Thank you very much. +Donations are always, including future donations, very important and +helpful for me to keep on developing aufs. + + +7. +---------------------------------------- +If you are an experienced user, no explanation is needed. Aufs is +just a linux filesystem. + + +Enjoy! + +# Local variables: ; +# mode: text; +# End: ; --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/01intro.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/01intro.txt @@ -0,0 +1,170 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Introduction +---------------------------------------- + +aufs [ei ju: ef es] | [a u f s] +1. abbrev. for "advanced multi-layered unification filesystem". +2. abbrev. for "another unionfs". +3. abbrev. for "auf das" in German which means "on the" in English. + Ex. "Butter aufs Brot"(G) means "butter onto bread"(E). + But "Filesystem aufs Filesystem" is hard to understand. + +AUFS is a filesystem with features: +- multi layered stackable unification filesystem, the member directory + is called as a branch. +- branch permission and attribute, 'readonly', 'real-readonly', + 'readwrite', 'whiteout-able', 'link-able whiteout', etc. and their + combination. +- internal "file copy-on-write". +- logical deletion, whiteout. +- dynamic branch manipulation, adding, deleting and changing permission. +- allow bypassing aufs, user's direct branch access. +- external inode number translation table and bitmap which maintains the + persistent aufs inode number. +- seekable directory, including NFS readdir. +- file mapping, mmap and sharing pages. +- pseudo-link, hardlink over branches. +- loopback mounted filesystem as a branch. +- several policies to select one among multiple writable branches. +- revert a single systemcall when an error occurs in aufs. +- and more... + + +Multi Layered Stackable Unification Filesystem +---------------------------------------------------------------------- +Most people already knows what it is. +It is a filesystem which unifies several directories and provides a +merged single directory. When users access a file, the access will be +passed/re-directed/converted (sorry, I am not sure which English word is +correct) to the real file on the member filesystem. The member +filesystem is called 'lower filesystem' or 'branch' and has a mode +'readonly' and 'readwrite.' And the deletion for a file on the lower +readonly branch is handled by creating 'whiteout' on the upper writable +branch. + +On LKML, there have been discussions about UnionMount (Jan Blunck, +Bharata B Rao and Valerie Aurora) and Unionfs (Erez Zadok). They took +different approaches to implement the merged-view. +The former tries putting it into VFS, and the latter implements as a +separate filesystem. +(If I misunderstand about these implementations, please let me know and +I shall correct it. Because it is a long time ago when I read their +source files last time). + +UnionMount's approach will be able to small, but may be hard to share +branches between several UnionMount since the whiteout in it is +implemented in the inode on branch filesystem and always +shared. According to Bharata's post, readdir does not seems to be +finished yet. +There are several missing features known in this implementations such as +- for users, the inode number may change silently. eg. copy-up. +- link(2) may break by copy-up. +- read(2) may get an obsoleted filedata (fstat(2) too). +- fcntl(F_SETLK) may be broken by copy-up. +- unnecessary copy-up may happen, for example mmap(MAP_PRIVATE) after + open(O_RDWR). + +In linux-3.18, "overlay" filesystem (formerly known as "overlayfs") was +merged into mainline. This is another implementation of UnionMount as a +separated filesystem. All the limitations and known problems which +UnionMount are equally inherited to "overlay" filesystem. + +Unionfs has a longer history. When I started implementing a stackable +filesystem (Aug 2005), it already existed. It has virtual super_block, +inode, dentry and file objects and they have an array pointing lower +same kind objects. After contributing many patches for Unionfs, I +re-started my project AUFS (Jun 2006). + +In AUFS, the structure of filesystem resembles to Unionfs, but I +implemented my own ideas, approaches and enhancements and it became +totally different one. + +Comparing DM snapshot and fs based implementation +- the number of bytes to be copied between devices is much smaller. +- the type of filesystem must be one and only. +- the fs must be writable, no readonly fs, even for the lower original + device. so the compression fs will not be usable. but if we use + loopback mount, we may address this issue. + for instance, + mount /cdrom/squashfs.img /sq + losetup /sq/ext2.img + losetup /somewhere/cow + dmsetup "snapshot /dev/loop0 /dev/loop1 ..." +- it will be difficult (or needs more operations) to extract the + difference between the original device and COW. +- DM snapshot-merge may help a lot when users try merging. in the + fs-layer union, users will use rsync(1). + +You may want to read my old paper "Filesystems in LiveCD" +(http://aufs.sourceforge.net/aufs2/report/sq/sq.pdf). + + +Several characters/aspects/persona of aufs +---------------------------------------------------------------------- + +Aufs has several characters, aspects or persona. +1. a filesystem, callee of VFS helper +2. sub-VFS, caller of VFS helper for branches +3. a virtual filesystem which maintains persistent inode number +4. reader/writer of files on branches such like an application + +1. Callee of VFS Helper +As an ordinary linux filesystem, aufs is a callee of VFS. For instance, +unlink(2) from an application reaches sys_unlink() kernel function and +then vfs_unlink() is called. vfs_unlink() is one of VFS helper and it +calls filesystem specific unlink operation. Actually aufs implements the +unlink operation but it behaves like a redirector. + +2. Caller of VFS Helper for Branches +aufs_unlink() passes the unlink request to the branch filesystem as if +it were called from VFS. So the called unlink operation of the branch +filesystem acts as usual. As a caller of VFS helper, aufs should handle +every necessary pre/post operation for the branch filesystem. +- acquire the lock for the parent dir on a branch +- lookup in a branch +- revalidate dentry on a branch +- mnt_want_write() for a branch +- vfs_unlink() for a branch +- mnt_drop_write() for a branch +- release the lock on a branch + +3. Persistent Inode Number +One of the most important issue for a filesystem is to maintain inode +numbers. This is particularly important to support exporting a +filesystem via NFS. Aufs is a virtual filesystem which doesn't have a +backend block device for its own. But some storage is necessary to +keep and maintain the inode numbers. It may be a large space and may not +suit to keep in memory. Aufs rents some space from its first writable +branch filesystem (by default) and creates file(s) on it. These files +are created by aufs internally and removed soon (currently) keeping +opened. +Note: Because these files are removed, they are totally gone after + unmounting aufs. It means the inode numbers are not persistent + across unmount or reboot. I have a plan to make them really + persistent which will be important for aufs on NFS server. + +4. Read/Write Files Internally (copy-on-write) +Because a branch can be readonly, when you write a file on it, aufs will +"copy-up" it to the upper writable branch internally. And then write the +originally requested thing to the file. Generally kernel doesn't +open/read/write file actively. In aufs, even a single write may cause a +internal "file copy". This behaviour is very similar to cp(1) command. + +Some people may think it is better to pass such work to user space +helper, instead of doing in kernel space. Actually I am still thinking +about it. But currently I have implemented it in kernel space. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/02struct.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/02struct.txt @@ -0,0 +1,258 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Basic Aufs Internal Structure + +Superblock/Inode/Dentry/File Objects +---------------------------------------------------------------------- +As like an ordinary filesystem, aufs has its own +superblock/inode/dentry/file objects. All these objects have a +dynamically allocated array and store the same kind of pointers to the +lower filesystem, branch. +For example, when you build a union with one readwrite branch and one +readonly, mounted /au, /rw and /ro respectively. +- /au = /rw + /ro +- /ro/fileA exists but /rw/fileA + +Aufs lookup operation finds /ro/fileA and gets dentry for that. These +pointers are stored in a aufs dentry. The array in aufs dentry will be, +- [0] = NULL (because /rw/fileA doesn't exist) +- [1] = /ro/fileA + +This style of an array is essentially same to the aufs +superblock/inode/dentry/file objects. + +Because aufs supports manipulating branches, ie. add/delete/change +branches dynamically, these objects has its own generation. When +branches are changed, the generation in aufs superblock is +incremented. And a generation in other object are compared when it is +accessed. When a generation in other objects are obsoleted, aufs +refreshes the internal array. + + +Superblock +---------------------------------------------------------------------- +Additionally aufs superblock has some data for policies to select one +among multiple writable branches, XIB files, pseudo-links and kobject. +See below in detail. +About the policies which supports copy-down a directory, see +wbr_policy.txt too. + + +Branch and XINO(External Inode Number Translation Table) +---------------------------------------------------------------------- +Every branch has its own xino (external inode number translation table) +file. The xino file is created and unlinked by aufs internally. When two +members of a union exist on the same filesystem, they share the single +xino file. +The struct of a xino file is simple, just a sequence of aufs inode +numbers which is indexed by the lower inode number. +In the above sample, assume the inode number of /ro/fileA is i111 and +aufs assigns the inode number i999 for fileA. Then aufs writes 999 as +4(8) bytes at 111 * 4(8) bytes offset in the xino file. + +When the inode numbers are not contiguous, the xino file will be sparse +which has a hole in it and doesn't consume as much disk space as it +might appear. If your branch filesystem consumes disk space for such +holes, then you should specify 'xino=' option at mounting aufs. + +Aufs has a mount option to free the disk blocks for such holes in XINO +files on tmpfs or ramdisk. But it is not so effective actually. If you +meet a problem of disk shortage due to XINO files, then you should try +"tmpfs-ino.patch" (and "vfs-ino.patch" too) in aufs4-standalone.git. +The patch localizes the assignment inumbers per tmpfs-mount and avoid +the holes in XINO files. + +Also a writable branch has three kinds of "whiteout bases". All these +are existed when the branch is joined to aufs, and their names are +whiteout-ed doubly, so that users will never see their names in aufs +hierarchy. +1. a regular file which will be hardlinked to all whiteouts. +2. a directory to store a pseudo-link. +3. a directory to store an "orphan"-ed file temporary. + +1. Whiteout Base + When you remove a file on a readonly branch, aufs handles it as a + logical deletion and creates a whiteout on the upper writable branch + as a hardlink of this file in order not to consume inode on the + writable branch. +2. Pseudo-link Dir + See below, Pseudo-link. +3. Step-Parent Dir + When "fileC" exists on the lower readonly branch only and it is + opened and removed with its parent dir, and then user writes + something into it, then aufs copies-up fileC to this + directory. Because there is no other dir to store fileC. After + creating a file under this dir, the file is unlinked. + +Because aufs supports manipulating branches, ie. add/delete/change +dynamically, a branch has its own id. When the branch order changes, +aufs finds the new index by searching the branch id. + + +Pseudo-link +---------------------------------------------------------------------- +Assume "fileA" exists on the lower readonly branch only and it is +hardlinked to "fileB" on the branch. When you write something to fileA, +aufs copies-up it to the upper writable branch. Additionally aufs +creates a hardlink under the Pseudo-link Directory of the writable +branch. The inode of a pseudo-link is kept in aufs super_block as a +simple list. If fileB is read after unlinking fileA, aufs returns +filedata from the pseudo-link instead of the lower readonly +branch. Because the pseudo-link is based upon the inode, to keep the +inode number by xino (see above) is essentially necessary. + +All the hardlinks under the Pseudo-link Directory of the writable branch +should be restored in a proper location later. Aufs provides a utility +to do this. The userspace helpers executed at remounting and unmounting +aufs by default. +During this utility is running, it puts aufs into the pseudo-link +maintenance mode. In this mode, only the process which began the +maintenance mode (and its child processes) is allowed to operate in +aufs. Some other processes which are not related to the pseudo-link will +be allowed to run too, but the rest have to return an error or wait +until the maintenance mode ends. If a process already acquires an inode +mutex (in VFS), it has to return an error. + + +XIB(external inode number bitmap) +---------------------------------------------------------------------- +Addition to the xino file per a branch, aufs has an external inode number +bitmap in a superblock object. It is also an internal file such like a +xino file. +It is a simple bitmap to mark whether the aufs inode number is in-use or +not. +To reduce the file I/O, aufs prepares a single memory page to cache xib. + +As well as XINO files, aufs has a feature to truncate/refresh XIB to +reduce the number of consumed disk blocks for these files. + + +Virtual or Vertical Dir, and Readdir in Userspace +---------------------------------------------------------------------- +In order to support multiple layers (branches), aufs readdir operation +constructs a virtual dir block on memory. For readdir, aufs calls +vfs_readdir() internally for each dir on branches, merges their entries +with eliminating the whiteout-ed ones, and sets it to file (dir) +object. So the file object has its entry list until it is closed. The +entry list will be updated when the file position is zero and becomes +obsoleted. This decision is made in aufs automatically. + +The dynamically allocated memory block for the name of entries has a +unit of 512 bytes (by default) and stores the names contiguously (no +padding). Another block for each entry is handled by kmem_cache too. +During building dir blocks, aufs creates hash list and judging whether +the entry is whiteouted by its upper branch or already listed. +The merged result is cached in the corresponding inode object and +maintained by a customizable life-time option. + +Some people may call it can be a security hole or invite DoS attack +since the opened and once readdir-ed dir (file object) holds its entry +list and becomes a pressure for system memory. But I'd say it is similar +to files under /proc or /sys. The virtual files in them also holds a +memory page (generally) while they are opened. When an idea to reduce +memory for them is introduced, it will be applied to aufs too. +For those who really hate this situation, I've developed readdir(3) +library which operates this merging in userspace. You just need to set +LD_PRELOAD environment variable, and aufs will not consume no memory in +kernel space for readdir(3). + + +Workqueue +---------------------------------------------------------------------- +Aufs sometimes requires privilege access to a branch. For instance, +in copy-up/down operation. When a user process is going to make changes +to a file which exists in the lower readonly branch only, and the mode +of one of ancestor directories may not be writable by a user +process. Here aufs copy-up the file with its ancestors and they may +require privilege to set its owner/group/mode/etc. +This is a typical case of a application character of aufs (see +Introduction). + +Aufs uses workqueue synchronously for this case. It creates its own +workqueue. The workqueue is a kernel thread and has privilege. Aufs +passes the request to call mkdir or write (for example), and wait for +its completion. This approach solves a problem of a signal handler +simply. +If aufs didn't adopt the workqueue and changed the privilege of the +process, then the process may receive the unexpected SIGXFSZ or other +signals. + +Also aufs uses the system global workqueue ("events" kernel thread) too +for asynchronous tasks, such like handling inotify/fsnotify, re-creating a +whiteout base and etc. This is unrelated to a privilege. +Most of aufs operation tries acquiring a rw_semaphore for aufs +superblock at the beginning, at the same time waits for the completion +of all queued asynchronous tasks. + + +Whiteout +---------------------------------------------------------------------- +The whiteout in aufs is very similar to Unionfs's. That is represented +by its filename. UnionMount takes an approach of a file mode, but I am +afraid several utilities (find(1) or something) will have to support it. + +Basically the whiteout represents "logical deletion" which stops aufs to +lookup further, but also it represents "dir is opaque" which also stop +further lookup. + +In aufs, rmdir(2) and rename(2) for dir uses whiteout alternatively. +In order to make several functions in a single systemcall to be +revertible, aufs adopts an approach to rename a directory to a temporary +unique whiteouted name. +For example, in rename(2) dir where the target dir already existed, aufs +renames the target dir to a temporary unique whiteouted name before the +actual rename on a branch, and then handles other actions (make it opaque, +update the attributes, etc). If an error happens in these actions, aufs +simply renames the whiteouted name back and returns an error. If all are +succeeded, aufs registers a function to remove the whiteouted unique +temporary name completely and asynchronously to the system global +workqueue. + + +Copy-up +---------------------------------------------------------------------- +It is a well-known feature or concept. +When user modifies a file on a readonly branch, aufs operate "copy-up" +internally and makes change to the new file on the upper writable branch. +When the trigger systemcall does not update the timestamps of the parent +dir, aufs reverts it after copy-up. + + +Move-down (aufs3.9 and later) +---------------------------------------------------------------------- +"Copy-up" is one of the essential feature in aufs. It copies a file from +the lower readonly branch to the upper writable branch when a user +changes something about the file. +"Move-down" is an opposite action of copy-up. Basically this action is +ran manually instead of automatically and internally. +For desgin and implementation, aufs has to consider these issues. +- whiteout for the file may exist on the lower branch. +- ancestor directories may not exist on the lower branch. +- diropq for the ancestor directories may exist on the upper branch. +- free space on the lower branch will reduce. +- another access to the file may happen during moving-down, including + UDBA (see "Revalidate Dentry and UDBA"). +- the file should not be hard-linked nor pseudo-linked. they should be + handled by auplink utility later. + +Sometimes users want to move-down a file from the upper writable branch +to the lower readonly or writable branch. For instance, +- the free space of the upper writable branch is going to run out. +- create a new intermediate branch between the upper and lower branch. +- etc. + +For this purpose, use "aumvdown" command in aufs-util.git. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/03atomic_open.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/03atomic_open.txt @@ -0,0 +1,85 @@ + +# Copyright (C) 2015-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Support for a branch who has its ->atomic_open() +---------------------------------------------------------------------- +The filesystems who implement its ->atomic_open() are not majority. For +example NFSv4 does, and aufs should call NFSv4 ->atomic_open, +particularly for open(O_CREAT|O_EXCL, 0400) case. Other than +->atomic_open(), NFSv4 returns an error for this open(2). While I am not +sure whether all filesystems who have ->atomic_open() behave like this, +but NFSv4 surely returns the error. + +In order to support ->atomic_open() for aufs, there are a few +approaches. + +A. Introduce aufs_atomic_open() + - calls one of VFS:do_last(), lookup_open() or atomic_open() for + branch fs. +B. Introduce aufs_atomic_open() calling create, open and chmod. this is + an aufs user Pip Cet's approach + - calls aufs_create(), VFS finish_open() and notify_change(). + - pass fake-mode to finish_open(), and then correct the mode by + notify_change(). +C. Extend aufs_open() to call branch fs's ->atomic_open() + - no aufs_atomic_open(). + - aufs_lookup() registers the TID to an aufs internal object. + - aufs_create() does nothing when the matching TID is registered, but + registers the mode. + - aufs_open() calls branch fs's ->atomic_open() when the matching + TID is registered. +D. Extend aufs_open() to re-try branch fs's ->open() with superuser's + credential + - no aufs_atomic_open(). + - aufs_create() registers the TID to an internal object. this info + represents "this process created this file just now." + - when aufs gets EACCES from branch fs's ->open(), then confirm the + registered TID and re-try open() with superuser's credential. + +Pros and cons for each approach. + +A. + - straightforward but highly depends upon VFS internal. + - the atomic behavaiour is kept. + - some of parameters such as nameidata are hard to reproduce for + branch fs. + - large overhead. +B. + - easy to implement. + - the atomic behavaiour is lost. +C. + - the atomic behavaiour is kept. + - dirty and tricky. + - VFS checks whether the file is created correctly after calling + ->create(), which means this approach doesn't work. +D. + - easy to implement. + - the atomic behavaiour is lost. + - to open a file with superuser's credential and give it to a user + process is a bad idea, since the file object keeps the credential + in it. It may affect LSM or something. This approach doesn't work + either. + +The approach A is ideal, but it hard to implement. So here is a +variation of A, which is to be implemented. + +A-1. Introduce aufs_atomic_open() + - calls branch fs ->atomic_open() if exists. otherwise calls + vfs_create() and finish_open(). + - the demerit is that the several checks after branch fs + ->atomic_open() are lost. in the ordinary case, the checks are + done by VFS:do_last(), lookup_open() and atomic_open(). some can + be implemented in aufs, but not all I am afraid. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/03lookup.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/03lookup.txt @@ -0,0 +1,113 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Lookup in a Branch +---------------------------------------------------------------------- +Since aufs has a character of sub-VFS (see Introduction), it operates +lookup for branches as VFS does. It may be a heavy work. But almost all +lookup operation in aufs is the simplest case, ie. lookup only an entry +directly connected to its parent. Digging down the directory hierarchy +is unnecessary. VFS has a function lookup_one_len() for that use, and +aufs calls it. + +When a branch is a remote filesystem, aufs basically relies upon its +->d_revalidate(), also aufs forces the hardest revalidate tests for +them. +For d_revalidate, aufs implements three levels of revalidate tests. See +"Revalidate Dentry and UDBA" in detail. + + +Test Only the Highest One for the Directory Permission (dirperm1 option) +---------------------------------------------------------------------- +Let's try case study. +- aufs has two branches, upper readwrite and lower readonly. + /au = /rw + /ro +- "dirA" exists under /ro, but /rw. and its mode is 0700. +- user invoked "chmod a+rx /au/dirA" +- the internal copy-up is activated and "/rw/dirA" is created and its + permission bits are set to world readable. +- then "/au/dirA" becomes world readable? + +In this case, /ro/dirA is still 0700 since it exists in readonly branch, +or it may be a natively readonly filesystem. If aufs respects the lower +branch, it should not respond readdir request from other users. But user +allowed it by chmod. Should really aufs rejects showing the entries +under /ro/dirA? + +To be honest, I don't have a good solution for this case. So aufs +implements 'dirperm1' and 'nodirperm1' mount options, and leave it to +users. +When dirperm1 is specified, aufs checks only the highest one for the +directory permission, and shows the entries. Otherwise, as usual, checks +every dir existing on all branches and rejects the request. + +As a side effect, dirperm1 option improves the performance of aufs +because the number of permission check is reduced when the number of +branch is many. + + +Revalidate Dentry and UDBA (User's Direct Branch Access) +---------------------------------------------------------------------- +Generally VFS helpers re-validate a dentry as a part of lookup. +0. digging down the directory hierarchy. +1. lock the parent dir by its i_mutex. +2. lookup the final (child) entry. +3. revalidate it. +4. call the actual operation (create, unlink, etc.) +5. unlock the parent dir + +If the filesystem implements its ->d_revalidate() (step 3), then it is +called. Actually aufs implements it and checks the dentry on a branch is +still valid. +But it is not enough. Because aufs has to release the lock for the +parent dir on a branch at the end of ->lookup() (step 2) and +->d_revalidate() (step 3) while the i_mutex of the aufs dir is still +held by VFS. +If the file on a branch is changed directly, eg. bypassing aufs, after +aufs released the lock, then the subsequent operation may cause +something unpleasant result. + +This situation is a result of VFS architecture, ->lookup() and +->d_revalidate() is separated. But I never say it is wrong. It is a good +design from VFS's point of view. It is just not suitable for sub-VFS +character in aufs. + +Aufs supports such case by three level of revalidation which is +selectable by user. +1. Simple Revalidate + Addition to the native flow in VFS's, confirm the child-parent + relationship on the branch just after locking the parent dir on the + branch in the "actual operation" (step 4). When this validation + fails, aufs returns EBUSY. ->d_revalidate() (step 3) in aufs still + checks the validation of the dentry on branches. +2. Monitor Changes Internally by Inotify/Fsnotify + Addition to above, in the "actual operation" (step 4) aufs re-lookup + the dentry on the branch, and returns EBUSY if it finds different + dentry. + Additionally, aufs sets the inotify/fsnotify watch for every dir on branches + during it is in cache. When the event is notified, aufs registers a + function to kernel 'events' thread by schedule_work(). And the + function sets some special status to the cached aufs dentry and inode + private data. If they are not cached, then aufs has nothing to + do. When the same file is accessed through aufs (step 0-3) later, + aufs will detect the status and refresh all necessary data. + In this mode, aufs has to ignore the event which is fired by aufs + itself. +3. No Extra Validation + This is the simplest test and doesn't add any additional revalidation + test, and skip the revalidation in step 4. It is useful and improves + aufs performance when system surely hide the aufs branches from user, + by over-mounting something (or another method). --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/04branch.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/04branch.txt @@ -0,0 +1,74 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Branch Manipulation + +Since aufs supports dynamic branch manipulation, ie. add/remove a branch +and changing its permission/attribute, there are a lot of works to do. + + +Add a Branch +---------------------------------------------------------------------- +o Confirm the adding dir exists outside of aufs, including loopback + mount, and its various attributes. +o Initialize the xino file and whiteout bases if necessary. + See struct.txt. + +o Check the owner/group/mode of the directory + When the owner/group/mode of the adding directory differs from the + existing branch, aufs issues a warning because it may impose a + security risk. + For example, when a upper writable branch has a world writable empty + top directory, a malicious user can create any files on the writable + branch directly, like copy-up and modify manually. If something like + /etc/{passwd,shadow} exists on the lower readonly branch but the upper + writable branch, and the writable branch is world-writable, then a + malicious guy may create /etc/passwd on the writable branch directly + and the infected file will be valid in aufs. + I am afraid it can be a security issue, but aufs can do nothing except + producing a warning. + + +Delete a Branch +---------------------------------------------------------------------- +o Confirm the deleting branch is not busy + To be general, there is one merit to adopt "remount" interface to + manipulate branches. It is to discard caches. At deleting a branch, + aufs checks the still cached (and connected) dentries and inodes. If + there are any, then they are all in-use. An inode without its + corresponding dentry can be alive alone (for example, inotify/fsnotify case). + + For the cached one, aufs checks whether the same named entry exists on + other branches. + If the cached one is a directory, because aufs provides a merged view + to users, as long as one dir is left on any branch aufs can show the + dir to users. In this case, the branch can be removed from aufs. + Otherwise aufs rejects deleting the branch. + + If any file on the deleting branch is opened by aufs, then aufs + rejects deleting. + + +Modify the Permission of a Branch +---------------------------------------------------------------------- +o Re-initialize or remove the xino file and whiteout bases if necessary. + See struct.txt. + +o rw --> ro: Confirm the modifying branch is not busy + Aufs rejects the request if any of these conditions are true. + - a file on the branch is mmap-ed. + - a regular file on the branch is opened for write and there is no + same named entry on the upper branch. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/05wbr_policy.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/05wbr_policy.txt @@ -0,0 +1,64 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Policies to Select One among Multiple Writable Branches +---------------------------------------------------------------------- +When the number of writable branch is more than one, aufs has to decide +the target branch for file creation or copy-up. By default, the highest +writable branch which has the parent (or ancestor) dir of the target +file is chosen (top-down-parent policy). +By user's request, aufs implements some other policies to select the +writable branch, for file creation several policies, round-robin, +most-free-space, and other policies. For copy-up, top-down-parent, +bottom-up-parent, bottom-up and others. + +As expected, the round-robin policy selects the branch in circular. When +you have two writable branches and creates 10 new files, 5 files will be +created for each branch. mkdir(2) systemcall is an exception. When you +create 10 new directories, all will be created on the same branch. +And the most-free-space policy selects the one which has most free +space among the writable branches. The amount of free space will be +checked by aufs internally, and users can specify its time interval. + +The policies for copy-up is more simple, +top-down-parent is equivalent to the same named on in create policy, +bottom-up-parent selects the writable branch where the parent dir +exists and the nearest upper one from the copyup-source, +bottom-up selects the nearest upper writable branch from the +copyup-source, regardless the existence of the parent dir. + +There are some rules or exceptions to apply these policies. +- If there is a readonly branch above the policy-selected branch and + the parent dir is marked as opaque (a variation of whiteout), or the + target (creating) file is whiteout-ed on the upper readonly branch, + then the result of the policy is ignored and the target file will be + created on the nearest upper writable branch than the readonly branch. +- If there is a writable branch above the policy-selected branch and + the parent dir is marked as opaque or the target file is whiteouted + on the branch, then the result of the policy is ignored and the target + file will be created on the highest one among the upper writable + branches who has diropq or whiteout. In case of whiteout, aufs removes + it as usual. +- link(2) and rename(2) systemcalls are exceptions in every policy. + They try selecting the branch where the source exists as possible + since copyup a large file will take long time. If it can't be, + ie. the branch where the source exists is readonly, then they will + follow the copyup policy. +- There is an exception for rename(2) when the target exists. + If the rename target exists, aufs compares the index of the branches + where the source and the target exists and selects the higher + one. If the selected branch is readonly, then aufs follows the + copyup policy. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/06fhsm.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/06fhsm.txt @@ -0,0 +1,120 @@ + +# Copyright (C) 2011-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +File-based Hierarchical Storage Management (FHSM) +---------------------------------------------------------------------- +Hierarchical Storage Management (or HSM) is a well-known feature in the +storage world. Aufs provides this feature as file-based with multiple +writable branches, based upon the principle of "Colder, the Lower". +Here the word "colder" means that the less used files, and "lower" means +that the position in the order of the stacked branches vertically. +These multiple writable branches are prioritized, ie. the topmost one +should be the fastest drive and be used heavily. + +o Characters in aufs FHSM story +- aufs itself and a new branch attribute. +- a new ioctl interface to move-down and to establish a connection with + the daemon ("move-down" is a converse of "copy-up"). +- userspace tool and daemon. + +The userspace daemon establishes a connection with aufs and waits for +the notification. The notified information is very similar to struct +statfs containing the number of consumed blocks and inodes. +When the consumed blocks/inodes of a branch exceeds the user-specified +upper watermark, the daemon activates its move-down process until the +consumed blocks/inodes reaches the user-specified lower watermark. + +The actual move-down is done by aufs based upon the request from +user-space since we need to maintain the inode number and the internal +pointer arrays in aufs. + +Currently aufs FHSM handles the regular files only. Additionally they +must not be hard-linked nor pseudo-linked. + + +o Cowork of aufs and the user-space daemon + During the userspace daemon established the connection, aufs sends a + small notification to it whenever aufs writes something into the + writable branch. But it may cost high since aufs issues statfs(2) + internally. So user can specify a new option to cache the + info. Actually the notification is controlled by these factors. + + the specified cache time. + + classified as "force" by aufs internally. + Until the specified time expires, aufs doesn't send the info + except the forced cases. When aufs decide forcing, the info is always + notified to userspace. + For example, the number of free inodes is generally large enough and + the shortage of it happens rarely. So aufs doesn't force the + notification when creating a new file, directory and others. This is + the typical case which aufs doesn't force. + When aufs writes the actual filedata and the files consumes any of new + blocks, the aufs forces notifying. + + +o Interfaces in aufs +- New branch attribute. + + fhsm + Specifies that the branch is managed by FHSM feature. In other word, + participant in the FHSM. + When nofhsm is set to the branch, it will not be the source/target + branch of the move-down operation. This attribute is set + independently from coo and moo attributes, and if you want full + FHSM, you should specify them as well. +- New mount option. + + fhsm_sec + Specifies a second to suppress many less important info to be + notified. +- New ioctl. + + AUFS_CTL_FHSM_FD + create a new file descriptor which userspace can read the notification + (a subset of struct statfs) from aufs. +- Module parameter 'brs' + It has to be set to 1. Otherwise the new mount option 'fhsm' will not + be set. +- mount helpers /sbin/mount.aufs and /sbin/umount.aufs + When there are two or more branches with fhsm attributes, + /sbin/mount.aufs invokes the user-space daemon and /sbin/umount.aufs + terminates it. As a result of remounting and branch-manipulation, the + number of branches with fhsm attribute can be one. In this case, + /sbin/mount.aufs will terminate the user-space daemon. + + +Finally the operation is done as these steps in kernel-space. +- make sure that, + + no one else is using the file. + + the file is not hard-linked. + + the file is not pseudo-linked. + + the file is a regular file. + + the parent dir is not opaqued. +- find the target writable branch. +- make sure the file is not whiteout-ed by the upper (than the target) + branch. +- make the parent dir on the target branch. +- mutex lock the inode on the branch. +- unlink the whiteout on the target branch (if exists). +- lookup and create the whiteout-ed temporary name on the target branch. +- copy the file as the whiteout-ed temporary name on the target branch. +- rename the whiteout-ed temporary name to the original name. +- unlink the file on the source branch. +- maintain the internal pointer array and the external inode number + table (XINO). +- maintain the timestamps and other attributes of the parent dir and the + file. + +And of course, in every step, an error may happen. So the operation +should restore the original file state after an error happens. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/06mmap.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/06mmap.txt @@ -0,0 +1,72 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +mmap(2) -- File Memory Mapping +---------------------------------------------------------------------- +In aufs, the file-mapped pages are handled by a branch fs directly, no +interaction with aufs. It means aufs_mmap() calls the branch fs's +->mmap(). +This approach is simple and good, but there is one problem. +Under /proc, several entries show the mmapped files by its path (with +device and inode number), and the printed path will be the path on the +branch fs's instead of virtual aufs's. +This is not a problem in most cases, but some utilities lsof(1) (and its +user) may expect the path on aufs. + +To address this issue, aufs adds a new member called vm_prfile in struct +vm_area_struct (and struct vm_region). The original vm_file points to +the file on the branch fs in order to handle everything correctly as +usual. The new vm_prfile points to a virtual file in aufs, and the +show-functions in procfs refers to vm_prfile if it is set. +Also we need to maintain several other places where touching vm_file +such like +- fork()/clone() copies vma and the reference count of vm_file is + incremented. +- merging vma maintains the ref count too. + +This is not a good approach. It just fakes the printed path. But it +leaves all behaviour around f_mapping unchanged. This is surely an +advantage. +Actually aufs had adopted another complicated approach which calls +generic_file_mmap() and handles struct vm_operations_struct. In this +approach, aufs met a hard problem and I could not solve it without +switching the approach. + +There may be one more another approach which is +- bind-mount the branch-root onto the aufs-root internally +- grab the new vfsmount (ie. struct mount) +- lazy-umount the branch-root internally +- in open(2) the aufs-file, open the branch-file with the hidden + vfsmount (instead of the original branch's vfsmount) +- ideally this "bind-mount and lazy-umount" should be done atomically, + but it may be possible from userspace by the mount helper. + +Adding the internal hidden vfsmount and using it in opening a file, the +file path under /proc will be printed correctly. This approach looks +smarter, but is not possible I am afraid. +- aufs-root may be bind-mount later. when it happens, another hidden + vfsmount will be required. +- it is hard to get the chance to bind-mount and lazy-umount + + in kernel-space, FS can have vfsmount in open(2) via + file->f_path, and aufs can know its vfsmount. But several locks are + already acquired, and if aufs tries to bind-mount and lazy-umount + here, then it may cause a deadlock. + + in user-space, bind-mount doesn't invoke the mount helper. +- since /proc shows dev and ino, aufs has to give vma these info. it + means a new member vm_prinode will be necessary. this is essentially + equivalent to vm_prfile described above. + +I have to give up this "looks-smater" approach. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/06xattr.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/06xattr.txt @@ -0,0 +1,96 @@ + +# Copyright (C) 2014-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program; if not, write to the Free Software +# Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA + + +Listing XATTR/EA and getting the value +---------------------------------------------------------------------- +For the inode standard attributes (owner, group, timestamps, etc.), aufs +shows the values from the topmost existing file. This behaviour is good +for the non-dir entries since the bahaviour exactly matches the shown +information. But for the directories, aufs considers all the same named +entries on the lower branches. Which means, if one of the lower entry +rejects readdir call, then aufs returns an error even if the topmost +entry allows it. This behaviour is necessary to respect the branch fs's +security, but can make users confused since the user-visible standard +attributes don't match the behaviour. +To address this issue, aufs has a mount option called dirperm1 which +checks the permission for the topmost entry only, and ignores the lower +entry's permission. + +A similar issue can happen around XATTR. +getxattr(2) and listxattr(2) families behave as if dirperm1 option is +always set. Otherwise these very unpleasant situation would happen. +- listxattr(2) may return the duplicated entries. +- users may not be able to remove or reset the XATTR forever, + + +XATTR/EA support in the internal (copy,move)-(up,down) +---------------------------------------------------------------------- +Generally the extended attributes of inode are categorized as these. +- "security" for LSM and capability. +- "system" for posix ACL, 'acl' mount option is required for the branch + fs generally. +- "trusted" for userspace, CAP_SYS_ADMIN is required. +- "user" for userspace, 'user_xattr' mount option is required for the + branch fs generally. + +Moreover there are some other categories. Aufs handles these rather +unpopular categories as the ordinary ones, ie. there is no special +condition nor exception. + +In copy-up, the support for XATTR on the dst branch may differ from the +src branch. In this case, the copy-up operation will get an error and +the original user operation which triggered the copy-up will fail. It +can happen that even all copy-up will fail. +When both of src and dst branches support XATTR and if an error occurs +during copying XATTR, then the copy-up should fail obviously. That is a +good reason and aufs should return an error to userspace. But when only +the src branch support that XATTR, aufs should not return an error. +For example, the src branch supports ACL but the dst branch doesn't +because the dst branch may natively un-support it or temporary +un-support it due to "noacl" mount option. Of course, the dst branch fs +may NOT return an error even if the XATTR is not supported. It is +totally up to the branch fs. + +Anyway when the aufs internal copy-up gets an error from the dst branch +fs, then aufs tries removing the just copied entry and returns the error +to the userspace. The worst case of this situation will be all copy-up +will fail. + +For the copy-up operation, there two basic approaches. +- copy the specified XATTR only (by category above), and return the + error unconditionally if it happens. +- copy all XATTR, and ignore the error on the specified category only. + +In order to support XATTR and to implement the correct behaviour, aufs +chooses the latter approach and introduces some new branch attributes, +"icexsec", "icexsys", "icextr", "icexusr", and "icexoth". +They correspond to the XATTR namespaces (see above). Additionally, to be +convenient, "icex" is also provided which means all "icex*" attributes +are set (here the word "icex" stands for "ignore copy-error on XATTR"). + +The meaning of these attributes is to ignore the error from setting +XATTR on that branch. +Note that aufs tries copying all XATTR unconditionally, and ignores the +error from the dst branch according to the specified attributes. + +Some XATTR may have its default value. The default value may come from +the parent dir or the environment. If the default value is set at the +file creating-time, it will be overwritten by copy-up. +Some contradiction may happen I am afraid. +Do we need another attribute to stop copying XATTR? I am unsure. For +now, aufs implements the branch attributes to ignore the error. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/07export.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/07export.txt @@ -0,0 +1,58 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Export Aufs via NFS +---------------------------------------------------------------------- +Here is an approach. +- like xino/xib, add a new file 'xigen' which stores aufs inode + generation. +- iget_locked(): initialize aufs inode generation for a new inode, and + store it in xigen file. +- destroy_inode(): increment aufs inode generation and store it in xigen + file. it is necessary even if it is not unlinked, because any data of + inode may be changed by UDBA. +- encode_fh(): for a root dir, simply return FILEID_ROOT. otherwise + build file handle by + + branch id (4 bytes) + + superblock generation (4 bytes) + + inode number (4 or 8 bytes) + + parent dir inode number (4 or 8 bytes) + + inode generation (4 bytes)) + + return value of exportfs_encode_fh() for the parent on a branch (4 + bytes) + + file handle for a branch (by exportfs_encode_fh()) +- fh_to_dentry(): + + find the index of a branch from its id in handle, and check it is + still exist in aufs. + + 1st level: get the inode number from handle and search it in cache. + + 2nd level: if not found in cache, get the parent inode number from + the handle and search it in cache. and then open the found parent + dir, find the matching inode number by vfs_readdir() and get its + name, and call lookup_one_len() for the target dentry. + + 3rd level: if the parent dir is not cached, call + exportfs_decode_fh() for a branch and get the parent on a branch, + build a pathname of it, convert it a pathname in aufs, call + path_lookup(). now aufs gets a parent dir dentry, then handle it as + the 2nd level. + + to open the dir, aufs needs struct vfsmount. aufs keeps vfsmount + for every branch, but not itself. to get this, (currently) aufs + searches in current->nsproxy->mnt_ns list. it may not be a good + idea, but I didn't get other approach. + + test the generation of the gotten inode. +- every inode operation: they may get EBUSY due to UDBA. in this case, + convert it into ESTALE for NFSD. +- readdir(): call lockdep_on/off() because filldir in NFSD calls + lookup_one_len(), vfs_getattr(), encode_fh() and others. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/08shwh.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/08shwh.txt @@ -0,0 +1,52 @@ + +# Copyright (C) 2005-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Show Whiteout Mode (shwh) +---------------------------------------------------------------------- +Generally aufs hides the name of whiteouts. But in some cases, to show +them is very useful for users. For instance, creating a new middle layer +(branch) by merging existing layers. + +(borrowing aufs1 HOW-TO from a user, Michael Towers) +When you have three branches, +- Bottom: 'system', squashfs (underlying base system), read-only +- Middle: 'mods', squashfs, read-only +- Top: 'overlay', ram (tmpfs), read-write + +The top layer is loaded at boot time and saved at shutdown, to preserve +the changes made to the system during the session. +When larger changes have been made, or smaller changes have accumulated, +the size of the saved top layer data grows. At this point, it would be +nice to be able to merge the two overlay branches ('mods' and 'overlay') +and rewrite the 'mods' squashfs, clearing the top layer and thus +restoring save and load speed. + +This merging is simplified by the use of another aufs mount, of just the +two overlay branches using the 'shwh' option. +# mount -t aufs -o ro,shwh,br:/livesys/overlay=ro+wh:/livesys/mods=rr+wh \ + aufs /livesys/merge_union + +A merged view of these two branches is then available at +/livesys/merge_union, and the new feature is that the whiteouts are +visible! +Note that in 'shwh' mode the aufs mount must be 'ro', which will disable +writing to all branches. Also the default mode for all branches is 'ro'. +It is now possible to save the combined contents of the two overlay +branches to a new squashfs, e.g.: +# mksquashfs /livesys/merge_union /path/to/newmods.squash + +This new squashfs archive can be stored on the boot device and the +initramfs will use it to replace the old one at the next boot. --- linux-azure-4.11.0.orig/Documentation/filesystems/aufs/design/10dynop.txt +++ linux-azure-4.11.0/Documentation/filesystems/aufs/design/10dynop.txt @@ -0,0 +1,47 @@ + +# Copyright (C) 2010-2017 Junjiro R. Okajima +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation; either version 2 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +Dynamically customizable FS operations +---------------------------------------------------------------------- +Generally FS operations (struct inode_operations, struct +address_space_operations, struct file_operations, etc.) are defined as +"static const", but it never means that FS have only one set of +operation. Some FS have multiple sets of them. For instance, ext2 has +three sets, one for XIP, for NOBH, and for normal. +Since aufs overrides and redirects these operations, sometimes aufs has +to change its behaviour according to the branch FS type. More importantly +VFS acts differently if a function (member in the struct) is set or +not. It means aufs should have several sets of operations and select one +among them according to the branch FS definition. + +In order to solve this problem and not to affect the behaviour of VFS, +aufs defines these operations dynamically. For instance, aufs defines +dummy direct_IO function for struct address_space_operations, but it may +not be set to the address_space_operations actually. When the branch FS +doesn't have it, aufs doesn't set it to its address_space_operations +while the function definition itself is still alive. So the behaviour +itself will not change, and it will return an error when direct_IO is +not set. + +The lifetime of these dynamically generated operation object is +maintained by aufs branch object. When the branch is removed from aufs, +the reference counter of the object is decremented. When it reaches +zero, the dynamically generated operation object will be freed. + +This approach is designed to support AIO (io_submit), Direct I/O and +XIP (DAX) mainly. +Currently this approach is applied to address_space_operations for +regular files only. --- linux-azure-4.11.0.orig/Documentation/ioctl/ioctl-number.txt +++ linux-azure-4.11.0/Documentation/ioctl/ioctl-number.txt @@ -324,7 +324,7 @@ 0xB5 00-0F uapi/linux/rpmsg.h 0xC0 00-0F linux/usb/iowarrior.h 0xCA 00-0F uapi/misc/cxl.h -0xCA 80-8F uapi/scsi/cxlflash_ioctl.h +0xCA 80-BF uapi/scsi/cxlflash_ioctl.h 0xCB 00-1F CBM serial IEC bus in development: 0xCD 01 linux/reiserfs_fs.h --- linux-azure-4.11.0.orig/Documentation/kdump/kdump.txt +++ linux-azure-4.11.0/Documentation/kdump/kdump.txt @@ -18,7 +18,7 @@ a remote system. Kdump and kexec are currently supported on the x86, x86_64, ppc64, ia64, -s390x and arm architectures. +s390x, arm and arm64 architectures. When the system kernel boots, it reserves a small section of memory for the dump-capture kernel. This ensures that ongoing Direct Memory Access @@ -249,6 +249,13 @@ AUTO_ZRELADDR=y +Dump-capture kernel config options (Arch Dependent, arm64) +---------------------------------------------------------- + +- Please note that kvm of the dump-capture kernel will not be enabled + on non-VHE systems even if it is configured. This is because the CPU + will not be reset to EL2 on panic. + Extended crashkernel syntax =========================== @@ -305,6 +312,8 @@ kernel will automatically locate the crash kernel image within the first 512MB of RAM if X is not given. + On arm64, use "crashkernel=Y[@X]". Note that the start address of + the kernel, X if explicitly specified, must be aligned to 2MiB (0x200000). Load the Dump-capture Kernel ============================ @@ -327,6 +336,8 @@ - Use image or bzImage For arm: - Use zImage +For arm64: + - Use vmlinux or Image If you are using a uncompressed vmlinux image then use following command to load dump-capture kernel. @@ -370,6 +381,9 @@ For arm: "1 maxcpus=1 reset_devices" +For arm64: + "1 maxcpus=1 reset_devices" + Notes on loading the dump-capture kernel: * By default, the ELF headers are stored in ELF64 format to support --- linux-azure-4.11.0.orig/Documentation/kmsg/IPVS +++ linux-azure-4.11.0/Documentation/kmsg/IPVS @@ -0,0 +1,81 @@ +/*? Text: "%s(): NULL arg\n" */ +/*? Text: "%s(): NULL scheduler_name\n" */ +/*? Text: "%s(): [%s] pe already existed in the system\n" */ +/*? Text: "%s(): [%s] pe already linked\n" */ +/*? Text: "%s(): [%s] pe is not in the list. failed\n" */ +/*? Text: "%s(): [%s] scheduler already existed in the system\n" */ +/*? Text: "%s(): [%s] scheduler already linked\n" */ +/*? Text: "%s(): [%s] scheduler is not in the list. failed\n" */ +/*? Text: "%s(): done error\n" */ +/*? Text: "%s(): init error\n" */ +/*? Text: "%s(): lower threshold is higher than upper threshold\n" */ +/*? Text: "%s(): no memory\n" */ +/*? Text: "%s(): request for already hashed, called from %pF\n" */ +/*? Text: "%s(): request for unhash flagged, called from %pF\n" */ +/*? Text: "%s(): server weight less than zero\n" */ +/*? Text: "%s: %s %pI4:%d - %s\n" */ +/*? Text: "%s: %s [%pI6]:%d - %s\n" */ +/*? Text: "%s: %s [%pI6c]:%d - %s\n" */ +/*? Text: "%s: FWM %u 0x%08X - %s\n" */ +/*? Text: "%s: enter\n" */ +/*? Text: "%s: loaded support on port[%d] = %d\n" */ +/*? Text: "BACKUP v0, Dropping buffer bogus conn options\n" */ +/*? Text: "BACKUP v0, bogus conn\n" */ +/*? Text: "BACKUP, Dropping buffer, Err: %d in decoding\n" */ +/*? Text: "BACKUP, Dropping buffer, Unknown version %d\n" */ +/*? Text: "BACKUP, Dropping buffer, msg > buffer\n" */ +/*? Text: "BACKUP, Dropping buffer, to small\n" */ +/*? Text: "BACKUP, Invalid PE parameters\n" */ +/*? Text: "BUG control DEL with n=0 : %s:%d to %s:%d\n" */ +/*? Text: "Connection hash table configured (size=%d, memory=%ldKbytes)\n" */ +/*? Text: "Error binding address of the mcast interface\n" */ +/*? Text: "Error binding to the multicast addr\n" */ +/*? Text: "Error connecting to the multicast addr\n" */ +/*? Text: "Error during creation of socket; terminating\n" */ +/*? Text: "Error joining to the multicast group\n" */ +/*? Text: "Error setting outbound mcast interface\n" */ +/*? Text: "Failed to stop Backup Daemon\n" */ +/*? Text: "Failed to stop Master Daemon\n" */ +/*? Text: "Registered protocols (%s)\n" */ +/*? Text: "SYNC, connection pe_data invalid\n" */ +/*? Text: "Schedule: port zero only supported in persistent services, check your ipvs configuration\n" */ +/*? Text: "Scheduler module ip_vs_%s not found\n" */ +/*? Text: "There is no net ptr to find in the skb in %s() line:%d\n" */ +/*? Text: "UDP no ns data\n" */ +/*? Text: "You probably need to specify IP address on multicast interface.\n" */ +/*? Text: "[%s] pe registered.\n" */ +/*? Text: "[%s] pe unregistered.\n" */ +/*? Text: "[%s] scheduler registered.\n" */ +/*? Text: "[%s] scheduler unregistered.\n" */ +/*? Text: "can't register hooks.\n" */ +/*? Text: "can't register netlink/ioctl.\n" */ +/*? Text: "can't setup connection table.\n" */ +/*? Text: "can't setup control.\n" */ +/*? Text: "cannot register Generic Netlink interface.\n" */ +/*? Text: "cannot register sockopt.\n" */ +/*? Text: "get_ctl: len %u < %u\n" */ +/*? Text: "ip_vs_send_async error %d\n" */ +/*? Text: "ip_vs_sync_buff_create failed.\n" */ +/*? Text: "ipvs loaded.\n" */ +/*? Text: "ipvs unloaded.\n" */ +/*? Text: "length: %u != %u\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "not enough space in Netlink message\n" */ +/*? Text: "persistence engine module ip_vs_pe_%s not found\n" */ +/*? Text: "receiving message error\n" */ +/*? Text: "request control ADD for already controlled: %s:%d to %s:%d\n" */ +/*? Text: "request control DEL for uncontrolled: %s:%d to %s:%d\n" */ +/*? Text: "set_ctl: invalid protocol: %d %pI4:%d %s\n" */ +/*? Text: "set_ctl: len %u != %u\n" */ +/*? Text: "shouldn't reach here, because the box is on the half connection in the tun/dr module.\n" */ +/*? Text: "stopping backup sync thread %d ...\n" */ +/*? Text: "stopping master sync thread %d ...\n" */ +/*? Text: "sync thread started: state = BACKUP, mcast_ifn = %s, syncid = %d\n" */ +/*? Text: "sync thread started: state = MASTER, mcast_ifn = %s, syncid = %d\n" */ +/*? Text: "unknown Generic Netlink command\n" */ +/*? Text: "sync thread started: state = MASTER, mcast_ifn = %s, syncid = %d, id = %d\n" */ +/*? Text: "sync thread started: state = BACKUP, mcast_ifn = %s, syncid = %d, id = %d\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "Unknown mcast interface: %s\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/aes_s390 +++ linux-azure-4.11.0/Documentation/kmsg/s390/aes_s390 @@ -0,0 +1,45 @@ +/*? + * Text: "Allocating XTS fallback algorithm %s failed\n" + * Severity: Error + * Parameter: + * @1: algorithm name + * Description: + * The aes_s390 module failed to allocate a software fallback for the AES + * modes that are not supported by the hardware. A possible reason for this + * problem is that the aes_generic module that provides the fallback + * algorithms is not available. + * User action: + * Ensure that the aes_generic module is available and loaded and reload + * the aes_s390 module. + */ + +/*? + * Text: "Allocating AES fallback algorithm %s failed\n" + * Severity: Error + * Parameter: + * @1: algorithm name + * Description: + * The advanced encryption standard (AES) algorithm includes three modes with + * 128-bit, 192-bit, and 256-bit keys. Your hardware system only provides + * hardware acceleration for the 128-bit mode. The aes_s390 module failed to + * allocate a software fallback for the AES modes that are not supported by the + * hardware. A possible reason for this problem is that the aes_generic module + * that provides the fallback algorithms is not available. + * User action: + * Use the 128-bit mode only or ensure that the aes_generic module is available + * and loaded and reload the aes_s390 module. + */ + +/*? + * Text: "AES hardware acceleration is only available for 128-bit keys\n" + * Severity: Informational + * Description: + * The advanced encryption standard (AES) algorithm includes three modes with + * 128-bit, 192-bit, and 256-bit keys. Your hardware system only provides + * hardware acceleration for the 128-bit key mode. The aes_s390 module + * will use the less performant software fallback algorithm for the 192-bit + * and 256-bit key modes. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/af_iucv +++ linux-azure-4.11.0/Documentation/kmsg/s390/af_iucv @@ -0,0 +1,23 @@ +/*? + * Text: "Application %s on z/VM guest %s exceeds message limit\n" + * Severity: Error + * Parameter: + * @1: application name + * @2: z/VM user ID + * Description: + * Messages or packets destined for the application have accumulated and + * reached the maximum value. The default for the message limit is 65535. + * You can specify a different limit as the value for MSGLIMIT within + * the IUCV statement of the z/VM virtual machine on which the application + * runs. + * User action: + * Ensure that you do not send data faster than the application retrieves + * them. Ensure that the message limit on the z/VM guest virtual machine + * on which the application runs is high enough. + */ + +/*? Text: "Attempt to release alive iucv socket %p\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/ap +++ linux-azure-4.11.0/Documentation/kmsg/s390/ap @@ -0,0 +1,49 @@ +/*? + * Text: "%d is not a valid cryptographic domain\n" + * Severity: Warning + * Parameter: + * @1: AP domain index + * Description: + * The cryptographic domain specified for the 'domain=' module or kernel + * parameter must be an integer in the range 0 to 15. + * User action: + * Reload the cryptographic device driver with a correct module parameter. + * If the device driver has been compiled into the kernel, correct the value + * in the kernel parameter line and reboot Linux. + */ + +/*? + * Text: "The hardware system does not support AP instructions\n" + * Severity: Warning + * Description: + * The ap module addresses AP adapters through AP instructions. The hardware + * system on which the Linux instance runs does not support AP instructions. + * The ap module cannot detect any AP adapters. + * User action: + * Load the ap module only if your Linux instance runs on hardware that + * supports AP instructions. If the ap module has been compiled into the kernel, + * ignore this message. + */ + +/*? + * Text: "Registering adapter interrupts for AP device %02x.%04x failed\n" + * Severity: Error + * Parameter: + * @1: AP device ID + * @2: AP queue + * Description: + * The hardware system supports AP adapter interrupts but failed to enable + * an adapter for interrupts. Possible causes for this error are: + * i) The AP adapter firmware does not support AP interrupts. + * ii) An AP adapter firmware update to a firmware level that supports AP + * adapter interrupts failed. + * iii) The AP adapter firmware has been successfully updated to a level that + * supports AP interrupts but the new firmware has not been activated. + * User action: + * Ensure that the firmware on your AP adapters support AP interrupts and that + * any firmware updates have completed successfully. If necessary, deconfigure + * your cryptographic adapters and reconfigure them to ensure that any firmware + * updates become active, then reload the ap module. If the ap module has been + * compiled into the kernel, reboot Linux. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/appldata +++ linux-azure-4.11.0/Documentation/kmsg/s390/appldata @@ -0,0 +1,91 @@ +/*? + * Text: "Starting the data collection for %s failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: appldata module + * @2: return code + * Description: + * The specified data collection module used the z/VM diagnose call + * DIAG 0xDC to start writing data. z/VM returned an error and the data + * collection could not start. If the return code is 5, your z/VM guest + * virtual machine is not authorized to write data records. + * User action: + * If the return code is 5, ensure that your z/VM guest virtual machine's + * entry in the z/VM directory includes the OPTION APPLMON statement. + * For other return codes see the section about DIAGNOSE Code X'DC' + * in "z/VM CP Programming Services". + */ + +/*? + * Text: "Stopping the data collection for %s failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: appldata module + * @2: return code + * Description: + * The specified data collection module used the z/VM diagnose call DIAG 0xDC + * to stop writing data. z/VM returned an error and the data collection + * continues. + * User action: + * See the section about DIAGNOSE Code X'DC' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Starting a new OS data collection failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * After a CPU hotplug event, the record size for the running operating + * system data collection is no longer correct. The appldata_os module tried + * to start a new data collection with the correct record size but received + * an error from the z/VM diagnose call DIAG 0xDC. Any data collected with + * the current record size might be faulty. + * User action: + * Start a new data collection with the cappldata_os module. For information + * about starting data collections see "Device Drivers, Features, and + * Commands". For information about the return codes see the section about + * DIAGNOSE Code X'DC' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Stopping a faulty OS data collection failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * After a CPU hotplug event, the record size for the running operating + * system data collection is no longer correct. The appldata_os module tried + * to stop the faulty data collection but received an error from the z/VM + * diagnose call DIAG 0xDC. Any data collected with the current record size + * might be faulty. + * User action: + * Try to restart appldata_os monitoring. For information about stopping + * and starting data collections see "Device Drivers, Features, and + * Commands". For information about the return codes see the section about + * DIAGNOSE Code X'DC' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Maximum OS record size %i exceeds the maximum record size %i\n" + * Severity: Error + * Parameter: + * @1: no of bytes + * @2: no of bytes + * Description: + * The OS record size grows with the number of CPUs and is adjusted by the + * appldata_os module in response to CPU hotplug events. For more than 110 + * CPUs the record size would exceed the maximum record size of 4024 bytes + * that is supported by the z/VM hypervisor. To prevent the maximum supported + * record size from being exceeded while data collection is in progress, + * you cannot load the appldata_os module on Linux instances that are + * configured for a maximum of more than 110 CPUs. + * User action: + * If you do not want to collect operating system data, you can ignore this + * message. If you want to collect operating system data, reconfigure your + * Linux instance to support less than 110 CPUs. + */ + +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/bpf_jit +++ linux-azure-4.11.0/Documentation/kmsg/s390/bpf_jit @@ -0,0 +1,16 @@ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ + +/*? + * Text: "Unknown opcode %02x\n" + * Severity: Error + * Parameter: + * @1: Instruction opcode + * Description: + * The BPF JIT compiler has found an unknown instruction in the BPF program + * and therefore stops the compilation. As a fallback, the interpreter is used. + * User action: + * Report this problem and the error message to your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/cio +++ linux-azure-4.11.0/Documentation/kmsg/s390/cio @@ -0,0 +1,247 @@ +/*? + * Text: "%s is not a valid device for the cio_ignore kernel parameter\n" + * Severity: Warning + * Parameter: + * @1: device bus-ID + * Description: + * The device specification for the cio_ignore kernel parameter is + * syntactically incorrect or specifies an unknown device. This device is not + * excluded from being sensed and analyzed. + * User action: + * Correct your device specification in the kernel parameter line to have the + * device excluded when you next reboot Linux. You can write the correct + * device specification to /proc/cio_ignore to add the device to the list of + * devices to be excluded. This does not immediately make the device + * inaccessible but the device is ignored if it disappears and later reappears. + */ + +/*? + * Text: "0.%x.%04x to 0.%x.%04x is not a valid range for cio_ignore\n" + * Severity: Warning + * Parameter: + * @1: from subchannel set ID + * @2: from device number + * @3: to subchannel set ID + * @4: to device number + * Description: + * The device range specified for the cio_ignore kernel parameter is + * syntactically incorrect. No devices specified with this range are + * excluded from being sensed and analyzed. + * User action: + * Correct your range specification in the kernel parameter line to have the + * range of devices excluded when you next reboot Linux. You can write the + * correct range specification to /proc/cio_ignore to add the range of devices + * to the list of devices to be excluded. This does not immediately make the + * devices in the range inaccessible but any of these devices are ignored if + * they disappear and later reappear. + */ + +/*? + * Text: "Processing %s for channel path %x.%02x\n" + * Severity: Notice + * Parameter: + * @1: configuration change + * @2: channel subsystem ID + * @3: CHPID + * Description: + * A configuration change is in progress for the given channel path. + * User action: + * None. + */ + +/*? + * Text: "No CCW console was found\n" + * Severity: Warning + * Description: + * Linux did not find the expected CCW console and tries to use an alternative + * console. A possible reason why the console was not found is that the console + * has been specified in the cio_ignore list. + * User action: + * None, if an appropriate alternative console has been found, and you want + * to use this alternative console. If you want to use the CCW console, ensure + * that is not specified in the cio_ignore list, explicitly specify the console + * with the 'condev=' kernel parameter, and reboot Linux. + */ + +/*? + * Text: "Channel measurement facility initialized using format %s (mode %s)\n" + * Severity: Informational + * Parameter: + * @1: format + * @2: mode + * Description: + * The channel measurement facility has been initialized successfully. + * Format 'extended' should be used for z990 and later mainframe systems. + * Format 'basic' is intended for earlier mainframes. Mode 'autodetected' means + * that the format has been set automatically. Mode 'parameter' means that the + * format has been set according to the 'format=' kernel parameter. + * User action: + * None. + */ + +/*? + * Text: "The CSS device driver initialization failed with errno=%d\n" + * Severity: Alert + * Parameter: + * @1: Return code + * Description: + * The channel subsystem bus could not be established. + * User action: + * See the errno man page to find out what caused the problem. + */ + /*? Text: "%s: Got subchannel machine check but no sch_event handler provided.\n" */ + +/*? + * Text: "%s: Setting the device online failed because it is boxed\n" + * Severity: Warning + * Parameter: + * @1: Device bus-ID + * Description: + * Initialization of a device did not complete because it did not respond in + * time or it was reserved by another operating system. + * User action: + * Make sure that the device is working correctly, then try again to set it + * online. For devices that support the reserve/release mechanism (for example + * DASDs), you can try to override the reservation of the other system by + * writing 'force' to the 'online' sysfs attribute of the affected device. + */ + +/*? + * Text: "%s: Setting the device online failed because it is not operational\n" + * Severity: Warning + * Parameter: + * @1: Device bus-ID + * Description: + * Initialization of a device did not complete because it is not present or + * not operational. + * User action: + * Make sure that the device is present and working correctly, then try again + * to set it online. + */ + +/*? + * Text: "%s: The device stopped operating while being set offline\n" + * Severity: Warning + * Parameter: + * @1: Device bus-ID + * Description: + * While the device was set offline, it was not present or not operational. + * The device is now inactive, but setting it online again might fail. + * User action: + * None. + */ + +/*? + * Text: "%s: The device entered boxed state while being set offline\n" + * Severity: Warning + * Parameter: + * @1: Device bus-ID + * Description: + * While the device was set offline, it did not respond in time or it was + * reserved by another operating system. The device is now inactive, but + * setting it online again might fail. + * User action: + * None. + */ + +/*? + * Text: "Logging for subchannel 0.%x.%04x failed with errno=%d\n" + * Severity: Warning + * Parameter: + * @1: subchannel set ID + * @2: subchannel number + * @3: errno + * Description: + * Capturing model-dependent logs and traces could not be triggered for the + * specified subchannel. + * User action: + * See the errno man page to find out what caused the problem. + */ + +/*? + * Text: "Logging for subchannel 0.%x.%04x was triggered\n" + * Severity: Notice + * Parameter: + * @1: subchannel set ID + * @2: subchannel number + * Description: + * Model-dependent logs and traces may be captured for the specified + * subchannel. + * User action: + * None. + */ + +/*? + * Text: "%s: No interrupt was received within %lus (CS=%02x, DS=%02x, CHPID=%x.%02x)\n" + * Severity: Warning + * Parameter: + * @1: device number + * @2: timeout value + * @3: channel status + * @4: device status + * @5: channel subsystem ID + * @6: CHPID + * Description: + * Internal I/Os are used by the common I/O layer to ensure that devices are + * operational and accessible. + * The common I/O layer did not receive an interrupt for an internal I/O + * during the specified timeout period. + * As a result, the device might assume a state that makes the device + * unusable to Linux until the problem is resolved. + * User action: + * Make sure that the device is working correctly and try the action again. + */ + +/*? + * Text: "Link stopped: RS=%02x RSID=%04x IC=%02x IUPARAMS=%s IUNODEID=%s AUPARAMS=%s AUNODEID=%s\n" + * Severity: Error + * Parameter: + * @1: reporting source + * @2: reporting source ID + * @3: incident code + * @4: incident unit parameters + * @5: incident unit node ID + * @6: attached unit parameters + * @7: attached unit node ID + * + * Description: + * A hardware error has occurred. A unit at one end of an interface + * link has detected a failure in the link or in one of the units attached to + * the link. As a result, data transfer across the link has stopped. In the + * message text, the node IDs of involved units are represented in the + * following format: TTTTTT/MDL,MMM.PPSSSSSSSSSSSS,XXXX where TTTTTT refers to + * the machine type, MDL the model number, MMM the manufacturer, PP the + * manufacturing plant, SSSSSSSSSSSS the unit sequence number and XXXX the + * machine type-dependent physical interface number. If no data is available + * for the unit parameters or node ID field, "n/a" is used instead. + * + * User action: + * Report the problem to your support organization. + */ + +/*? + * Text: "Link degraded: RS=%02x RSID=%04x IC=%02x IUPARAMS=%s IUNODEID=%s AUPARAMS=%s AUNODEID=%s\n" + * Severity: Warning + * Parameter: + * @1: reporting source + * @2: reporting source ID + * @3: incident code + * @4: incident unit parameters + * @5: incident unit node ID + * @6: attached unit parameters + * @7: attached unit node ID + * Description: + * A hardware error has occurred. A unit at one end of an interface + * link has detected a failure in the link or in one of the units attached to + * the link. As a result, data transfer across the link is degraded. In the + * message text, the node IDs of involved units are represented in the + * following format: TTTTTT/MDL,MMM.PPSSSSSSSSSSSS,XXXX where TTTTTT refers to + * the machine type, MDL the model number, MMM the manufacturer, PP the + * manufacturing plant, SSSSSSSSSSSS the unit sequence number and XXXX the + * machine type-dependent physical interface number. If no data is available + * for the unit parameters or node ID field, "n/a" is used instead. + * + * User action: + * Report the problem to your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/cpcmd +++ linux-azure-4.11.0/Documentation/kmsg/s390/cpcmd @@ -0,0 +1,16 @@ +/*? + * Text: "The cpcmd kernel function failed to allocate a response buffer\n" + * Severity: Warning + * Description: + * IPL code, console detection, and device drivers like vmcp or vmlogrdr use + * the cpcmd kernel function to send commands to the z/VM control program (CP). + * If a program that uses the cpcmd function does not allocate a contiguous + * response buffer below 2 GB guest real storage, cpcmd creates a bounce buffer + * to be used as the response buffer. Because of low memory or memory + * fragmentation, cpcmd could not create the bounce buffer. + * User action: + * Look for related page allocation failure messages and at the stack trace to + * find out which program or operation failed. Free some memory and retry the + * failed operation. Consider allocating more memory to your z/VM guest virtual + * machine. + */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/cpu +++ linux-azure-4.11.0/Documentation/kmsg/s390/cpu @@ -0,0 +1,46 @@ +/*? + * Text: "%d configured CPUs, %d standby CPUs\n" + * Severity: Informational + * Parameter: + * @1: number of configured CPUs + * @2: number of standby CPUs + * Description: + * The kernel detected the given number of configured and standby CPUs. + * User action: + * None. + */ + +/*? + * Text: "The CPU configuration topology of the machine is:" + * Severity: Informational + * Description: + * The first six values of the topology information represent fields Mag6 to + * Mag1 of system-information block (SYSIB) 15.1.2. These fields specify the + * maximum numbers of topology-list entries (TLE) at successive topology nesting + * levels. The last value represents the MNest value of SYSIB 15.1.2 which + * specifies the maximum possible nesting that can be configured through + * dynamic changes. For details see the SYSIB 15.1.2 information in the + * "Principles of Operation." + * User action: + * None. + */ + +/*? + * Text: "CPU %i exceeds the maximum %i and is excluded from the dump\n" + * Severity: Warning + * Parameter: + * @1: CPU number + * @2: maximum CPU number + * Description: + * The Linux kernel is used as a system dumper but it runs on more CPUs than + * it has been compiled for with the CONFIG_NR_CPUS kernel configuration + * option. The system dump will be created but information on one or more + * CPUs will be missing. + * User action: + * Update the system dump kernel to a newer version that supports more + * CPUs or reduce the number of installed CPUs and reproduce the problem + * that should be analyzed. If you send the system dump that prompted this + * message to a support organization, be sure to communicate that the dump + * does not include all CPU information. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/cpum_cf +++ linux-azure-4.11.0/Documentation/kmsg/s390/cpum_cf @@ -0,0 +1,68 @@ +/*? + * Text: "Enabling the performance measuring unit failed with rc=%x\n" + * Severity: Error + * Parameter: + * @1: error condition + * Description: + * The device driver failed to enable CPU counter sets with the + * load counter controls (lcctl) instruction. + * See the section about lcctl in "The Load-Program-Parameter and the CPU-Measurement + * Facilities", SA23-2260, for an explanation of the error conditions. + * User action: + * Stop the performance measurement programs and try again. + */ + +/*? + * Text: "Disabling the performance measuring unit failed with rc=%x\n" + * Severity: Error + * Parameter: + * @1: error condition + * Description: + * The device driver failed to disable CPU counter sets with the + * load counter controls (lcctl) instruction. + * See the section about lcctl in "The Load-Program-Parameter and the CPU-Measurement + * Facilities", SA23-2260, for an explanation of the error conditions. + * User action: + * Stop the performance measurement programs and try again. + */ + +/*? + * Text: "Registering the cpum_cf PMU failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error code + * Description: + * The device driver could not register the Performance Measurement Unit (PMU) + * for the CPU-measurement counter facility. + * A possible cause of this problem is memory constraints. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your Linux instance. + */ + +/*? + * Text: "CPU[%i] Counter data was lost\n" + * Severity: Error + * Parameter: + * @1: cpu number + * Description: + * CPU counter data was lost because of machine internal + * high-priority activities. + * User action: + * None. + */ + +/*? + * Text: "Registering for CPU-measurement alerts failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error code + * Description: + * The device driver could not register to receive CPU-measurement alerts. + * Alerts make you aware of measurement errors. + * A possible cause of this problem is memory constraints. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your Linux instance. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/cpum_sf +++ linux-azure-4.11.0/Documentation/kmsg/s390/cpum_sf @@ -0,0 +1,104 @@ +/*? + * Text: "The sampling buffer limits have changed to: min=%lu max=%lu (diag=x%lu)\n" + * Severity: Informational + * Parameter: + * @1: minimum size in sample-data-blocks + * @2: maximum size in sample-data-blocks + * @3: size factor for buffering diagnostic-sampling data entries + * Description: + * The minimum or maximum size limit for the sampling facility buffer was + * changed. The change is effective immediately. + * User action: + * None. + */ + +/*? + * Text: "Switching off the sampling facility failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error condition + * Description: + * The CPU-measurement sampling facility could not be switched off and continues + * to run. For details, see LOAD SAMPLING CONTROLS in + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * If this problem persists, reboot your Linux instance. + */ + +/*? + * Text: "Sample data was lost\n" + * Severity: Error + * Description: + * Sample data was lost because of machine-internal high-priority activities. + * The sampling facility is stopped. + * User action: + * End all performance measurement sessions. Discard the measurement data, + * which are likely to be flawed. Repeat your measurements. + * If the problem persists, contact your hardware administrator. + */ + +/*? + * Text: "Sampling facility support for perf is not available: reason=%04x\n" + * Severity: Error + * Parameter: + * @1: reason code + * Description: + * The device driver could not initialize the sampling facility support. + * Possible reason codes are: + * 0001: The device driver failed to query CPU-measurement sampling facility + * information. + * + * 0002: The device driver does not support the basic-sampling function that + * is available on the LPAR within which the Linux instance runs. + * + * 0003: The device driver could not register to receive CPU-measurement alerts. + * A possible cause of this problem is memory constraints. + * + * 0004: The device driver could not register the Performance Measurement Unit + * (PMU) for the CPU-measurement sampling facility. + * A possible cause of this problem is memory constraints. + * User action: + * Consider assigning more memory to your Linux instance. + */ + +/*? + * Text: "Loading sampling controls failed: op=%i err=%i\n" + * Severity: Error + * Parameter: + * @1: Type of operation + * @2: Error condition + * Description: + * The sampling facility support could not load sampling controls to enable + * (operation type 1) or disable (operation type 2) the CPU-measurement sampling + * facility. For details of the error condition, see LOAD SAMPLING CONTROLS in + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * If the problem persists, reboot your Linux instance. + */ + +/*? + * Text: "A sampling buffer entry is incorrect (alert=0x%x)\n" + * Severity: Error + * Parameter: + * @1: Alert code + * Description: + * An incorrect sampling facility buffer entry was detected. The alert code + * indicates the root cause, for example, an incorrect entry address or an + * incorrect sample-data-block-table entry. + * User action: + * End active performance measurement sessions, for example, perf processes. If + * the problem persists, reboot your Linux instance. + */ + +/*? + * Text: "Registering for s390dbf failed\n" + * Severity: Error + * Description: + * The device driver failed to register for the s390 debug feature. You will + * not receive any debug information. A possible cause of this problem is + * memory constraints. + * User action: + * Consider assigning more memory + * to your Linux instance. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/crc32-vx +++ linux-azure-4.11.0/Documentation/kmsg/s390/crc32-vx @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/ctcm +++ linux-azure-4.11.0/Documentation/kmsg/s390/ctcm @@ -0,0 +1,202 @@ +/*? + * Text: "%s: An I/O-error occurred on the CTCM device\n" + * Severity: Error + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * An I/O error was detected on one of the subchannels of the CTCM device. + * Depending on the error, the CTCM device driver might attempt an automatic + * recovery. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. If the + * device is not operational, perform a manual recovery. See "Device Drivers, + * Features, and Commands" for details about how to recover a CTCM device. + */ + +/*? + * Text: "%s: An adapter hardware operation timed out\n" + * Severity: Error + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * The CTCM device uses an adapter to physically connect to its communication + * peer. An operation on this adapter timed out. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. If the + * device is not operational, perform a manual recovery. See "Device Drivers, + * Features, and Commands" for details about how to recover a CTCM device. + */ + +/*? + * Text: "%s: An error occurred on the adapter hardware\n" + * Severity: Error + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * The CTCM device uses an adapter to physically connect to its communication + * peer. An operation on this adapter returned an error. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. If the + * device is not operational, perform a manual recovery. See "Device Drivers, + * Features, and Commands" for details about how to recover a CTCM device. + */ + +/*? + * Text: "%s: The communication peer has disconnected\n" + * Severity: Notice + * Parameter: + * @1: channel ID + * Description: + * The remote device has disconnected. Possible reasons are that the remote + * interface has been closed or that the operating system instance with the + * communication peer has been rebooted or shut down. + * User action: + * Check the status of the peer device. Ensure that the peer operating system + * instance is running and that the peer interface is operational. + */ + +/*? + * Text: "%s: The remote operating system is not available\n" + * Severity: Notice + * Parameter: + * @1: channel ID + * Description: + * The operating system instance with the communication peer has disconnected. + * Possible reasons are that the operating system instance has been rebooted + * or shut down. + * User action: + * Ensure that the peer operating system instance is running and that the peer + * interface is operational. + */ + +/*? + * Text: "%s: The adapter received a non-specific IRQ\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * The adapter hardware used by the CTCM device received an IRQ that cannot + * be mapped to a particular device. This is a hardware problem. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. Check if + * the connection to the remote device still works. If the CTCM device is not + * operational, set it offline and back online. If this does not resolve the + * problem, perform a manual recovery. See "Device Drivers, Features, and + * Commands" for details about how to recover a CTCM device. If this problem + * persists, gather Linux debug data, collect the hardware logs, and report the + * problem to your support organization. + */ + +/*? + * Text: "%s: A check occurred on the subchannel\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * Description: + * A check condition has been detected on the subchannel. + * User action: + * Check if the connection to the remote device still works. If the CTCM device + * is not operational, set it offline and back online. If this does not resolve + * the problem, perform a manual recovery. See "Device Drivers, Features, and + * Commands" for details about how to recover a CTCM device. If this problem + * persists, gather Linux debug data and report the problem to your support + * organization. + */ + +/*? + * Text: "%s: The communication peer is busy\n" + * Severity: Informational + * Parameter: + * @1: channel ID + * Description: + * A busy target device was reported. This might be a temporary problem. + * User action: + * If this problem persists or is reported frequently ensure that the target + * device is working properly. + */ + +/*? + * Text: "%s: The specified target device is not valid\n" + * Severity: Error + * Parameter: + * @1: channel ID + * Description: + * A target device was called with a faulty device specification. This is an + * adapter hardware problem. + * User action: + * Gather Linux debug data, collect the hardware logs, and contact IBM support. + */ + +/*? + * Text: "An I/O operation resulted in error %04x\n" + * Severity: Error + * Parameter: + * @1: channel ID + * @2: error information + * Description: + * A hardware operation ended with an error. + * User action: + * Check the status of the CTCM device, for example, with ifconfig. If the + * device is not operational, perform a manual recovery. See "Device Drivers, + * Features, and Commands" for details about how to recover a CTCM device. + * If this problem persists, gather Linux debug data, collect the hardware logs, + * and report the problem to your support organization. + */ + +/*? + * Text: "%s: Initialization failed with RX/TX init handshake error %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * @2: error information + * Description: + * A problem occurred during the initialization of the connection. If the + * connection can be established after an automatic recovery, a success message + * is issued. + * User action: + * If the problem is not resolved by the automatic recovery process, check the + * local and remote device. If this problem persists, gather Linux debug data + * and report the problem to your support organization. + */ + +/*? + * Text: "%s: The network backlog for %s is exceeded, package dropped\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * @2: calling function + * Description: + * There is more network traffic than can be handled by the device. The device + * is closed and some data has not been transmitted. The device might be + * recovered automatically. + * User action: + * Investigate and resolve the congestion. If necessary, set the device + * online to make it operational. + */ + +/*? + * Text: "%s: The XID used in the MPC protocol is not valid, rc = %d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the CTCM device + * @2: return code + * Description: + * The exchange identification (XID) used by the CTCM device driver when + * in MPC mode is not valid. + * User action: + * Note the error information provided with this message and contact your + * support organization. + */ + +/*? Text: "CTCM driver unloaded\n" */ +/*? Text: "%s: %s Internal error: net_device is NULL, ch = 0x%p\n" */ +/*? Text: "%s / Initializing the ctcm device driver failed, ret = %d\n" */ +/*? Text: "%s: %s: Internal error: Can't determine channel for interrupt device %s\n" */ +/*? Text: "CTCM driver initialized\n" */ +/*? Text: "%s: setup OK : r/w = %s/%s, protocol : %d\n" */ +/*? Text: "%s: Connected with remote side\n" */ +/*? Text: "%s: Restarting device\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/dasd +++ linux-azure-4.11.0/Documentation/kmsg/s390/dasd @@ -0,0 +1,704 @@ +/* dasd_ioctl */ + +/*? + * Text: "%s: The DASD has been put in the quiesce state\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * No I/O operation is possible on this device. + * User action: + * Resume the DASD to enable I/O operations. + */ + +/*? + * Text: "%s: I/O operations have been resumed on the DASD\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD is no longer in state quiesce and I/O operations can be performed + * on the device. + * User action: + * None. + */ + +/*? + * Text: "%s: The DASD cannot be formatted while it is enabled\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD you try to format is enabled. Enabled devices cannot be formatted. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: The specified DASD is a partition and cannot be formatted\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD you try to format is a partition. Partitions cannot be formatted + * separately. You can only format a complete DASD including all its partitions. + * User action: + * Format the complete DASD. + * ATTENTION: Formatting irreversibly destroys all data on all partitions + * of the DASD. + */ + +/*? + * Text: "%s: The specified DASD is a partition and cannot be checked\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD you try to check is a partition. Partitions cannot be checked + * separately. You can only check a complete DASD including all its partitions. + * User action: + * Check the complete DASD. + */ + +/*? + * Text: "%s: Formatting unit %d failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: start track + * @3: return code + * Description: + * The formatting process might have been interrupted by a signal, for example, + * CTRL+C. If the process was not interrupted intentionally, an I/O error + * might have occurred. + * User action: + * Retry to format the device. If the error persists, check the log file for + * related error messages. If you cannot resolve the error, note the return + * code and contact your support organization. + */ + + +/* dasd */ + +/*? + * Text: "%s: Cancelling request %p failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * @3: return code of previous function + * Description: + * In response to a user action, the DASD device driver tried but failed to + * cancel a previously started I/O operation. + * User action: + * Try the action again. + */ + +/*? + * Text: "%s: Flushing the DASD request queue failed for request %p\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * Description: + * As part of the unloading process, the DASD device driver flushes the + * request queue. This failed because a previously started I/O operation + * could not be canceled. + * User action: + * Try again to unload the DASD device driver or to shut down Linux. + */ + +/*? + * Text: "The DASD device driver could not be initialized\n" + * Severity: Informational + * Description: + * The initialization of the DASD device driver failed because of previous + * errors. + * User action: + * Check for related previous error messages. + */ + +/*? + * Text: "%s: Accessing the DASD failed because it is in probeonly mode\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * The dasd= module or kernel parameter specified the probeonly attribute for + * the DASD you are trying to access. The DASD device driver cannot access + * DASDs that are in probeonly mode. + * User action: + * Change the dasd= parameter as to omit probeonly for the DASD and reload + * the DASD device driver. If the DASD device driver has been compiled into + * the kernel, reboot Linux. + */ + +/*? + * Text: "%s: cqr %p timed out (%lus), %i retries remaining\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: request + * @3: timeout value + * @4: number of retries left + * Description: + * A try of the error recovery procedure (ERP) for the channel queued request + * (cqr) timed out and failed to recover the error. ERP continues for the DASD. + * User action: + * Ignore this message if it occurs infrequently and if the recovery succeeds + * during one of the retries. If this error persists, check for related + * previous error messages and report the problem to your support organization. + * + * The timeout can be changed by writing a new value to the sysfs 'expires' attribute of the DASD. The value specifies the timeout in seconds. + */ + +/*? + * Text: "%s: cqr %p timed out (%lus) but cannot be ended, retrying in 5 s\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: request + * @3: timeout value + * Description: + * A try of the error recovery procedure (ERP) for the channel queued request + * (cqr) timed out and failed to recover the error. The I/O request submitted + * during the try could not be canceled. The ERP waits for 5 seconds before + * trying again. + * User action: + * Ignore this message if it occurs infrequently and if the recovery succeeds + * during one of the retries. If this error persists, check for related + * previous error messages and report the problem to your support organization. + * + * The timeout can be changed by writing a new value to the sysfs 'expires' attribute of the DASD. The value specifies the timeout in seconds. + */ + +/*? + * Text: "%s: The DASD cannot be set offline while it is in use\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD cannot be set offline because it is in use by an internal process. + * An action to free the DASD might not have completed yet. + * User action: + * Wait some time and set the DASD offline later. + */ + +/*? + * Text: "%s: The DASD cannot be set offline with open count %i\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: count + * Description: + * The DASD is being used by one or more processes and cannot be set offline. + * User action: + * Ensure that the DASD is not in use anymore, for example, unmount all + * partitions. Then try again to set the DASD offline. + */ + +/*? + * Text: "%s: Setting the DASD online failed with rc=%d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * The DASD could not be set online because of previous errors. + * User action: + * Look for previous error messages. If you cannot resolve the error, note + * the return code and contact your support organization. + */ + +/*? + * Text: "%s Setting the DASD online with discipline %s failed with rc=%i\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: discipline + * @3: return code + * Description: + * The DASD could not be set online because of previous errors. + * User action: + * Look for previous error messages. If you cannot resolve the error, note the + * return code and contact your support organization. + */ + +/*? + * Text: "%s Setting the DASD online failed because of missing DIAG discipline\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD was to be set online with discipline DIAG but this discipline of + * the DASD device driver is not available. + * User action: + * Ensure that the dasd_diag_mod module is loaded. If your Linux system does + * not include this module, you cannot set DASDs online with the DIAG + * discipline. + */ + +/*? + * Text: "%s Setting the DASD online failed because of a missing discipline\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD was to be set online with a DASD device driver discipline that + * is not available. + * User action: + * Ensure that all DASD modules are loaded correctly. + */ + +--------------------------- + +/*? + * Text: "The statistics feature has been switched off\n" + * Severity: Informational + * Description: + * The statistics feature of the DASD device driver has been switched off. + * User action: + * None. + */ + +/*? + * Text: "The statistics feature has been switched on\n" + * Severity: Informational + * Description: + * The statistics feature of the DASD device driver has been switched on. + * User action: + * None. + */ + +/*? + * Text: "The statistics have been reset\n" + * Severity: Informational + * Description: + * The DASD statistics data have been reset. + * User action: + * None. + */ + +/*? + * Text: "%s is not a supported value for /proc/dasd/statistics\n" + * Severity: Warning + * Parameter: + * @1: value + * Description: + * An incorrect value has been written to /proc/dasd/statistics. + * The supported values are: 'set on', 'set off', and 'reset'. + * User action: + * Write a supported value to /proc/dasd/statistics. + */ + +/*? + * Text: "%s is not a valid device range\n" + * Severity: Error + * Parameter: + * @1: range + * Description: + * A device range specified with the dasd= parameter is not valid. + * User action: + * Examine the dasd= parameter and correct the device range. + */ + +/*? + * Text: "The probeonly mode has been activated\n" + * Severity: Informational + * Description: + * The probeonly mode of the DASD device driver has been activated. In this + * mode the device driver rejects any 'open' syscalls with EPERM. + * User action: + * None. + */ + +/*? + * Text: "The IPL device is not a CCW device\n" + * Severity: Error + * Description: + * The value for the dasd= parameter contains the 'ipldev' keyword. During + * the boot process this keyword is replaced with the device from which the + * IPL was performed. The 'ipldev' keyword is not valid if the IPL device is + * not a CCW device. + * User action: + * Do not specify the 'ipldev' keyword when performing an IPL from a device + * other than a CCW device. + */ + +/*? + * Text: "A closing parenthesis ')' is missing in the dasd= parameter\n" + * Severity: Warning + * Description: + * The specification for the dasd= kernel or module parameter has an opening + * parenthesis '(' * without a matching closing parenthesis ')'. + * User action: + * Correct the parameter value. + */ + +/*? + * Text: "The autodetection mode has been activated\n" + * Severity: Informational + * Description: + * The autodetection mode of the DASD device driver has been activated. In + * this mode the DASD device driver sets all detected DASDs online. + * User action: + * None. + */ + +/*? + * Text: "%*s is not a supported device option\n" + * Severity: Warning + * Parameter: + * @1: length of option code + * @2: option code + * Description: + * The dasd= parameter includes an unknown option for a DASD or a device range. + * Options are specified in parenthesis and immediately follow a device or + * device range. + * User action: + * Check the dasd= syntax and remove any unsupported options from the dasd= + * parameter specification. + */ + +/*? + * Text: "PAV support has be deactivated\n" + * Severity: Informational + * Description: + * The 'nopav' keyword has been specified with the dasd= kernel or module + * parameter. The Parallel Access Volume (PAV) support of the DASD device + * driver has been deactivated. + * User action: + * None. + */ + +/*? + * Text: "'nopav' is not supported on z/VM\n" + * Severity: Informational + * Description: + * For Linux instances that run as guest operating systems of the z/VM + * hypervisor Parallel Access Volume (PAV) support is controlled by z/VM not + * by Linux. + * User action: + * Remove 'nopav' from the dasd= module or kernel parameter specification. + */ + +/*? + * Text: "High Performance FICON support has been deactivated\n" + * Severity: Informational + * Description: + * The 'nofcx' keyword has been specified with the dasd= kernel or module + * parameter. The High Performance FICON (transport mode) support of the DASD + * device driver has been deactivated. + * User action: + * None. + */ + +/*? + * Text: "The dasd= parameter value %s has an invalid ending\n" + * Severity: Warning + * Parameter: + * @1: parameter value + * Description: + * The specified value for the dasd= kernel or module parameter is not correct. + * User action: + * Check the module or the kernel parameter. + */ + +/*? + * Text: "Registering the device driver with major number %d failed\n" + * Severity: Warning + * Parameter: + * @1: DASD major + * Description: + * Major number 94 is reserved for the DASD device driver. The DASD device + * driver failed to register with this major number. Another device driver + * might have used major number 94. + * User action: + * Determine which device driver uses major number 94 instead of the DASD + * device driver and unload this device driver. Then try again to load the + * DASD device driver. + */ + +/*? + * Text: "%s: default ERP has run out of retries and failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The error recovery procedure (ERP) tried to recover an error but the number + * of retries for the I/O was exceeded before the error could be resolved. + * User action: + * Check for related previous error messages. + */ + +/*? + * Text: "%s: Unable to terminate request %p on suspend\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * Description: + * As part of the suspend process, the DASD device driver terminates requests + * on the request queue. This failed because a previously started I/O operation + * could not be canceled. The suspend process will be stopped. + * User action: + * Try again to suspend the system. + */ + +/*? + * Text: "%s: ERP failed for the DASD\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error recovery procedure (ERP) was performed for the DASD but failed. + * User action: + * Check the message log for previous related error messages. + */ + +/*? + * Text: "%s: An error occurred in the DASD device driver, reason=%s\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This problem indicates a program error in the DASD device driver. + * User action: + * Note the reason code and contact your support organization. +*/ + +/*? + * Text: "%s: No operational channel path is left for the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * All channel paths to the device have become non-operational. The DASD + * device driver suspends I/O operations and queues I/O requests for this + * device until at least one channel path becomes operational again. + * User action: + * Ensure that each channel path to the device has been set up correctly + * and that the related physical cable connections are in place. + */ + +/*? + * Text: "%s: No verified channel paths remain for the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * All verified channel paths to the device have become non-operational. + * Any other paths to the device have previously been identified as not usable. + * The DASD device driver suspends I/O operations and queues I/O requests + * for this device until at least one channel path becomes operational + * again. + * User action: + * Ensure that each channel path to the device has been set up correctly + * and that the related physical cable connections are in place. + * Set all paths to the device offline and online again to repeat the path + * verification. Alternatively, set the device offline and online again to + * verify all available paths for this device. + * If this problem persists, gather Linux debug data and report the problem + * to your support organization. + */ + +/*? + * Text: "%s: A channel path to the device has become operational\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * At least one channel path of this device has become operational again. + * The DASD device driver resumes I/O operations to the device and processes + * the I/O requests that were queued while there was no operational channel path. + * User action: + * None. + */ + +------------------------------------------------------------------------------------ +/* dasd_diag */ + +/*? + * Text: "%s: A 64-bit DIAG call failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * 64-bit DIAG calls require a 64-bit z/VM version. + * User action: + * Use z/VM 5.2 or later or set the sysfs 'use_diag' attribute of the DASD to 0 + * to switch off DIAG. + */ + +/*? + * Text: "%s: Accessing the DASD failed because of an incorrect format (rc=%d)\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * The format of the DASD is not correct. + * User action: + * Check the device format. For details about the return code see the + * section about the INITIALIZE function for DIAGNOSE Code X'250' + * in "z/VM CP Programming Services". If you cannot resolve the error, note + * the return code and contact your support organization. + */ + +/*? + * Text: "%s: New DASD with %ld byte/block, total size %ld KB%s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: bytes per block + * @3: size + * @4: access mode + * Description: + * A DASD with the indicated block size and total size has been set online. + * If the DASD is configured as read-only to the real or virtual hardware, + * the message includes an indication of this hardware access mode. The + * hardware access mode is independent from the 'readonly' attribute of + * the device in sysfs. + * User action: + * None. + */ + +/*? + * Text: "%s: DIAG ERP failed with rc=%d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * An error in the DIAG processing could not be recovered by the error + * recovery procedure (ERP) of the DIAG discipline. + * User action: + * Note the return code, check for related I/O errors, and report this problem + * to your support organization. + */ + +/*? + * Text: "%s: DIAG initialization failed with rc=%d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * Initializing the DASD with the DIAG discipline failed. Possible reasons for + * this problem are that the device has a device type other than FBA or ECKD, + * or has a block size other than one of the supported sizes: + * 512 byte, 1024 byte, 2048 byte, or 4096 byte. + * User action: + * Ensure that the device can be written to and has a supported device type + * and block size. For details about the return code see the section about + * the INITIALIZE function for DIAGNOSE Code X'250' in "z/VM CP Programming + * Services". If you cannot resolve the error, note the error code and contact + * your support organization. + */ + +/*? + * Text: "%s: Device type %d is not supported in DIAG mode\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: device type + * Description: + * Only DASD of type FBA and ECKD are supported in DIAG mode. + * User action: + * Set the sysfs 'use_diag' attribute of the DASD to 0 and try again to access + * the DASD. + */ + +/*? + * Text: "Discipline %s cannot be used without z/VM\n" + * Severity: Informational + * Parameter: + * @1: discipline name + * Description: + * The discipline that is specified with the dasd= kernel or module parameter + * is only available for Linux instances that run as guest operating + * systems of the z/VM hypervisor. + * User action: + * Remove the unsupported discipline from the parameter string. + */ + +/*? + * Text: "%s: The access mode of a DIAG device changed to read-only\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A device changed its access mode from writeable to + * read-only while in use. + * User action: + * Set the device offline, ensure that the device is configured correctly in + * z/VM, then set the device online again. + */ + +------------------------------------------------------------------------------------ +/* dasd_erp */ + +/*? + * Text: "%s: A timeout error occurred for cqr %p\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * Description: + * A channel queued request (cqr) failed because it timed out. + * One possible reason for this error is that a request did not + * complete within the timeout interval specified for the DASD. + * The timeout interval is set as the value of the 'timeout' sysfs + * attribute of a DASD. A value of 0 disables the timeout function. + * The timeout function can be used; for example, by mirroring setups; + * to quickly process a request queue for a DASD that has become unavailable. + * User action: + * Check the message log for previous related error messages. Verify + * that the storage server and the connection from host to storage + * server are operational. If the 'timeout' sysfs attribute of the + * DASD has been set to a value other than 0, verify that this + * setting is intentional and change it if required. + */ + +/*? + * Text: "%s: A transport error occurred for cqr %p\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to request + * Description: + * A channel queued request (cqr) failed because the connection to the + * device was lost and the 'failfast' flag is set for the request. + * This flag can result from, for example: + * + * - A software layer above the DASD device driver; + * for example, in a host based mirroring setup. + * + * - Value 1 for the 'failfast' sysfs attribute of the DASD. + * This setting applies to all requests on the DASD. + * + * User action: + * Ensure that each channel path to the device has been set up + * correctly and that the related physical cable connections are in + * place. If the 'failfast' attribute of the DASD is set to 1, + * verify that this setting is intentional and change it to 0 if required. + */ + +/*? + * Text: "%s Setting the DASD online failed because the required module %s could not be loaded (rc=%d)\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: kernel module name + * @3: return code + * Description: + * The DASD was to be set online with discipline DIAG but this discipline of + * the DASD device driver is not available and an attempt to load the + * corresponding kernel module failed with the specified return code. + * + * User action: + * Ensure that the kernel module with the specified name is correctly installed + * or set the sysfs 'use_diag' attribute of the DASD to 0 to switch off DIAG. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/dasd-eckd +++ linux-azure-4.11.0/Documentation/kmsg/s390/dasd-eckd @@ -0,0 +1,2154 @@ +/* dasd_eckd */ + +/*? + * Text: "%s: ERP failed for the DASD\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error recovery procedure (ERP) was performed for the DASD but failed. + * User action: + * Check the message log for previous related error messages. + */ + +/*? + * Text: "%s: An error occurred in the DASD device driver, reason=%s\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This problem indicates a program error in the DASD device driver. + * User action: + * Note the reason code and contact your support organization. +*/ + +/*? + * Text: "%s: Allocating memory for private DASD data failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD device driver maintains data structures for each DASD it manages. + * There is not enough memory to allocate these data structures for one or + * more DASD. + * User action: + * Free some memory and try the operation again. + */ + +/*? + * Text: "%s: DASD with %d KB/block, %d KB total size, %d KB/track, %s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: block size + * @3: DASD size + * @4: track size + * @5: disc layout + * Description: + * A DASD with the shown characteristics has been set online. + * User action: + * None. + */ + +/*? + * Text: "%s: Start track number %u used in formatting is too big\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: track number + * Description: + * The DASD format I/O control was used incorrectly by a formatting tool. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: Stop track number %u used in formatting is too big\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: track number + * Description: + * The DASD format I/O control was used incorrectly by a formatting tool. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: The DASD is not formatted\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A DASD has been set online but it has not been formatted yet. You must + * format the DASD before you can use it. + * User action: + * Format the DASD, for example, with dasdfmt. + */ + +/*? + * Text: "%s: 0x%x is not a known command\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: command + * Description: + * This problem is likely to be caused by a programming error. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "%s: Track 0 has no records following the VTOC\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Linux has identified a volume table of contents (VTOC) on the DASD but + * cannot read any data records following the VTOC. A possible cause of this + * problem is that the DASD has been used with another System z operating + * system. + * User action: + * Format the DASD for usage with Linux, for example, with dasdfmt. + * ATTENTION: Formatting irreversibly destroys all data on the DASD. + */ + +/*? + * Text: "%s: An I/O control call used incorrect flags 0x%x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: flags + * Description: + * The DASD format I/O control was used incorrectly. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: New DASD %04X/%02X (CU %04X/%02X) with %d cylinders, %d heads, %d sectors%s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: device type + * @3: device model + * @4: control unit type + * @5: control unit model + * @6: number of cylinders + * @7: tracks per cylinder + * @8: sectors per track + * @9: access mode + * Description: + * A DASD with the shown characteristics has been set online. + * If the DASD is configured as read-only to the real or virtual hardware, + * the message includes an indication of this hardware access mode. The + * hardware access mode is independent from the 'readonly' attribute of + * the device in sysfs. + * User action: + * None. + */ + +/*? + * Text: "%s: The disk layout of the DASD is not supported\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD device driver only supports the following disk layouts: CDL, LDL, + * FBA, CMS, and CMS RESERVED. + * User action: + * None. + */ + +/*? + * Text: "%s: Start track %u used in formatting exceeds end track\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: track number + * Description: + * The DASD format I/O control was used incorrectly by a formatting tool. + * User action: + * Contact the owner of the formatting tool. + */ + +/*? + * Text: "%s: The DASD cache mode was set to %x (%i cylinder prestage)\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: operation mode + * @3: number of cylinders + * Description: + * The DASD cache mode has been changed. See the storage system documentation + * for information about the different cache operation modes. + * User action: + * None. + */ + +/*? + * Text: "%s: The DASD cannot be formatted with block size %u\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: block size + * Description: + * The block size specified for a format instruction is not valid. The block + * size must be between 512 and 4096 byte and must be a power of 2. + * User action: + * Call the format command with a supported block size. + */ + +/*? + * Text: "%s: The UID of the DASD has changed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The Unique Identifier (UID) of a DASD that is currently in use has changed. + * This indicates that the physical disk has been replaced. + * User action: + * None if the replacement was intentional. + * If the disk change is not expected, stop using the disk to prevent possible + * data loss. +*/ + + +/* dasd_3990_erp */ + +/*? + * Text: "%s: is offline or not installed - INTERVENTION REQUIRED!!\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD to be accessed is not in an accessible state. The I/O operation + * will wait until the device is operational again. This is an operating system + * independent message that is issued by the storage system. + * User action: + * Make the DASD accessible again. For details see the storage system + * documentation. + */ + +/*? + * Text: "%s: The DASD cannot be reached on any path (lpum=%x/opm=%x)\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: last path used mask + * @3: online path mask + * Description: + * After a path to the DASD failed, the error recovery procedure of the DASD + * device driver tried but failed to reconnect the DASD through an alternative + * path. + * User action: + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. Check the file systems on the DASD when it is + * accessible again. + */ + +/*? + * Text: "%s: Unable to allocate DCTL-CQR\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an internal error. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Parameter\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A data argument of a command is not valid. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - DPS Installation Check\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This operating system independent message is issued by the storage system + * for one of the following reasons: + * - A 3380 Model D or E DASD does not have the Dynamic Path Selection (DPS) + * feature in the DASD A-unit. + * - The device type of an attached DASD is not supported by the firmware. + * - A type 3390 DASD is attached to a 3 MB channel. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Drive motor switch is off\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - CCW Count less than required\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The CCW count of a command is less than required. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Channel requested ... %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This is an operating system independent message that is issued by the + * storage system. The possible reason codes indicate the following problems: + * 00 No Message. + * 01 The channel has requested unit check sense data. + * 02 The channel has requested retry and retry is exhausted. + * 03 A SA Check-2 error has occurred. This sense is presented with + * Equipment Check. + * 04 The channel has requested retry and retry is not possible. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Status Not As Required: reason %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: reason code + * Description: + * This is an operating system independent message that is issued by the + * storage system. There are several potential reasons for this message; + * byte 8 contains the reason code. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device status 1 not valid\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Storage Path Restart\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * An operation for an active channel program was queued in a Storage Control + * when a warm start was received by the path. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Reset Notification\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A system reset or its equivalent was received on an interface. The Unit + * Check that generates this sense is posted to the next channel initiated + * selection following the resetting event. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Command Sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * An incorrect sequence of commands has occurred. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Missing device address bit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Subsystem Processing Error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A firmware logic error has been detected. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Seek incomplete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Command\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command was issued that is not in the 2107/1750 command set. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Command Invalid on Secondary Address\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command or order not allowed on a PPRC secondary device has been received + * by the secondary device. This is an operating system independent message + * that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Invalid Defective/Alternate Track Pointer\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A defective track has been accessed. The subsystem generates an invalid + * Defective/Alternate Track Pointer as a part of RAID Recovery. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Channel Returned with Incorrect retry CCW\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command portion of the CCW returned after a command retry sequence does + * not match the command for which retry was signaled. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Diagnostic of Special Command Violates File Mask\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A command is not allowed under the Access Authorization specified by the + * File Mask. This is an operating system independent message that is issued + * by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Head address does not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device did not respond to selection\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device check-2 error or Set Sector is not complete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Device Error Source\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The device has completed soft error logging. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Data Pinned for Device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Modified data in cache or in persistent storage exists for the DASD. The + * data cannot be destaged to the device. This track is the first track pinned + * for this device. This is an operating system independent message that is + * issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel C\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device Status 1 not as expected\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 0 - Device Fenced - device = %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: sense data byte 4 + * Description: + * The device shown in sense byte 4 has been fenced. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Interruption cannot be reset\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Index missing\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - DASD Fast Write inhibited\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * DASD Fast Write is not allowed because of a nonvolatile storage battery + * check condition. This is an operating system independent message that is + * issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid tag-in for an extended command sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Key area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Count area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Track physical address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - 3990 check-2 error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Offset active cannot be reset\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - RCC 1 and RCC 2 sequences not successful\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in count address area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Data area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel A\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in count address area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the key area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Caching status reset to default\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage director has assigned two new subsystem status devices and + * resets the status to its default value. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the data area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Device not ready\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in key area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - DASD controller failed to set or reset the long busy latch\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 1 - Cylinder address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 3 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in data area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - Support facility errors\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Key area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - End operation with transfer count not zero\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 2 - Microcode detected error %02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: error code + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the count area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 3 - Allegiance terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Allegiance terminated because of a Reset Allegiance or an Unconditional + * Reserve command on another channel. This is an operating system independent + * message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Home address area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Count area error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid tag-in during selection sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in data area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in home address area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Home address area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - Data area error; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in home address area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the home address area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the home address area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the count area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 4 - No sync byte in key area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid DCC selection response or timeout\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the data area\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Operation Terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage system ends an operation related to an active channel program + * when termination and redrive are required and logging is not desired. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel B\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 5 - Data Check in the key area; offset active\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Volume is suspended duplex\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The duplex pair volume has entered the suspended duplex state because of a + * failure. This is an operating system independent message that is issued by + * the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel D\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - RCC 1 sequence not successful\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel E\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - 3990 microcode time out when stopping selection\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel F\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - RCC initiated by a connection check alert\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel G\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - extra RCC required\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 6 - Overrun on channel H\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - Unexpected end operation response code\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Permanent path error (DASD controller not available)\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Missing end operation; device transfer incomplete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Cache or nonvolatile storage equipment failure\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * An equipment failure has occurred in the cache storage or nonvolatile + * storage of the storage system. This is an operating system independent + * message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - DPS cannot be filled\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - Error correction code hardware fault\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Missing end operation; device transfer complete\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - DASD controller not available on disconnected command chain\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - No interruption from device during a command chain\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - No response to selection after a poll interruption\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Track physical address did not compare while oriented\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Head address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Invalid tag-in for an immediate command sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Cylinder address did not compare\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - DPS checks after a system reset or selective reset\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Caching reinitiated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Caching has been automatically reinitiated following an error. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - End operation with transfer count zero\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 7 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 9 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - Short busy time-out during device selection\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Caching terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage system was unable to initiate caching or had to suspend caching + * for a 3990 control unit. If this problem is caused by a failure condition, + * an additional message will provide more information about the failure. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * Check for additional messages that point out possible failures. For more + * information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Subsystem status cannot be determined\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The status of a DASD Fast Write or PPRC volume cannot be determined. + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Nonvolatile storage terminated\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The storage director has stopped using nonvolatile storage or cannot + * initiate nonvolatile storage. If this problem is caused by a failure, an + * additional message will provide more information about the failure. This is + * an operating system independent message that is issued by the storage system. + * User action: + * Check for additional messages that point out possible failures. For more + * information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT 8 - Reserved\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: Write inhibited path encountered\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an informational message. + * User action: + * None. + */ + +/*? + * Text: "%s: FORMAT 9 - Device check-2 error\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * This is an operating system independent message that is issued by the + * storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Track format incorrect\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A track format error occurred while data was being written to the DASD or + * while a duplex pair was being established. This is an operating system + * independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: FORMAT F - Cache fast write access not authorized\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * A request for Cache Fast Write Data access cannot be satisfied because + * of missing access authorization for the storage system. This is an operating + * system independent message that is issued by the storage system. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: Data recovered during retry with PCI fetch mode active\n" + * Severity: Emerg + * Parameter: + * @1: bus ID of the DASD + * Description: + * A data error has been recovered on the storages system but the Linux file + * system cannot be informed about the data mismatch. To prevent Linux from + * running with incorrect data, the DASD device driver will trigger a kernel + * panic. + * User action: + * Reset your real or virtual hardware and reboot Linux. + */ + +/*? + * Text: "%s: The specified record was not found\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The record to be accessed does not exist. The DASD might be unformatted + * or defect. + * User action: + * Try to format the DASD or replace it. + * ATTENTION: Formatting irreversibly destroys all data on the DASD. + */ + +/*? + * Text: "%s: ERP %p (%02x) refers to %p\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: pointer to ERP + * @3: ERP status + * @4: cqr + * Description: + * This message provides debug information for the enhanced error recovery + * procedure (ERP). + * User action: + * If you do not need this information, you can suppress this message by + * switching off ERP logging, for example, by writing '1' to the 'erplog' + * sysfs attribute of the DASD. + */ + +/*? + * Text: "%s: ERP chain at END of ERP-ACTION\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This message provides debug information for the enhanced error recovery + * procedure (ERP). + * User action: + * If you do not need this information, you can suppress this message by + * switching off ERP logging, for example, by writing '1' to the 'erplog' + * sysfs attribute of the DASD. + */ + +/*? + * Text: "%s: The cylinder data for accessing the DASD is inconsistent\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error occurred in the storage system hardware. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: Accessing the DASD failed because of a hardware error\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An error occurred in the storage system hardware. + * User action: + * For more information see the documentation of your storage system. + */ + +/*? + * Text: "%s: ERP chain at BEGINNING of ERP-ACTION\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This message provides debug information for the enhanced error recovery + * procedure (ERP). + * User action: + * If you do not need this information, you can suppress this message by + * switching off ERP logging, for example, by writing '1' to the 'erplog' + * sysfs attribute of the DASD. + */ + +/*? + * Text: "%s: ERP %p has run out of retries and failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: ERP pointer + * Description: + * The error recovery procedure (ERP) tried to recover an error but the number + * of retries for the I/O was exceeded before the error could be resolved. + * User action: + * Check for related previous error messages. + */ + +/*? + * Text: "%s: SIM - SRC: %02x%02x%02x%02x\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: sense byte + * @3: sense byte + * @4: sense byte + * @5: sense byte + * Description: + * This error message is a System Information Message (SIM) generated by the + * storage system. The System Reference Code (SRC) defines the error in detail. + * User action: + * Look up the SRC in the storage server documentation. + */ + +/*? + * Text: "%s: log SIM - SRC: %02x%02x%02x%02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: sense byte + * @3: sense byte + * @4: sense byte + * @5: sense byte + * Description: + * This System Information Message (SIM) is generated by the storage system. + * The System Reference Code (SRC) defines the error in detail. + * User action: + * Look up the SRC in the storage server documentation. + */ + +/*? + * Text: "%s: Reading device feature codes failed with rc=%d\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * Description: + * The device feature codes state which advanced features are supported by a + * device. + * Examples for advanced features are PAV or high performance FICON. + * Some early devices do not provide feature codes and no advanced features are + * available on these devices. + * User action: + * None, if the DASD does not provide feature codes. If the DASD provides + * feature codes, make sure that it is working correctly, then set it offline + * and back online. + */ + +/*? + * Text: "%s: A channel path group could not be established\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * Initialization of a DASD did not complete because a channel path group + * could not be established. + * User action: + * Make sure that the DASD is working correctly, then try again to set it + * online. If initialization still fails, reboot. + */ + +/*? + * Text: "%s: The DASD is not operating in multipath mode\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD channel path group could not be configured to use multipath mode. + * This might negatively affect I/O performance on this DASD. + * User action: + * Make sure that the DASD is working correctly, then try again to set it + * online. If initialization still fails, reboot. + */ + +/*? + * Text: "%s: Detecting the DASD disk layout failed because of an I/O error\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The disk layout of the DASD could not be detected because of an unexpected + * I/O error. The DASD device driver treats the device like an unformatted DASD, + * and partitions on the device are not accessible. + * User action: + * If the DASD is formatted, make sure that the DASD is working correctly, + * then set it offline and back online. If the DASD is unformatted, format the + * DASD, for example, with dasdfmt. + * ATTENTION: Formatting irreversibly destroys all data on the DASD. + */ + +/*? + * Text: "%s: An I/O request was rejected because writing is inhibited\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * An I/O request was returned with an error indication of 'command reject' + * and 'write inhibited'. The most likely reason for this error is a + * failed write request to a device that was attached as read-only in z/VM. + * User action: + * Set the device offline, ensure that the device is configured correctly in + * z/VM, then set the device online again. + */ + +/*? + * Text: "%s: An Alias device was reassigned to a new base device with UID: %s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the alias + * @2: UID of new base device + * Description: + * The alias device with the indicated bus ID has been reassigned. The UID of the new base device is shown in the message. + * User action: + * None. + */ + +/*? + * Text: "%s: Detecting the maximum supported data size for zHPF requests failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * High Performance FICON (zHPF) requests are limited to a hardware-dependent + * maximum data size. The DASD device driver failed to detect this size and zHPF + * is not available for this device. + * User action: + * Set the device offline and online again. If this problem persists, gather + * Linux debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Reading device feature codes failed (rc=%d) for new path %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * @3: path mask + * Description: + * A new path has been made available to the a device. + * A command to read the device feature codes on this device returned an error. + * The new path will not be used for I/O. + * User action: + * Set the new path offline and online again to repeat the path verification. + * Alternatively, set the device offline and online again to + * verify all available paths for this device. + * If this problem persists, gather Linux debug data and report the problem + * to your support organization. + */ + +/*? + * Text: "%s: Detecting the maximum data size for zHPF requests failed (rc=%d) for a new path %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: return code + * @3: path mask + * Description: + * High Performance FICON (zHPF) requests are limited to a hardware-dependent + * maximum data size. A command to detect this size for + * a new path returned an error. The new path will not be used for I/O. + * User action: + * Set the new path offline and online again to repeat the path verification. + * Alternatively, set the device offline and online again to + * verify all available paths for this device. + * If this problem persists, gather Linux debug data and report the problem + * to your support organization. + */ + +/*? + * Text: "%s: The maximum data size for zHPF requests %u on a new path %x is below the active maximum %u\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * @2: size in bytes + * @3: path mask + * @4: size in bytes + * Description: + * High Performance FICON (zHPF) requests are limited to a hardware-dependent + * maximum data size. The maximum of the new path is below + * the previously established common maximum for the + * existing paths for this device. This could cause requests on the new + * path to fail. The new path will not be used for I/O. + * User action: + * Set the device offline and online again to establish a new common maximum + * data size for the device. + */ + +/*? + * Text: "%s: The device reservation was lost\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * This Linux instance has lost its reservation of the device to another + * operating system instance. Depending on the reservation policy for the + * device, I/O might be blocked until the other operating system instance + * surrenders the reservation or all I/O requests might fail until the + * device is reset. + * User action: + * None, if this situation is handled by system automation software. + * If this situation is not handled by automation, check the + * last_known_reservation_state attribute of the device in sysfs. + * If the value is 'lost', verify that the device is no longer reserved + * by another operating system instance, then set the device offline and + * online again. For any other value of the last_known_reservation_state + * no action is required. I/O will resume when the device reservation is + * surrendered by the other operating system instance. + */ + +/*? + * Text: "%s: The storage server does not support raw-track access\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD cannot be accessed in raw-track access mode because the storage + * server does not have all required features for this access mode. + * In raw-track access mode, the DASD device driver accesses complete ECKD + * tracks. + * By default, the DASD device driver accesses only the data fields of ECKD + * devices and omits the count and key data fields. + * User action: + * Ensure that the raw_track_access sysfs attribute of the DASD has the value + * 0 to access the device in default ECKD mode. + */ + +/*? + * Text: "%s: The newly added channel path %02X will not be used because it leads to a different device %s\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: logical path mask + * @3: UID + * Description: + * The newly added channel path has a different UID than the DASD device. This indicates + * an incorrect cabling. This path is not going to be used. + * User action: + * Check the cabling of the DASD device. Disconnect and reconnect the cable. + */ + +/*? + * Text: "%s: Not all channel paths lead to the same device, path %02X leads to device %s instead of %s\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: logical path mask + * @3: UID + * @4: UID + * Description: + * Some channel paths have a different UID than others. This indicates + * an incorrect cabling. The DASD device is not enabled. + * User action: + * Check cabling of the DASD device and retry to enable the device. + */ + +/*? + * Text: "Service on the storage server caused path %x.%02x to go offline" + * Severity: Warning + * Parameter: + * @1: channel subsystem ID + * @2: CHPID + * Description: + * A channel path to the DASD has been set offline because of + * a service action on the storage server. The path will be set back + * online automatically when the service action is completed. + * User action: + * None. + */ + +/*? + * Text: "Path %x.%02x is back online after service on the storage server" + * Severity: Informational + * Parameter: + * @1: channel subsystem ID + * @2: CHPID + * Description: + * A path had been set offline temporarily because of a service + * action on the storage server. + * The service action has completed, and the channel path is available + * again. + * User action: + * None. + */ + +/*? + * Text: "%s: High Performance FICON disabled\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * Description: + * High Performance FICON (HPF) has been disabled. Either the device + * lost HPF functionality, or none of the remaining channel paths are + * HPF capable. + * User action: + * Report the problem to your support organization. + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. + * Reset the device and channel paths by writing "all" or a logical path mask + * to the path_reset sysfs attribute of the device. + */ + +/*? + * Text: "%s: Channel path %02X lost HPF functionality and is disabled\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: logical path mask + * Description: + * A channel path has lost High Performance FICON (HPF) functionality + * and was removed from regular operations. + * User action: + * Report the problem to your support organization. + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. + * Reset the device and channel paths by writing "all" or a logical path mask + * to the path_reset sysfs attribute of the device. + */ + +/*? + * Text: "%s: Path %x.%02x (pathmask %02x) is disabled - IFCC threshold exceeded\n" + * Severity: Error + * Parameter: + * @1: bus ID of the DASD + * @2: cssid + * @3: chpid + * @4: logical path mask + * Description: + * Due to numerous interface or channel control checks (IFCCs), a channel path + * was removed from regular operations to retain good I/O performance. + * User action: + * Ensure that the cabling between the storage server and the mainframe + * system is securely in place. + * Reset the device and channel paths by writing "all" or a logical path mask + * to the path_reset sysfs attribute of the device. + * If the problem persists, report it to your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/dasd-fba +++ linux-azure-4.11.0/Documentation/kmsg/s390/dasd-fba @@ -0,0 +1,36 @@ + +/*? + * Text: "%s: New FBA DASD %04X/%02X (CU %04X/%02X) with %d MB and %d B/blk%s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the DASD + * @2: device type + * @3: device model + * @4: control unit type + * @5: control unit model + * @6: size + * @7: bytes per block + * @8: access mode + * Description: + * A DASD with the shown characteristics has been set online. + * If the DASD is configured as read-only to the real or virtual hardware, + * the message includes an indication of this hardware access mode. The + * hardware access mode is independent from the 'readonly' attribute of + * the device in sysfs. + * User action: + * None. + */ + +/*? + * Text: "%s: Allocating memory for private DASD data failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the DASD + * Description: + * The DASD device driver maintains data structures for each DASD it manages. + * There is not enough memory to allocate these data structures for one or + * more DASD. + * User action: + * Free some memory and try the operation again. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/dcssblk +++ linux-azure-4.11.0/Documentation/kmsg/s390/dcssblk @@ -0,0 +1,206 @@ +/*? + * Text: "Adjacent DCSSs %s and %s are not contiguous\n" + * Severity: Error + * Parameter: + * @1: name 1 + * @2: name 2 + * Description: + * You can only map a set of two or more DCSSs to a single DCSS device if the + * DCSSs in the set form a contiguous memory space. The DCSS device cannot be + * created because there is a memory gap between two adjacent DCSSs. + * User action: + * Ensure that you have specified all DCSSs that belong to the set. Check the + * definitions of the DCSSs on the z/VM hypervisor to verify that they form + * a contiguous memory space. + */ + +/*? + * Text: "DCSS %s and DCSS %s have incompatible types\n" + * Severity: Error + * Parameter: + * @1: name 1 + * @2: name 2 + * Description: + * You can only map a set of two or more DCSSs to a single DCSS device if + * either all DCSSs in the set have the same type or if the set contains DCSSs + * of the two types EW and EN but no other type. The DCSS device cannot be + * created because at least two of the specified DCSSs are not compatible. + * User action: + * Check the definitions of the DCSSs on the z/VM hypervisor to verify that + * their types are compatible. + */ + +/*? + * Text: "DCSS %s is of type SC and cannot be loaded as exclusive-writable\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * You cannot load a DCSS device in exclusive-writable access mode if the DCSS + * devise maps to one or more DCSSs of type SC. + * User action: + * Load the DCSS in shared access mode. + */ + +/*? + * Text: "DCSS device %s is removed after a failed access mode change\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * To change the access mode of a DCSS device, all DCSSs that map to the device + * were unloaded. Reloading the DCSSs for the new access mode failed and the + * device is removed. + * User action: + * Look for related messages to find out why the DCSSs could not be reloaded. + * If necessary, add the device again. + */ + +/*? + * Text: "All DCSSs that map to device %s are saved\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request has been submitted for the DCSS device. Changes to all DCSSs + * that map to the device are saved permanently. + * User action: + * None. + */ + +/*? + * Text: "Device %s is in use, its DCSSs will be saved when it becomes idle\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request for the device has been deferred until the device becomes + * idle. Then changes to all DCSSs that the device maps to will be saved + * permanently. + * User action: + * None. + */ + +/*? + * Text: "A pending save request for device %s has been canceled\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request for the DCSSs that map to a DCSS device has been pending + * while the device was in use. This save request has been canceled. Changes to + * the DCSSs will not be saved permanently. + * User action: + * None. + */ + +/*? + * Text: "Loaded %s with total size %lu bytes and capacity %lu sectors\n" + * Severity: Informational + * Parameter: + * @1: DCSS names + * @2: total size in bytes + * @3: total size in 512 byte sectors + * Description: + * The listed DCSSs have been verified as contiguous and successfully loaded. + * The displayed sizes are the sums of all DCSSs. + * User action: + * None. + */ + +/*? + * Text: "Device %s cannot be removed because it is not a known device\n" + * Severity: Warning + * Parameter: + * @1: device name + * Description: + * The DCSS device you are trying to remove is not known to the DCSS device + * driver. + * User action: + * List the entries under /sys/devices/dcssblk/ to see the names of the + * existing DCSS devices. + */ + +/*? + * Text: "Device %s cannot be removed while it is in use\n" + * Severity: Warning + * Parameter: + * @1: device name + * Description: + * You are trying to remove a device that is in use. + * User action: + * Make sure that all users of the device close the device before you try to + * remove it. + */ + +/*? + * Text: "Device %s has become idle and is being saved now\n" + * Severity: Informational + * Parameter: + * @1: device name + * Description: + * A save request for the DCSSs that map to a DCSS device has been pending + * while the device was in use. The device has become idle and all changes + * to the DCSSs are now saved permanently. + * User action: + * None. + */ + +/*? + * Text: "Writing to %s failed because it is a read-only device\n" + * Severity: Warning + * Parameter: + * @1: device name + * Description: + * The DCSS device is in shared access mode and cannot be written to. Depending + * on the type of the DCSSs that the device maps to, you might be able to + * change the access mode to exclusive-writable. + * User action: + * If the DCSSs of the device are of type SC, do not attempt to write to the + * device. If the DCSSs of the device are of type ER or SR, change the access + * mode to exclusive-writable before writing to the device. + */ + +/*? + * Text: "The address range of DCSS %s changed while the system was suspended\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * After resuming the system, the start address or end address of a DCSS does + * not match the address when the system was suspended. DCSSs must not be + * changed after the system was suspended. + * This error cannot be recovered. The system is stopped with a kernel panic. + * User action: + * Reboot Linux. + */ + +/*? + * Text: "Suspending the system failed because DCSS device %s is writable\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * A system cannot be suspended if one or more DCSSs are accessed in exclusive- + * writable mode. DCSS segment types EW, SW, and EN are always writable and + * must be removed before a system is suspended. + * User action: + * Remove all DCSSs of segment types EW, SW, and EN by writing the DCSS name to + * the sysfs 'remove' attribute. Set the access mode for all DCSSs of segment + * types SR and ER to read-only by writing 1 to the sysfs 'shared' attribute of + * the DCSS. Then try again to suspend the system. + */ + +/*? + * Text: "DCSS %s is of type SN or EN and cannot be saved\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * Description: + * DCSSs of type SN or EN cannot be saved. + * User action: + * If the DCSS was set up with the intention to prevent the content from being saved, + * no action is necessary. + * To be able to save the content, you must define the DCSS with a type other than SN or EN. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/diag288_wdt +++ linux-azure-4.11.0/Documentation/kmsg/s390/diag288_wdt @@ -0,0 +1,66 @@ +/*? + * Text: "The watchdog cannot be activated\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to activate the diag288 watchdog. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays inactive. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "The watchdog cannot be initialized\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to initialize the diag288 watchdog. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays inactive. + * A possible reason for this error is that your real or virtual hardware does not support + * the diag288 watchdog. + * User action: + * Confirm that the diag288 watchdog is supported in your environment. + * Use a watchdog that is supported in your environment. + */ + +/*? + * Text: "The watchdog cannot be deactivated\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to deactivate the diag288 watchdog. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays active and a watchdog timeout will trigger the configured timeout action. + * The diag288 watchdog device driver might intentionally be configured to prevent deactivation. + * User action: + * You can configure the diag288 watchdog device driver such that it can be deactivated. + * If the diag288 device driver has been compiled as a separate module, diag288_wdt, reload the module + * without specifying the 'nowayout' module parameter. + * If the diag288 device driver has been compiled into your kernel, + * reboot Linux without specifying the 'diag288.nowayout' kernel parameter'. + */ + +/*? + * Text: "The watchdog timer cannot be started or reset\n" + * Severity: Error + * Description: + * Diagnose instruction 0x288 was called to start the diag288 watchdog or to set timer back to zero. + * The diagnose call returned an error that cannot be handled by the device driver. + * The watchdog stays inactive or becomes inactive. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "Linux cannot be suspended while the watchdog is in use\n" + * Severity: Error + * Description: + * The watchdog must not time out while Linux is suspended. + * Therefore, the diag288 watchdog device driver prevents Linux from being suspended + * while the watchdog is in use. + * User action: + * i) Stop the watchdog application. ii) If the problem persists, close the watchdog + * device node by issuing 'echo V > /dev/watchdog'. + * iii) If the device driver still prevents Linux from being suspended, + * contact your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/extmem +++ linux-azure-4.11.0/Documentation/kmsg/s390/extmem @@ -0,0 +1,293 @@ +/*? + * Text: "Querying a DCSS type failed with rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: return code + * Description: + * The DCSS kernel interface used z/VM diagnose call X'64' to query the + * type of a DCSS. z/VM failed to determine the type and returned an error. + * User action: + * Look for related messages to find out which DCSS is affected. + * For details about the return codes see the section about DIAGNOSE Code + * X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Loading DCSS %s failed with rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * @2: return code + * Description: + * The DCSS kernel interface used diagnose call X'64' to load a DCSS. z/VM + * failed to load the DCSS and returned an error. + * User action: + * For details about the return codes see the section about DIAGNOSE Code + * X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "DCSS %s of range %p to %p and type %s loaded as exclusive-writable\n" + * Severity: Informational + * Parameter: + * @1: DCSS name + * @2: starting page address + * @3: ending page address + * @4: DCSS type + * Description: + * The DCSS was loaded successfully in exclusive-writable access mode. + * User action: + * None. + */ + +/*? + * Text: "DCSS %s of range %p to %p and type %s loaded in shared access mode\n" + * Severity: Informational + * Parameter: + * @1: DCSS name + * @2: starting page address + * @3: ending page address + * @4: DCSS type + * Description: + * The DCSS was loaded successfully in shared access mode. + * User action: + * None. + */ + +/*? + * Text: "DCSS %s is already in the requested access mode\n" + * Severity: Informational + * Parameter: + * @1: DCSS name + * Description: + * A request to reload a DCSS with a new access mode has been rejected + * because the new access mode is the same as the current access mode. + * User action: + * None. + */ + +/*? + * Text: "DCSS %s is in use and cannot be reloaded\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * Description: + * Reloading a DCSS in a different access mode has failed because the DCSS is + * being used by one or more device drivers. The DCSS remains loaded with the + * current access mode. + * User action: + * Ensure that the DCSS is not used by any device driver then try again to + * load the DCSS with the new access mode. + */ + +/*? + * Text: "DCSS %s overlaps with used memory resources and cannot be reloaded\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * Description: + * The DCSS has been unloaded and cannot be reloaded because it overlaps with + * another loaded DCSS or with the memory of the z/VM guest virtual machine + * (guest storage). + * User action: + * Ensure that no DCSS is loaded that has overlapping memory resources + * with the DCSS you want to reload. If the DCSS overlaps with guest storage, + * use the DEF STORE CONFIG z/VM CP command to create a sufficient storage gap + * for the DCSS. For details, see the section about the DCSS device driver in + * "Device Drivers, Features, and Commands". + */ + +/*? + * Text: "Reloading DCSS %s failed with rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: DCSS name + * @2: return code + * Description: + * The DCSS kernel interface used z/VM diagnose call X'64' to reload a DCSS + * in a different access mode. The DCSS was unloaded but z/VM failed to reload + * the DCSS. + * User action: + * For details about the return codes see the section about DIAGNOSE Code + * X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "Unloading unknown DCSS %s failed\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The specified DCSS cannot be unloaded. The DCSS is known to the DCSS device + * driver but not to the DCSS kernel interface. This problem indicates a + * program error in extmem.c. + * User action: + * Report this problem to your support organization. + */ + +/*? + * Text: "Saving unknown DCSS %s failed\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The specified DCSS cannot be saved. The DCSS is known to the DCSS device + * driver but not to the DCSS kernel interface. This problem indicates a + * program error in extmem.c. + * User action: + * Report this problem to your support organization. + */ + +/*? + * Text: "Saving a DCSS failed with DEFSEG response code %i\n" + * Severity: Error + * Parameter: + * @1: response-code + * Description: + * The DEFSEG z/VM CP command failed to permanently save changes to a DCSS. + * User action: + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP DEFSEG command (typically privilege class E). + * Look for related messages to find the cause of this error. See also message + * HCPE in the DEFSEG section of the "z/VM CP Command and + * Utility Reference". + */ + +/*? + * Text: "Saving a DCSS failed with SAVESEG response code %i\n" + * Severity: Error + * Parameter: + * @1: response-code + * Description: + * The SAVESEG z/VM CP command failed to permanently save changes to a DCSS. + * User action: + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP SAVESEG command (typically privilege class E). + * Look for related messages to find the cause of this error. See also message + * HCPE in the SAVESEG section of the "z/VM CP Command and + * Utility Reference". + */ + +/*? + * Text: "DCSS %s cannot be loaded or queried\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You cannot load or query the specified DCSS because it either is not defined + * in the z/VM hypervisor, or it is a class S DCSS, or it is above 2047 MB + * and the Linux system is a 31-bit system. + * User action: + * Use the CP command "QUERY NSS" to find out if the DCSS is a valid + * DCSS that can be loaded. + */ + +/*? + * Text: "DCSS %s cannot be loaded or queried without z/VM\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * A DCSS is a z/VM resource. Your Linux instance is not running as a z/VM + * guest operating system and, therefore, cannot load DCSSs. + * User action: + * Load DCSSs only on Linux instances that run as z/VM guest operating systems. + */ + +/*? + * Text: "Loading or querying DCSS %s resulted in a hardware error\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * Either the z/VM DIAGNOSE X'64' query or load call issued for the DCSS + * returned with an error. + * User action: + * Look for previous extmem message to find the return code from the + * DIAGNOSE X'64' query or load call. For details about the return codes see + * the section about DIAGNOSE Code X'64' in "z/VM CP Programming Services". + */ + +/*? + * Text: "DCSS %s has multiple page ranges and cannot be loaded or queried\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You can only load or query a DCSS with multiple page ranges if: + * - The DCSS has 6 or fewer page ranges + * - The page ranges form a contiguous address space + * - The page ranges are of type EW or EN + * User action: + * Check the definition of the DCSS to make sure that the conditions for + * DCSSs with multiple page ranges are met. + */ + +/*? + * Text: "%s needs used memory resources and cannot be loaded or queried\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You cannot load or query the DCSS because it overlaps with an already + * loaded DCSS or with the memory of the z/VM guest virtual machine + * (guest storage). + * User action: + * Ensure that no DCSS is loaded that has overlapping memory resources + * with the DCSS you want to load or query. If the DCSS overlaps with guest + * storage, use the DEF STORE CONFIG z/VM CP command to create a sufficient + * storage gap for the DCSS. For details, see the section about the DCSS + * device driver in "Device Drivers, Features, and Commands". + */ + +/*? + * Text: "DCSS %s is already loaded in a different access mode\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The DCSS you are trying to load has already been loaded in a different + * access mode. You cannot simultaneously load the DCSS in different modes. + * User action: + * Reload the DCSS in a different mode or load it with the same mode in which + * it has already been loaded. + */ + +/*? + * Text: "There is not enough memory to load or query DCSS %s\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The available memory is not enough to load or query the DCSS. + * User action: + * Free some memory and repeat the failed operation. + */ + +/*? + * Text: "DCSS %s overlaps with used storage and cannot be loaded\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * You cannot load the DCSS because it overlaps with an already loaded DCSS + * or with the memory of the z/VM guest virtual machine (guest storage). + * User action: + * Ensure that no DCSS is loaded that has overlapping memory resources + * with the DCSS you want to load. If the DCSS overlaps with guest storage, + * use the DEF STORE CONFIG z/VM CP command to create a sufficient storage gap + * for the DCSS. For details, see the section about the DCSS device driver in + * "Device Drivers, Features, and Commands". + */ + +/*? + * Text: "DCSS %s exceeds the kernel mapping range (%lu) and cannot be loaded\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * @2: kernel mapping range in bytes + * Description: + * You cannot load the DCSS because it exceeds the kernel mapping range limit. + * User action: + * Ensure that the DCSS range is defined below the kernel mapping range. + */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/hmcdrv +++ linux-azure-4.11.0/Documentation/kmsg/s390/hmcdrv @@ -0,0 +1,22 @@ +/*? + * Text: "Allocating the requested cache size of %zu bytes failed\n" + * Severity: Error + * Parameter: + * @1: size + * Description: + * You cannot use the 'hmcdrv' module. + * Either the cache size that was specified for the 'hmcdrv' module exceeded + * the maximum of 1048576 (1 megabyte), or not enough free memory was + * available. + * If the 'hmcdrv' module was compiled into the kernel, the cache size was + * specified with the 'hmcdrv.cachesize' kernel parameter. + * For a separate 'hmcdrv' module, the cache size was specified with the + * 'cachesize=' module parameter. + * User action: + * Specify a smaller cache size and try again to load the module. + * Do not exceed the maximum specification of 1048576 (1 megabyte). + * If necessary, free some memory and try again. + * If the module is compiled into the kernel, you must reboot Linux to change + * the cache size specification. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/hugetlb +++ linux-azure-4.11.0/Documentation/kmsg/s390/hugetlb @@ -0,0 +1,13 @@ +/*? + * Text: "hugepagesz= specifies an unsupported page size %s\n" + * Severity: Error + * Parameter: + * @1: size + * Description: + * The hugepagesz= kernel parameter specifies a huge page size + * that is not supported. + * User action: + * Specify "1M" for 1 MB huge pages. These are supported as of z10. + * Specify "2G" for 2 GB huge pages. These are supported as of zEC12 + * and zBC12 machines. + */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/hvc_iucv +++ linux-azure-4.11.0/Documentation/kmsg/s390/hvc_iucv @@ -0,0 +1,123 @@ +/*? + * Text: "The z/VM IUCV HVC device driver cannot be used without z/VM\n" + * Severity: Notice + * Description: + * The z/VM IUCV hypervisor console (HVC) device driver requires the + * z/VM inter-user communication vehicle (IUCV). + * User action: + * Set "hvc_iucv=" to zero in the kernel parameter line and reboot Linux. + */ + +/*? + * Text: "%lu is not a valid value for the hvc_iucv= kernel parameter\n" + * Severity: Error + * Parameter: + * @1: hvc_iucv_devices + * Description: + * The "hvc_iucv=" kernel parameter specifies the number of z/VM IUCV + * hypervisor console (HVC) terminal devices. + * The parameter value ranges from 0 to 8. + * If zero is specified, the z/VM IUCV HVC device driver is disabled + * and no IUCV-based terminal access is available. + * User action: + * Correct the "hvc_iucv=" setting in the kernel parameter line and + * reboot Linux. + */ + +/*? + * Text: "Creating a new HVC terminal device failed with error code=%d\n" + * Severity: Error + * Parameter: + * @1: errno + * Description: + * The device driver initialization failed to allocate a new + * HVC terminal device. + * A possible cause of this problem is memory constraints. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your z/VM guest virtual machine. + */ + +/*? + * Text: "Registering HVC terminal device as Linux console failed\n" + * Severity: Error + * Description: + * The device driver initialization failed to set up the first HVC terminal + * device for use as Linux console. + * User action: + * If the error code is -12 (ENOMEM), consider assigning more memory + * to your z/VM guest virtual machine. + */ + +/*? + * Text: "Registering IUCV handlers failed with error code=%d\n" + * Severity: Error + * Parameter: + * @1: errno + * Description: + * The device driver initialization failed to register with z/VM IUCV to + * handle IUCV connections, as well as sending and receiving of IUCV messages. + * User action: + * Check for related IUCV error messages and see the errno manual page + * to find out what caused the problem. + */ + +/*? + * Text: "Allocating memory failed with reason code=%d\n" + * Severity: Error + * Parameter: + * @1: reason + * Description: + * The z/VM IUCV hypervisor console (HVC) device driver initialization failed, + * because of a general memory allocation failure. The reason code indicates + * the memory operation that has failed: + * kmem_cache (reason code=1), + * mempool (reason code=2), or + * hvc_iucv_allow= (reason code=3) + * User action: + * Consider assigning more memory to your z/VM guest virtual machine. + */ + +/*? + * Text: "hvc_iucv_allow= does not specify a valid z/VM user ID list\n" + * Severity: Error + * Description: + * The "hvc_iucv_allow=" kernel parameter specifies a comma-separated list + * of z/VM user IDs that are permitted to connect to the z/VM IUCV hypervisor + * device driver. + * The z/VM user IDs in the list must not exceed eight characters and must + * not contain spaces. + * User action: + * Correct the "hvc_iucv_allow=" setting in the kernel parameter line and reboot + * Linux. + */ + +/*? + * Text: "hvc_iucv_allow= specifies too many z/VM user IDs\n" + * Severity: Error + * Description: + * The "hvc_iucv_allow=" kernel parameter specifies a comma-separated list + * of z/VM user IDs that are permitted to connect to the z/VM IUCV hypervisor + * device driver. + * The number of z/VM user IDs that are specified with the "hvc_iucv_allow=" + * kernel parameter exceeds the maximum of 500. + * User action: + * Correct the "hvc_iucv_allow=" setting by reducing the z/VM user IDs in + * the list and reboot Linux. + */ + +/*? + * Text: "A connection request from z/VM user ID %s was refused\n" + * Severity: Informational + * Parameter: + * @1: ID + * Description: + * An IUCV connection request from another z/VM guest virtual machine has been + * refused. The request was from a z/VM guest virtual machine that is not + * listed by the "hvc_iucv_allow=" kernel parameter. + * User action: + * Check the "hvc_iucv_allow=" kernel parameter setting. + * Consider adding the z/VM user ID to the "hvc_iucv_allow=" list in the kernel + * parameter line and reboot Linux. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/hypfs +++ linux-azure-4.11.0/Documentation/kmsg/s390/hypfs @@ -0,0 +1,56 @@ +/*? + * Text: "The hardware system does not support hypfs\n" + * Severity: Error + * Description: + * hypfs requires DIAGNOSE Code X'204' but this diagnose code is not available + * on your hardware. You need more recent hardware to use hypfs. + * User action: + * None. + */ + +/*? + * Text: "The hardware system does not provide all functions required by hypfs\n" + * Severity: Error + * Description: + * hypfs requires DIAGNOSE Code X'224' but this diagnode code is not available + * on your hardware. You need more recent hardware to use hypfs. + * User action: + * None. + */ + +/*? + * Text: "Updating the hypfs tree failed\n" + * Severity: Error + * Description: + * There was not enough memory available to update the hypfs tree. + * User action: + * Free some memory and try again to update the hypfs tree. Consider assigning + * more memory to your LPAR or z/VM guest virtual machine. + */ + +/*? + * Text: "%s is not a valid mount option\n" + * Severity: Error + * Parameter: + * @1: mount option + * Description: + * hypfs has detected mount options that are not valid. + * User action: + * See "Device Drivers Features and Commands" for information about valid + * mount options for hypfs. + */ + +/*? + * Text: "Initialization of hypfs failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: error code + * Description: + * Initialization of hypfs failed because of resource or hardware constraints. + * Possible reasons for this problem are insufficient free memory or missing + * hardware interfaces. + * User action: + * See errno.h for information about the error codes. + */ + +/*? Text: "Hypervisor filesystem mounted\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/iucv +++ linux-azure-4.11.0/Documentation/kmsg/s390/iucv @@ -0,0 +1,33 @@ +/*? + * Text: "Defining an interrupt buffer on CPU %i failed with 0x%02x (%s)\n" + * Severity: Warning + * Parameter: + * @1: CPU number + * @2: hexadecimal error value + * @3: short error code explanation + * Description: + * Defining an interrupt buffer for external interrupts failed. Error + * value 0x03 indicates a problem with the z/VM directory entry of the + * z/VM guest virtual machine. This problem can also be caused by a + * program error. + * User action: + * If the error value is 0x03, examine the z/VM directory entry of your + * z/VM guest virtual machine. If the directory entry is correct or if the + * error value is not 0x03, report this problem to your support organization. + */ + +/*? + * Text: "Suspending Linux did not completely close all IUCV connections\n" + * Severity: Warning + * Description: + * When resuming a suspended Linux instance, the IUCV base code found + * data structures from one or more IUCV connections that existed before the + * Linux instance was suspended. Modules that use IUCV connections must close + * these connections when a Linux instance is suspended. This problem + * indicates an error in a program that used an IUCV connection. + * User action: + * Report this problem to your support organization. + */ + +/*? Text: "iucv_external_interrupt: out of memory\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/lcs +++ linux-azure-4.11.0/Documentation/kmsg/s390/lcs @@ -0,0 +1,169 @@ +/*? + * Text: "%s: Allocating a socket buffer to interface %s failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: network interface + * Description: + * LAN channel station (LCS) devices require a socket buffer (SKB) structure + * for storing incoming data. The LCS device driver failed to allocate an SKB + * structure to the LCS device. A likely cause of this problem is memory + * constraints. + * User action: + * Free some memory and repeat the failed operation. + */ + +/*? + * Text: "%s: Shutting down the LCS device failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * Description: + * A request to shut down a LAN channel station (LCS) device resulted in an + * error. The error is logged in the LCS trace at trace level 4. + * User action: + * Try again to shut down the device. If the error persists, see the LCS trace + * to find out what causes the error. + */ + +/*? + * Text: "%s: Detecting a network adapter for LCS devices failed with rc=%d (0x%x)\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: lcs_detect return code in decimal notation + * @3: lcs_detect return code in hexadecimal notation + * Description: + * The LCS device driver could not initialize a network adapter. + * User action: + * Ensure that the physical connection from the port to the network is + * in place. If the error persists, note the return code from the error + * message and contact IBM support. + */ + +/*? + * Text: "%s: A recovery process has been started for the LCS device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * Description: + * The LAN channel station (LCS) device is shut down and restarted. The recovery + * process might have been initiated by a user or started automatically as a + * response to a device problem. + * User action: + * Wait until a message indicates the completion of the recovery process. + */ + +/*? + * Text: "%s: An I/O-error occurred on the LCS device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * Description: + * The LAN channel station (LCS) device reported a problem that can be recovered + * by the LCS device driver. Repeated occurrences of this problem indicate a + * malfunctioning device. + * User action: + * If this problem occurs frequently, initiate a recovery process for the + * device, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. + */ + +/*? + * Text: "%s: A command timed out on the LCS device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * Description: + * The LAN channel station (LCS) device reported a problem that can be recovered + * by the LCS device driver. Repeated occurrences of this problem indicate a + * malfunctioning device. + * User action: + * If this problem occurs frequently, initiate a recovery process for the + * device, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. + */ + +/*? + * Text: "%s: An error occurred on the LCS device, rc=%ld\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * @2: return code + * Description: + * The LAN channel station (LCS) device reported a problem that can be recovered + * by the LCS device driver. Repeated occurrences of this problem indicate a + * malfunctioning device. + * User action: + * If this problem occurs frequently, initiate a recovery process for the + * device, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. + */ + +/*? + * Text: "%s: The LCS device stopped because of an error, dstat=0x%X, cstat=0x%X \n" + * Severity: Warning + * Parameter: + * @1: bus ID of the LCS device + * @2: device status + * @3: subchannel status + * Description: + * The LAN channel station (LCS) device reported an error. The LCS device driver + * might start a device recovery process. + * User action: + * If the device driver does not start a recovery process, initiate a recovery + * process, for example, by writing '1' to the 'recover' sysfs attribute of the + * device. If the problem persists, note the status information provided with + * the message and contact IBM support. + */ + +/*? + * Text: "%s: Starting an LCS device resulted in an error, rc=%d!\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: ccw_device_start return code in decimal notation + * Description: + * The LAN channel station (LCS) device driver failed to initialize an LCS + * device. The device is not operational. + * User action: + * Initiate a recovery process, for example, by writing '1' to the 'recover' + * sysfs attribute of the device. If the problem persists, contact IBM support. + */ + +/*? + * Text: "%s: Sending data from the LCS device to the LAN failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the LCS device + * @2: ccw_device_resume return code in decimal notation + * Description: + * The LAN channel station (LCS) device driver could not send data to the LAN + * using the LCS device. This might be a temporary problem. Operations continue + * on the LCS device. + * User action: + * If this problem occurs frequently, initiate a recovery process, for example, + * by writing '1' to the 'recover' sysfs attribute of the device. If the + * problem persists, contact IBM support. + */ + +/*? Text: "Query IPAssist failed. Assuming unsupported!\n" */ +/*? Text: "Stoplan for %s initiated by LGW\n" */ +/*? Text: "Not enough memory to add new multicast entry!\n" */ +/*? Text: "Not enough memory for debug facility.\n" */ +/*? Text: "Adding multicast address failed. Table possibly full!\n" */ +/*? Text: "Error in opening device!\n" */ +/*? Text: "LCS device %s %s IPv6 support\n" */ +/*? Text: "Device %s successfully recovered!\n" */ +/*? Text: "LCS device %s %s Multicast support\n" */ +/*? Text: " Initialization failed\n" */ +/*? Text: "Loading %s\n" */ +/*? Text: "Initialization failed\n" */ +/*? Text: "Terminating lcs module.\n" */ +/*? Text: "Device %s could not be recovered!\n" */ +/*? Text: "Initializing the lcs device driver failed\n" */ +/*? Text: "%s: The lcs device driver failed to recover the device\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/monreader +++ linux-azure-4.11.0/Documentation/kmsg/s390/monreader @@ -0,0 +1,128 @@ +/*? + * Text: "Reading monitor data failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The z/VM *MONITOR record device driver failed to read monitor data + * because the IUCV REPLY function failed. The read function against + * the monitor record device returns EIO. All monitor data that has been read + * since the last read with 0 size is incorrect. + * User action: + * Disregard all monitor data that has been read since the last read with + * 0 size. If the device driver has been compiled as a separate module, unload + * and reload the monreader module. If the device driver has been compiled + * into the kernel, reboot Linux. For more information about possible causes + * of the error see the IUCV section in "z/VM CP Programming Services" and + * the *MONITOR section in "z/VM Performance". + */ + +/*? + * Text: "z/VM *MONITOR system service disconnected with rc=%i\n" + * Severity: Error + * Parameter: + * @1: IPUSER SEVER return code + * Description: + * The z/VM *MONITOR record device driver receives monitor records through + * an IUCV connection to the z/VM *MONITOR system service. This connection + * has been severed and the read function of the z/VM *MONITOR device driver + * returns EIO. All data received since the last read with 0 size is incorrect. + * User action: + * Disregard all monitor data read since the last read with 0 size. Close and + * reopen the monitor record device. For information about the IPUSER SEVER + * return codes see "z/VM Performance". + */ + +/*? + * Text: "The read queue for monitor data is full\n" + * Severity: Warning + * Description: + * The read function of the z/VM *MONITOR device driver returns EOVERFLOW + * because not enough monitor data has been read since the monitor device + * has been opened. Monitor data already read are valid and subsequent reads + * return valid data but some intermediate data might be missing. + * User action: + * Be aware that monitor data might be missing. Assure that you regularly + * read monitor data after opening the monitor record device. + */ + +/*? + * Text: "Connecting to the z/VM *MONITOR system service failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: IUCV CONNECT return code + * Description: + * The z/VM *MONITOR record device driver receives monitor records through + * an IUCV connection to the z/VM *MONITOR system service. This connection + * could not be established when the monitor record device was opened. If + * the return code is 15, your z/VM guest virtual machine is not authorized + * to connect to the *MONITOR system service. + * User action: + * If the return code is 15, ensure that the IUCV *MONITOR statement is + * included in the z/VM directory entry for your z/VM guest virtual machine. + * For other IUCV CONNECT return codes see the IUCV section in "CP Programming + * Services" and the *MONITOR section in "z/VM Performance". + */ + +/*? + * Text: "Disconnecting the z/VM *MONITOR system service failed with rc=%i\n" + * Severity: Warning + * Parameter: + * @1: IUCV SEVER return code + * Description: + * The z/VM *MONITOR record device driver receives monitor data through an + * IUCV connection to the z/VM *MONITOR system service. This connection + * could not be closed when the monitor record device was closed. You might + * not be able to resume monitoring. + * User action: + * No immediate action is necessary. If you cannot open the monitor record + * device in the future, reboot Linux. For information about the IUCV SEVER + * return codes see the IUCV section in "CP Programming Services" and the + * *MONITOR section in "z/VM Performance". + */ + +/*? + * Text: "The z/VM *MONITOR record device driver cannot be loaded without z/VM\n" + * Severity: Error + * Description: + * The z/VM *MONITOR record device driver uses z/VM system services to provide + * monitor data about z/VM guest operating systems to applications on Linux. + * On Linux instances that run in environments other than the z/VM hypervisor, + * the z/VM *MONITOR record device driver does not provide any useful + * function and the corresponding monreader module cannot be loaded. + * User action: + * Load the z/VM *MONITOR record device driver only on Linux instances that run + * as guest operating systems of the z/VM hypervisor. If the z/VM *MONITOR + * record device driver has been compiled into the kernel, ignore this message. + */ + +/*? + * Text: "The z/VM *MONITOR record device driver failed to register with IUCV\n" + * Severity: Error + * Description: + * The z/VM *MONITOR record device driver receives monitor data through an IUCV + * connection and needs to register with the IUCV device driver. This + * registration failed and the z/VM *MONITOR record device driver was not + * loaded. A possible cause of this problem is insufficient memory. + * User action: + * Free some memory and try again to load the module. If the z/VM *MONITOR + * record device driver has been compiled into the kernel, you might have to + * configure more memory and reboot Linux. If you do not want to read monitor + * data, ignore this message. + */ + +/*? + * Text: "The specified *MONITOR DCSS %s does not have the required type SC\n" + * Severity: Error + * Parameter: + * @1: DCSS name + * Description: + * The DCSS that was specified with the monreader.mondcss kernel parameter or + * with the mondcss module parameter cannot be a *MONITOR DCSS because it is + * not of type SC. + * User action: + * Confirm that you are using the name of the DCSS that has been configured as + * the *MONITOR DCSS on the z/VM hypervisor. If the default name, MONDCSS, is + * used, omit the monreader.mondcss or mondcss parameter. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/monwriter +++ linux-azure-4.11.0/Documentation/kmsg/s390/monwriter @@ -0,0 +1,17 @@ +/*? + * Text: "Writing monitor data failed with rc=%i\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The monitor stream application device driver used the z/VM diagnose call + * DIAG X'DC' to start writing monitor data. z/VM returned an error and the + * monitor data cannot be written. If the return code is 5, your z/VM guest + * virtual machine is not authorized to write monitor data. + * User action: + * If the return code is 5, ensure that your z/VM guest virtual machine's + * entry in the z/VM directory includes the OPTION APPLMON statement. + * For other return codes see the section about DIAGNOSE Code X'DC' + * in "z/VM CP Programming Services". + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/netiucv +++ linux-azure-4.11.0/Documentation/kmsg/s390/netiucv @@ -0,0 +1,156 @@ +/*? + * Text: "%s: The peer interface of the IUCV device has closed the connection\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * Description: + * The peer interface on the remote z/VM guest virtual machine has closed the + * connection. Do not expect further packets on this interface. Any packets + * you send to this interface will be dropped. + * User action: + * None. + */ + +/*? + * Text: "%s: The IUCV device failed to connect to z/VM guest %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the IUCV device + * @2: z/VM user ID + * Description: + * The connection cannot be established because the z/VM guest virtual + * machine with the peer interface is not running. + * User action: + * Ensure that the z/VM guest virtual machine with the peer interface is + * running; then try again to establish the connection. + */ + +/*? + * Text: "%s: The IUCV device failed to connect to the peer on z/VM guest %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the IUCV device + * @2: z/VM user ID + * Description: + * The connection cannot be established because the z/VM guest virtual machine + * with the peer interface is not configured for IUCV connections. + * User action: + * Configure the z/VM guest virtual machine with the peer interface for IUCV + * connections; then try again to establish the connection. + */ + +/*? + * Text: "%s: Connecting the IUCV device would exceed the maximum number of IUCV connections\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * Description: + * The connection cannot be established because the maximum number of IUCV + * connections has been reached on the local z/VM guest virtual machine. + * User action: + * Close some of the established IUCV connections on the local z/VM guest + * virtual machine; then try again to establish the connection. + */ + +/*? + * Text: "%s: z/VM guest %s has too many IUCV connections to connect with the IUCV device\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * Connecting to the remote z/VM guest virtual machine failed because the + * maximum number of IUCV connections for the remote z/VM guest virtual + * machine has been reached. + * User action: + * Close some of the established IUCV connections on the remote z/VM guest + * virtual machine; then try again to establish the connection. + */ + +/*? + * Text: "%s: The IUCV device cannot connect to a z/VM guest with no IUCV authorization\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * Description: + * Because the remote z/VM guest virtual machine is not authorized for IUCV + * connections, the connection cannot be established. + * User action: + * Add the statements 'IUCV ALLOW' and 'IUCV ANY' to the z/VM directory + * entry of the remote z/VM guest virtual machine; then try again to + * establish the connection. See "z/VM CP Planning and Administration" + * for details about the IUCV statements. + */ + +/*? + * Text: "%s: Connecting the IUCV device failed with error %d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the IUCV device + * @2: error code + * Description: + * The connection cannot be established because of an IUCV CONNECT error. + * User action: + * Report this problem to your support organization. + */ + +/*? + * Text: "%s: The IUCV device has been connected successfully to %s\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * The connection has been established and the interface is ready to + * transmit communication packages. + * User action: + * None. + */ + +/*? + * Text: "%s: The IUCV interface to %s has been established successfully\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * The IUCV interface to the remote z/VM guest virtual machine has been + * established and can be activated with "ifconfig up" or an equivalent + * command. + * User action: + * None. + */ + +/*? + * Text: "%s: The IUCV device is connected to %s and cannot be removed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * Removing a connection failed because the interface is active with a peer + * interface on a remote z/VM guest virtual machine. + * User action: + * Deactivate the interface with "ifconfig down" or an equivalent command; + * then try again to remove the interface. + */ + +/*? + * Text: "%s: The peer z/VM guest %s has closed the connection\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the IUCV device + * @2: remote z/VM user ID + * Description: + * The peer interface is no longer available. + * User action: + * Either deactivate and remove the interface, or wait for the peer + * z/VM guest to re-establish the interface. + */ + +/*? Text: "driver unloaded\n" */ +/*? Text: "driver initialized\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/numa +++ linux-azure-4.11.0/Documentation/kmsg/s390/numa @@ -0,0 +1,11 @@ +/*? + * Text: "NUMA mode: %s\n" + * Severity: Informational + * Parameter: + * @1: mode + * Description: + * Linux started with the specified NUMA mode. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/numa_emu +++ linux-azure-4.11.0/Documentation/kmsg/s390/numa_emu @@ -0,0 +1,50 @@ +/*? + * Text: "Not enough memory for %d nodes, reducing node count\n" + * Severity: Warning + * Parameter: + * @1: requested number of nodes + * Description: + * Using the requested memory stripe size for emulating the requested number of + * NUMA nodes requires more than the available memory. The number of nodes is + * specified with the emu_nodes= kernel parameter. The memory stripe size to + * be used for distributing the available memory among the nodes is specified + * with the emu_size= kernel parameter. Fewer nodes were created than the + * requested number; each node has one memory stripe of the requested size. + * User action: + * Specify fewer nodes, reduce the memory stripe size, or make more memory + * available to your Linux instance. + */ + +/*? + * Text: "Creating %d nodes with memory stripe size %ld MB\n" + * Severity: Informational + * Parameter: + * @1: number of nodes + * @2: stripe size + * Description: + * NUMA emulation is activated with the reported number of NUMA nodes. + * The specified memory stripe size is used to distribute, in round-robin + * fashion, the available memory among the nodes. + * User action: + * None. + */ + +/*? + * Text: "Increasing memory stripe size from %ld MB to %ld MB\n" + * Severity: Warning + * Parameter: + * @1: requested memory stripe size + * @2: adjusted memory stripe size + * Description: + * NUMA emulation could not use the requested memory stripe size and + * therefore has increased it to the next possible value. + * The requested memory stripe size is a default value or it was specified + * with the emu_size= kernel parameter. + * The memory stripe size must be a multiple of the memory block size that + * can be read in hexadecimal notation from + * /sys/devices/system/memory/block_size_bytes. + * User action: + * To avoid this message in the future, specify a valid memory stripe size + * with the emu_size= kernel parameter. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/os_info +++ linux-azure-4.11.0/Documentation/kmsg/s390/os_info @@ -0,0 +1,36 @@ +/*? + * Text: "entry %i: %s (addr=0x%lx size=%lu)\n" + * Severity: Informational + * Parameter: + * @1: entry ID + * @2: entry state + * @3: entry address + * @4: entry size + * Description: + * Linux is running in kdump mode and reports information defined by the + * previously running production kernel. Possible values for + * "entry state" are: + * + * - copied: The entry has been found, verified, and copied + * + * - not available: The entry has not been defined + * + * - checksum failed: The entry has been found, but it is not valid + * User action: + * If kdump fails, contact your service organization and include this message + * in the error report. + */ + +/*? + * Text: "crashkernel: addr=0x%lx size=%lu\n" + * Severity: Informational + * Parameter: + * @1: address + * @2: size + * Description: + * Linux is running in kdump mode and reports the address and size of + * the memory area that was reserved for kdump by the previously running + * production kernel. + * User action: + * None. + */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/perf +++ linux-azure-4.11.0/Documentation/kmsg/s390/perf @@ -0,0 +1,90 @@ +/*? + * Text: "CPU[%i] CPUM_CF: ver=%u.%u A=%04x E=%04x C=%04x\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: first version number + * @3: second version number + * @4: counter set authorization + * @5: counter set enable controls + * @6: counter set activation controls + * Description: + * This message displays information about the CPU-measurement counter facility + * (CPUM_CF) on a particular CPU. For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "CPU[%i] CPUM_SF: basic=%i diag=%i min=%lu max=%lu cpu_speed=%u\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: authorization status for the basic-sampling function + * @3: authorization status for the diagnostic-sampling function + * @4: minimum sampling interval + * @5: maximum sampling interval + * @6: cpu speed + * Description: + * This message displays generic information about the CPU-measurement sampling + * facility (CPUM_SF) on a particular CPU. For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "CPU[%i] CPUM_SF: Basic-sampling: a=%i e=%i c=%i bsdes=%i tear=%016lx dear=%016lx\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: authorization control + * @3: enable control + * @4: activation control + * @5: basic-sampling-data-entry size + * @6: tear register contents + * @7: dear register contents + * Description: + * This message displays information about the basic-sampling function of the + * CPU-measurement sampling facility (CPUM_SF) on a particular CPU. + * For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "CPU[%i] CPUM_SF: Diagnostic-sampling: a=%i e=%i c=%i dsdes=%i tear=%016lx dear=%016lx\n" + * Severity: Informational + * Parameter: + * @1: cpu number + * @2: authorization control + * @3: enable control + * @4: activation control + * @5: diagnostic-sampling-data-entry size + * @6: tear register contents + * @7: dear register contents + * Description: + * This message displays information about the diagnostic-sampling function of the + * CPU-measurement sampling facility (CPUM_SF) on a particular CPU. + * For details, see + * "The Load-Program-Parameter and the CPU-Measurement Facilities", SA23-2260. + * User action: + * None. + */ + +/*? + * Text: "The sampling facility is already reserved by %p\n" + * Severity: Warning + * Parameter: + * @1: address of perf sampling support owner + * Description: + * A process tried to reserve the sampling facility support, but it was already + * reserved by another process. + * User action: + * Check whether another process, for example, the perf program or OProfile is + * currently active. Retry activating the sampling facility after the other + * process has ended. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/prng +++ linux-azure-4.11.0/Documentation/kmsg/s390/prng @@ -0,0 +1,103 @@ +/* prng */ + +/*? + * Text: "prng runs in TDES mode with chunksize=%d and reseed_limit=%u\n" + * Severity: Informational + * Parameter: + * @1: read chunk size in bytes + * @2: reseed limit + * Description: + * The pseudo-random number device driver started in triple DES mode. + * For IBM mainframes earlier than IBM zEnterprise EC12 (zEC12), + * triple DES is the only available mode. + * As of zEC12, the preferred mode is SHA-512. + * User action: + * If triple DES is the expected mode, no action is required. + * Otherwise, verify that the prng started with the mode= module or + * prng.mode= kernel parameter set to a value other than 1. + * The value 1 forces triple DES mode. Also ensure that the mainframe + * runs with the latest firmware level. + */ + +/*? + * Text: "The prng module stopped after running in triple DES mode\n" + * Severity: Informational + * Description: + * The pseudo-random number device driver was running in triple DES mode. + * The device driver module, prng, was unloaded, or it stopped + * because Linux shut down. + * User action: + * None. + */ + +/*? + * Text: "The prng module cannot start in SHA-512 mode\n" + * Severity: Error + * Description: + * The pseudo-random number device driver was loaded with the mode= module parameter + * or the prng.mode= kernel parameter set to 2. This setting forces SHA-512 mode, + * but the required support for MSA 5 is not available. This support requires an IBM + * zEnterprise EC12 (zEC12) or later mainframe. + * User action: + * If your mainframe is earlier than zEC12, set the mode= module or + * prng.mode= kernel parameter to 0 or 1 to run the + * pseudo-random number device driver in triple DES mode. + * Otherwise, ensure that MSA 5 support available. + */ + +/*? + * Text: "prng runs in SHA-512 mode with chunksize=%d and reseed_limit=%u\n" + * Severity: Informational + * Parameter: + * @1: read chunk size in bytes + * @2: reseed limit + * Description: + * The pseudo-random number device driver started in SHA-512 mode. + * As of IBM zEnterprise EC12, this is the preferred mode. + * User action: + * None. + */ + +/*? + * Text: "The prng module stopped after running in SHA-512 mode\n" + * Severity: Informational + * Description: + * The pseudo-random number device driver was running in SHA-512 mode. + * The device driver module, prng, was unloaded, or stopped + * because Linux shut down. + * User action: + * None. + */ + +/*? + * Text: "The prng self test state test for the SHA-512 mode failed\n" + * Severity: Error + * Description: + * The pseudo-random number device driver is not operational because the self test failed. + * After processing a published National Institute of Standards and Technology (NIST) test vector for the + * Deterministic Random Bit Generator (DRBG) algorithm, the device driver + * was not in the expected working state. This failure might indicate + * that the cryptographic software or hardware is not working correctly. + * The processed NIST test vector was: Hash Drbg, Sha-512, Count #0. + * User action: + * Unload and reload the prng module, or + * if prng was compiled into the kernel, restart Linux. + * If the error persists, contact your support organization. + */ + +/*? + * Text: "The prng self test data test for the SHA-512 mode failed\n" + * Severity: Error + * Description: + * The pseudo-random number device driver is not operational because the self test failed. + * After processing a published National Institute of Standards and Technology (NIST) test vector for the + * Deterministic Random Bit Generator (DRBG) algorithm, the device driver + * did not produce the expected pseudo-random data. This failure might indicate + * that the cryptographic software or hardware is not working correctly. + * The processed NIST test vector was: Hash Drbg, Sha-512, Count #0. + * User action: + * Unload and reload the prng module, or + * if prng was compiled into the kernel, restart Linux. + * If the error persists, contact your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/qeth +++ linux-azure-4.11.0/Documentation/kmsg/s390/qeth @@ -0,0 +1,929 @@ +/*? + * Text: "%s: The LAN is offline\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A start LAN command was sent by the qeth device driver but the physical or + * virtual adapter has not started the LAN. The LAN might take a few seconds + * to become available. + * User action: + * Check the status of the qeth device, for example, with the lsqeth command. + * If the device does not become operational within a few seconds, initiate a + * recovery process, for example, by writing '1' to the 'recover' sysfs + * attribute of the device. + */ + +/*? + * Text: "%s: A recovery process has been started for the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A recovery process was started either by the qeth device driver or through + * a user command. + * User action: + * Wait until a message indicates the completion of the recovery process. + */ + +/*? + * Text: "%s: The qeth device driver failed to recover an error on the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth device driver performed an automatic recovery operation to recover + * an error on a qeth device. The recovery operation failed. + * User action: + * Try the following actions in the given order: i) Check the status of the + * qeth device, for example, with the lsqeth command. ii) Initiate a recovery + * process by writing '1' to the 'recover' sysfs attribute of the device. + * iii) Ungroup and regroup the subchannel triplet of the device. vi) Reboot + * Linux. v) If the problem persists, gather Linux debug data and report the + * problem to your support organization. + */ + +/*? + * Text: "%s: Device recovery failed to restore all offload features\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth device driver performed a recovery operation on a qeth device. Part + * of the recovery is to restore the offload features that were enabled before + * the recovery. At least one of those offload features could not be restored. + * User action: + * Check which offload features are enabled on the device, for example with + * the "ethtool -k" command. Try to explicitly re-enable the missing offload + * features for the device, for example with the "ethtool -K" command. + */ + +/*? + * Text: "%s: The link for interface %s on CHPID 0x%X failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * @3: CHPID + * Description: + * A network link failed. A possible reason for this error is that a physical + * network cable has been disconnected. + * User action: + * Ensure that the network cable on the adapter hardware is connected properly. + * If the connection is to a guest LAN, ensure that the device is still coupled + * to the guest LAN. + */ + +/*? + * Text: "%s: The link for %s on CHPID 0x%X has been restored\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * @3: CHPID + * Description: + * A failed network link has been re-established. A device recovery is in + * progress. + * User action: + * Wait until a message indicates the completion of the recovery process. + */ + +/*? + * Text: "%s: A hardware operation timed out on the device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A hardware operation timed out on the qeth device. + * User action: + * Check the status of the qeth device, for example, with the lsqeth command. + * If the device is not operational, initiate a recovery process, for example, + * by writing '1' to the 'recover' sysfs attribute of the device. + */ + +/*? + * Text: "%s: The adapter hardware is of an unknown type\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth device driver does not recognize the adapter hardware. The cause + * of this problem could be a hardware error or a Linux level that does not + * support your adapter hardware. + * User action: + * i) Investigate if your adapter hardware is supported by your Linux level. + * Consider using hardware that is supported by your Linux level or upgrading + * to a Linux level that supports your hardware. ii) Install the latest + * firmware on your adapter hardware. iii) If the problem persists and is not + * caused by a version mismatch, contact IBM support. + */ + +/*? + * Text: "%s: The adapter is used exclusively by another host\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth adapter is exclusively used by another host. + * User action: + * Use another qeth adapter or configure this one not exclusively to a + * particular host. + */ + +/*? + * Text: "%s: QDIO reported an error, rc=%i\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: return code + * Description: + * The QDIO subsystem reported an error. + * User action: + * Check for related QDIO errors. Check the status of the qeth device, for + * example, with the lsqeth command. If the device is not operational, initiate + * a recovery process, for example, by writing '1' to the 'recover' sysfs + * attribute of the device. + */ + +/*? + * Text: "%s: There is no kernel module to support discipline %d\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * @2: discipline + * Description: + * The qeth device driver or a user command requested a kernel module for a + * particular qeth discipline. Either the discipline is not supported by the + * qeth device driver or the requested module is not available to your Linux + * system. + * User action: + * Check if the requested discipline module has been compiled into the kernel + * or is present in /lib/modules//kernel/drivers/s390/net. + */ + +/*? + * Text: "Initializing the qeth device driver failed\n" + * Severity: Error + * Parameter: + * Description: + * The base module of the qeth device driver could not be initialized. + * User action: + * See errno.h to determine the reason for the error. + * i) Reboot Linux. ii) If the problem persists, gather Linux debug data and + * report the problem to your support organization. + */ + +/*? + * Text: "%s: Registering IP address %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: IP address + * Description: + * An IP address could not be registered with the network adapter. + * User action: + * Check if another operating system instance has already registered the + * IP address with the same network adapter or at the same logical IP subnet. + */ + +/*? + * Text: "%s: Reading the adapter MAC address failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth device driver could not read the MAC address from the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting ARP processing support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start ARP support on the network adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting IP fragmentation support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start IP fragmentation support on the + * network adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting VLAN support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start VLAN support on the network adapter. + * User action: + * None if you do not require VLAN support. If you need VLAN support, + * ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting multicast support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not start multicast support on the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Activating IPv6 support for %s failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not activate IPv6 support on the network + * adapter. + * User action: + * None if you do not require IPv6 communication. If you need IPv6 support, + * ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Enabling the passthrough mode for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not enable the passthrough mode on the + * network adapter. The passthrough mode is required for all network traffic + * other than IPv4. In particular, the passthrough mode is required for IPv6 + * traffic. + * User action: + * None if all you want to support is IPv4 communication. If you want to support + * IPv6 or other network traffic apart from IPv4, ungroup and regroup the + * subchannel triplet of the device. If this does not resolve the problem, + * reboot Linux. If the problem persists, gather Linux debug data and report + * the problem to your support organization. + */ + +/*? + * Text: "%s: Enabling broadcast filtering for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not enable broadcast filtering on the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Setting up broadcast filtering for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not set up broadcast filtering on the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Setting up broadcast echo filtering for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not set up broadcast echo filtering on the + * network adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: Starting HW checksumming for %s failed, using SW checksumming\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The network adapter supports hardware checksumming for IP packages + * but the qeth device driver could not start hardware checksumming on the + * adapter. The qeth device driver continues to use software checksumming for + * IP packages. + * User action: + * None if you do not require hardware checksumming for network + * traffic. If you want to enable hardware checksumming, ungroup and regroup + * the subchannel triplet of the device. If this does not resolve the problem, + * reboot Linux. If the problem persists, gather Linux debug data and report + * the problem to your support organization. + */ + +/*? + * Text: "%s: Enabling HW checksumming for %s failed, using SW checksumming\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The network adapter supports hardware checksumming for IP packages + * but the qeth device driver could not enable hardware checksumming on the + * adapter. The qeth device driver continues to use software checksumming for + * IP packages. + * User action: + * None if you do not require hardware checksumming for network + * traffic. If you want to enable hardware checksumming, ungroup and regroup + * the subchannel triplet of the device. If this does not resolve the problem, + * reboot Linux. If the problem persists, gather Linux debug data and report + * the problem to your support organization. + */ + +/*? + * Text: "%s: Starting outbound TCP segmentation offload for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The network adapter supports TCP segmentation offload, but the qeth device + * driver could not start this support on the adapter. + * User action: + * None if you do not require TCP segmentation offload. If you want to + * enable TCP segmentation offload, ungroup and regroup the subchannel triplet + * of the device. If this does not resolve the problem, reboot Linux. If the + * problem persists, gather Linux debug data and report the problem to your + * support organization. + */ + +/*? + * Text: "%s: The network adapter failed to generate a unique ID\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * In IBM mainframe environments, network interfaces are not identified by + * a specific MAC address. Therefore, the network adapters provide the network + * interfaces with unique IDs to be used in their IPv6 link local addresses. + * Without such a unique ID, duplicate addresses might be assigned in other + * LPARs. + * User action: + * Install the latest firmware on the adapter hardware. Manually, configure + * an IPv6 link local address for this device. + */ + +/*? + * Text: "There is no IPv6 support for the layer 3 discipline\n" + * Severity: Warning + * Description: + * If you want to use IPv6 with the layer 3 discipline, you need a Linux kernel + * with IPv6 support. Because your Linux kernel has not been compiled with + * IPv6 support, you cannot use IPv6 with the layer 3 discipline, even if your + * adapter supports IPv6. + * User action: + * Use a Linux kernel that has been complied to include IPv6 support if you + * want to use IPv6 with layer 3 qeth devices. + */ + +/*? + * Text: "%s: The qeth device is not configured for the OSI layer required by z/VM\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A qeth device that connects to a virtual network on z/VM must be configured for the + * same Open Systems Interconnection (OSI) layer as the virtual network. An ETHERNET + * guest LAN or VSWITCH uses the data link layer (layer 2) while an IP guest LAN + * or VSWITCH uses the network layer (layer 3). + * User action: + * If you are connecting to an ETHERNET guest LAN or VSWITCH, set the layer2 sysfs + * attribute of the qeth device to 1. If you are connecting to an IP guest LAN or + * VSWITCH, set the layer2 sysfs attribute of the qeth device to 0. + */ + +/*? + * Text: "%s: Starting source MAC-address support for %s failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The qeth device driver could not enable source MAC-address on the network + * adapter. + * User action: + * Ungroup and regroup the subchannel triplet of the device. If this does not + * resolve the problem, reboot Linux. If the problem persists, gather Linux + * debug data and report the problem to your support organization. + */ + +/*? + * Text: "%s: MAC address %pM already exists\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: MAC-address + * Description: + * Setting the MAC address for the qeth device fails, because this + * MAC address is already defined on the OSA CHPID. + * User action: + * Use a different MAC address for this qeth device. + */ + +/*? + * Text: "%s: MAC address %pM is not authorized\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: MAC-address + * Description: + * This qeth device is a virtual network interface card (NIC), to which z/VM + * has already assigned a MAC address. z/VM MAC address verification does + * not allow you to change this predefined address. + * User action: + * None; use the MAC address that has been assigned by z/VM. + */ + +/*? + * Text: "%s: The HiperSockets network traffic analyzer is activated\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * The corresponding HiperSockets interface has been switched into promiscuous mode. + * As a result, the HiperSockets network traffic analyzer is started on the device. + * User action: + * None. + */ + + /*? + * Text: "%s: The HiperSockets network traffic analyzer is deactivated\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * Promiscuous mode has been switched off for the corresponding HiperSockets interface + * As a result, the HiperSockets network traffic analyzer is stopped on the device. + * User action: + * None. + */ + +/*? + * Text: "%s: The device is not authorized to run as a HiperSockets network traffic analyzer\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * The corresponding HiperSockets interface is switched into promiscuous mode + * but the network traffic analyzer (NTA) rules configured at the Support Element (SE) + * do not allow tracing. Possible reasons are: + * - Tracing is not authorized for all HiperSockets LANs in the mainframe system + * - Tracing is not authorized for this HiperSockets LAN + * - LPAR is not authorized to enable an NTA + * User action: + * Configure appropriate HiperSockets NTA rules at the SE. + */ + +/*? + * Text: "%s: A HiperSockets network traffic analyzer is already active in the HiperSockets LAN\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The sysfs 'sniffer' attribute of the HiperSockets device has the value '1'. + * The HiperSockets interface is switched into promiscuous mode but another + * HiperSockets device on the same HiperSockets LAN is already running as + * a network traffic analyzer. + * A HiperSockets LAN can only have one active network traffic analyzer. + * User action: + * Do not configure multiple HiperSockets devices in the same HiperSockets LAN as + * tracing devices. + */ + +/*? + * Text: "%s: Enabling HW TX checksumming for %s failed, using SW TX checksumming\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * Description: + * The network adapter supports hardware checksumming for outgoing IP packages + * but the qeth device driver could not enable hardware TX checksumming on the + * adapter. The qeth device driver continues to use software checksumming for + * outgoing IP packages. + * User action: + * None if you do not require hardware checksumming for outgoing network + * traffic. If you want to enable hardware checksumming, ungroup and regroup + * the subchannel triplet of the device. If this does not resolve the problem, + * reboot Linux. If the problem persists, gather Linux debug data and report + * the problem to your support organization. + */ + +/*? + * Text: "%s: A connection could not be established because of an OLM limit\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * z/OS has activated Optimized Latency Mode (OLM) for a connection through an OSA Express3 adapter. + * This reduces the maximum number of concurrent connections per physical port for shared adapters. + * The new connection would exceed the maximum. Linux cannot establish further connections using + * this adapter. + * User action: + * If possible, deactivate an existing connection that uses this adapter and try again to establish + * the new connection. If you cannot free an existing connection, use a different adapter for the + * new connection. + */ + +/*? + * Text: "%s: Setting the device online failed because of insufficient authorization\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The qeth device is configured with OSX CHPIDs. An OSX CHPID cannot be activated unless the LPAR is explicitly authorized to access it. + * For z/VM guest operating systems, the z/VM user ID must be explicitly authorized in addition to the LPAR. + * You grant these authorizations through the Service Element. + * User action: + * At the Service Element, authorize the LPAR and, if applicable, the z/VM user ID for using the OSX CHPIDs with which the qeth device has been configured. + * Then try again to set the device online. + */ + +/*? + * Text: "%s: portname is deprecated and is ignored\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * An OSA-Express port name was required to identify a shared OSA port. + * All operating system instances that shared the port had to use the same port name. + * This requirement no longer applies, and the specified portname attribute is ignored. + * User action: + * For future upgrades, remove OSA port name specifications from your + * network configuration. + */ + +/*? Text: "core functions removed\n" */ +/*? Text: "%s: Device is a%s card%s%s%s\nwith link type %s.\n" */ +/*? Text: "%s: issue_next_read failed: no iob available!\n" */ +/*? Text: "%s: Priority Queueing not supported\n" */ +/*? Text: "%s: sense data available. cstat 0x%X dstat 0x%X\n" */ +/*? Text: "loading core functions\n" */ +/*? Text: "%s: MAC address %pM successfully registered on device %s\n" */ +/*? Text: "%s: Device successfully recovered!\n" */ +/*? Text: "register layer 2 discipline\n" */ +/*? Text: "unregister layer 2 discipline\n" */ +/*? Text: "%s: Hardware IP fragmentation not supported on %s\n" */ +/*? Text: "%s: IPv6 not supported on %s\n" */ +/*? Text: "%s: VLAN not supported on %s\n" */ +/*? Text: "%s: Inbound source MAC-address not supported on %s\n" */ +/*? Text: "%s: IPV6 enabled\n" */ +/*? Text: "%s: ARP processing not supported on %s!\n" */ +/*? Text: "%s: Hardware IP fragmentation enabled \n" */ +/*? Text: "%s: set adapter parameters not supported.\n" */ +/*? Text: "%s: VLAN enabled\n" */ +/*? Text: "register layer 3 discipline\n" */ +/*? Text: "%s: Outbound TSO enabled\n" */ +/*? Text: "%s: Broadcast not supported on %s\n" */ +/*? Text: "%s: Outbound TSO not supported on %s\n" */ +/*? Text: "%s: Inbound HW Checksumming not supported on %s,\ncontinuing using Inbound SW Checksumming\n" */ +/*? Text: "%s: Using no checksumming on %s.\n" */ +/*? Text: "%s: Broadcast enabled\n" */ +/*? Text: "%s: Multicast not supported on %s\n" */ +/*? Text: "%s: Using SW checksumming on %s.\n" */ +/*? Text: "%s: HW Checksumming (%sbound) enabled\n" */ +/*? Text: "unregister layer 3 discipline\n" */ +/*? Text: "%s: Multicast enabled\n" */ +/*? Text: "%s: QDIO data connection isolation is deactivated\n" */ +/*? Text: "%s: QDIO data connection isolation is activated\n" */ +/*? Text: "%s: Adapter does not support QDIO data connection isolation\n" */ +/*? Text: "%s: Adapter is dedicated. QDIO data connection isolation not supported\n" */ +/*? Text: "%s: TSO does not permit QDIO data connection isolation\n" */ +/*? Text: "%s: HW TX Checksumming enabled\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "qeth_l3: ignoring TR device\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ + +/*? + * Text: "%s: Turning off reflective relay mode at the adjacent switch failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The policy for the QDIO data connection isolation was + * changed successfully, and communications are now handled according to the + * new policy. The ISOLATION_FORWARD policy is no longer used, but the qeth + * device driver could not turn off the reflective relay mode on the adjacent + * switch port. + * User action: + * Check the adjacent switch for errors and correct the problem. + */ + +/*? + * Text: "%s: The adjacent switch port does not support reflective relay mode\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The 'isolation' sysfs attribute of the qeth device could not be set to 'forward'. + * This setting selects the ISOLATION_FORWARD policy for the QDIO data connection + * isolation. The ISOLATION_FORWARD policy requires a network adapter in Virtual + * Ethernet Port Aggregator (VEPA) mode with an adjacent switch port in reflective + * relay mode. + * User action: + * Use a switch port that supports reflective relay mode if you want to use the + * ISOLATION_FORWARD policy for the qeth device. + */ + +/*? + * Text: "%s: The reflective relay mode cannot be enabled at the adjacent switch port" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The 'isolation' sysfs attribute of the qeth device could not be set to 'forward'. + * This setting selects the ISOLATION_FORWARD policy for the QDIO data connection + * isolation. The ISOLATION_FORWARD policy requires a network adapter in Virtual + * Ethernet Port Aggregator (VEPA) mode with an adjacent switch port in reflective relay + * mode. The qeth device driver failed to enable the required reflective relay mode on + * the adjacent switch port although the switch port supports this mode. + * User action: + * Enable reflective relay mode on the switch for the adjacent port and try again. + */ + +/*? + * Text: "%s: Interface %s is down because the adjacent port is no longer in reflective relay mode\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * @2: interface name + * Description: + * The ISOLATION_FORWARD policy is active for the QDIO data connection isolation + * of the qeth device. This policy requires a network adapter in Virtual Ethernet + * Port Aggregator (VEPA) mode with an adjacent switch port in reflective relay mode. + * The reflective relay mode on the adjacent switch port was disabled. The qeth device + * was set offline and the interface was deactivated to prevent any unintended network traffic. + * User action: + * Enable the reflective relay mode again on the adjacent port or use the 'isolation' + * sysfs attribute of the qeth device to set a different policy for the QDIO data connection + * isolation. You can then resume operations by setting the qeth device back + * online and activating the interface. + */ + +/*? + * Text: "%s: Failed to create completion queue\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device could not be configured with a completion queue. + * A completion queue is required to operate AF_IUCV communication in an LPAR. + * User action: + * i) Investigate if you have the latest firmware level in place. + * ii) If the problem persists and is not caused by a version mismatch, contact IBM + * support. + */ + +/*? + * Text: "%s: Completion Queueing supported\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device supports completion queueing. This is required to + * set up AF_IUCV communication in an LPAR. + */ + +/*? + * Text: "%s: Completion Queue support enabled" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device is enabled for completion queueing. This is part of + * the process to set up AF_IUCV communication in an LPAR. + */ + +/*? + * Text: "%s: Completion Queue support disabled" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The HiperSockets device is disabled for completion queueing. This device + * cannot or no longer be used to set up AF_IUCV communication in an LPAR. + */ + +/*? + * Text: "%s: The device represents a Bridge Capable Port\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * Description: + * You can configure this device as a Bridge Port. + * User action: + * None. + */ + +/*? + * Text: "%s: The device is not configured as a Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The Bridge Port role cannot be withdrawn from a device + * that is not configured as a Bridge Port. + * User action: + * None. + */ + +/*? + * Text: "%s: The LAN already has a primary Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A LAN can have multiple secondary Bridge Ports, but only + * one primary Bridge Port. Configuring the device as a + * primary Bridge Port failed because another port on the + * LAN has been configured as the primary Bridge Port. + * User action: + * Find out which operating system instance has configured the primary + * Bridge Port. Assure that the primary role for this port is withdrawn + * before trying again to configure your device as the primary Bridge + * Port. Alternatively, consider configuring your device as a secondary + * Bridge Port. + */ + +/*? + * Text: "%s: The device is already a secondary Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A device cannot be configured as a primary or secondary + * Bridge Port if it is already configured as a secondary Bridge Port. + * User action: + * None, if you want the device to be a secondary Bridge Port. + * If you want to configure the device as the primary Bridge Port, + * withdraw the secondary role by writing 'none' to the 'bridgeport_role' + * sysfs attribute of the device. Then try again to configure the + * device as the primary Bridge Port. + */ + +/*? + * Text: "%s: The LAN cannot have more secondary Bridge Ports\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A LAN can have up to five secondary Bridge Ports. + * You cannot configure a further device as a secondary + * Bridge Port unless the Bridge Ports role is withdrawn from one of + * the existing secondary Bridge Ports. + * User action: + * Assure that the Bridge Port role is withdrawn from one of the + * existing secondary Bridge Ports before trying again to configure your + * device as a secondary Bridge Port. + */ + +/*? + * Text: "%s: The device is already a primary Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * A device cannot be configured as a primary or secondary + * Bridge Port if it is already configured as a primary Bridge Port. + * User action: + * None, if you want the device to be a primary Bridge Port. + * If you want to configure the device as a secondary Bridge Port, + * withdraw the primary role by writing 'none' to the 'bridgeport_role' + * sysfs attribute of the device. Then try again to configure the + * device as the secondary Bridge Port. + */ + +/*? + * Text: "%s: The device is not authorized to be a Bridge Port\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * The device cannot be configured as a Bridge Port because + * the required authorizations in the hardware are not in place. + * User action: + * See your hardware documentation about how to authorize + * ports for becoming a Bridge Port. + */ + +/*? + * Text: "%s: A Bridge Port is already configured by a different operating system\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * Linux instances cannot configure the target port as a Bridge Port. + * Another operating system already uses a Bridge Port on the HiperSockets + * or on the OSA adapter. For example, a z/VM instance might be using + * a port in a VSWITCH configuration. Multiple Bridge Ports on the same + * HiperSockets or OSA adapter must be configured by instances of the same + * operating system, for example, all Linux or all z/VM. + * User action: + * Reconsider your network topology. Configure Bridge Ports only for ports + * on adapters where any other Bridge Ports are configured by other Linux + * instances. + */ + +/*? + * Text: "%s: Setting address notification failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the qeth device + * Description: + * Enabling or disabling the address notification feature of a + * HiperSockets device failed. The device might not be configured as a + * Bridge Port. + * User action: + * None, unless you need address notifications for this device. + * If you need notifications, confirm that your device is attached to a + * HiperSockets LAN that supports Bridge Capable Ports and that your + * device is configured as a Bridge Port. If the 'bridgeport_role' + * sysfs attribute of the device contains, one of the values 'primary' + * or 'secondary' and you cannot set the address notification, contact + * your support organization. + */ + +/*? + * Text: "%s: Address notification from the Bridge Port stopped %s (%s)\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the qeth device + * @2: network interface name + * @3: error reported by the hardware + * Description: + * A Bridge Port no longer provides address notifications. + * Possible reasons include traffic overflow and that the device is no + * longer configured as a Bridge Port. A udev event with + * BRIDGEDHOST=abort was emitted to alert applications that rely on the + * address notifications. + * User action: + * None. + */ + +/*? + * Text: "%s: The qeth driver ran out of channel command buffers\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the qeth device + * Description: + * Command buffers can temporarily run out during periods of + * intense network configuration activities. + * The device driver recovers from this condition as outstanding + * commands are completed. + * User action: + * Wait for a short time. If the problem persists, + * initiate a recovery process by writing '1' to the 'recover' + * sysfs attribute of the device. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/s390dbf +++ linux-azure-4.11.0/Documentation/kmsg/s390/s390dbf @@ -0,0 +1,83 @@ +/*? + * Text: "Root becomes the owner of all s390dbf files in sysfs\n" + * Severity: Warning + * Description: + * The S/390 debug feature you are using only supports uid/gid = 0. + * User action: + * None. + */ + +/*? + * Text: "Registering debug feature %s failed\n" + * Severity: Error + * Parameter: + * @1: feature name + * Description: + * The initialization of an S/390 debug feature failed. A likely cause of this + * problem is memory constraints. The system keeps running, but the debug + * data for this feature will not be available in sysfs. + * User action: + * Consider assigning more memory to your LPAR or z/VM guest virtual machine. + */ + +/*? + * Text: "Registering view %s/%s would exceed the maximum number of views %i\n" + * Severity: Error + * Parameter: + * @1: feature name + * @2: view name + * @3: maximum + * Description: + * The maximum number of allowed debug feature views has been reached. The + * view has not been registered. The system keeps running but the new view + * will not be available in sysfs. This is a program error. + * User action: + * Report this problem to your support partner. + */ + +/*? + * Text: "%s is not a valid level for a debug feature\n" + * Severity: Warning + * Parameter: + * @1: level + * Description: + * Setting a new level for a debug feature by using the 'level' sysfs attribute + * failed. Valid levels are the minus sign (-) and the integers in the + * range 0 to 6. The minus sign switches off the feature. The numbers switch + * the feature on, where higher numbers produce more debug output. + * User action: + * Write a valid value to the 'level' sysfs attribute. + */ + +/*? + * Text: "Flushing debug data failed because %c is not a valid area\n" + * Severity: Informational + * Parameter: + * @1: debug area number + * Description: + * Flushing a debug area by using the 'flush' sysfs attribute failed. Valid + * values are the minus sign (-) for flushing all areas, or the number of the + * respective area for flushing a single area. + * User action: + * Write a valid area number or the minus sign (-) to the 'flush' sysfs + * attribute. + */ + +/*? + * Text: "Allocating memory for %i pages failed\n" + * Severity: Informational + * Parameter: + * @1: number of pages + * Description: + * Setting the debug feature size by using the 'page' sysfs attribute failed. + * Linux did not have enough memory for expanding the debug feature to the + * requested size. + * User action: + * Use a smaller number of pages for the debug feature or allocate more + * memory to your LPAR or z/VM guest virtual machine. + */ + +/*? Text: "%s: set new size (%i pages)\n" */ +/*? Text: "%s: switched off\n" */ +/*? Text: "%s: level %i is out of range (%i - %i)\n" */ +/*? Text: "Registering view %s/%s failed due to out of memory\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/sclp_cmd +++ linux-azure-4.11.0/Documentation/kmsg/s390/sclp_cmd @@ -0,0 +1,44 @@ +/*? Text: "sync request failed (cmd=0x%08x, status=0x%02x)\n" */ +/*? Text: "readcpuinfo failed (response=0x%04x)\n" */ +/*? Text: "configure cpu failed (cmd=0x%08x, response=0x%04x)\n" */ +/*? Text: "configure channel-path failed (cmd=0x%08x, response=0x%04x)\n" */ +/*? Text: "read channel-path info failed (response=0x%04x)\n" */ +/*? Text: "assign storage failed (cmd=0x%08x, response=0x%04x, rn=0x%04x)\n" */ +/*? Text: "configure PCI I/O adapter failed: cmd=0x%08x response=0x%04x\n" */ +/*? Text: "request failed (status=0x%02x)\n" */ +/*? Text: "request failed with response code 0x%x\n" */ + +/*? + * Text: "Memory hotplug state changed, suspend refused.\n" + * Severity: Error + * Description: + * Suspend is refused after a memory hotplug operation was performed. + * User action: + * The system needs to be restarted and no memory hotplug operation must be + * performed in order to allow suspend. + */ + +/*? + * Text: "Standby memory at 0x%llx (%lluM of %lluM usable)\n" + * Severity: Informational + * Parameter: + * @1: start address of standby memory + * @2: usable memory in MB + * @3: total detected memory in MB + * Description: + * Standby memory was detected. It can be used for memory hotplug only + * if it is aligned to the Linux hotplug memory block size. + * If the aligned amount of memory matches the total amount, + * all detected standby memory can be used. Otherwise, some of the detected + * memory is unaligned and cannot be used. + * User action: + * None, if the usable and the total amount of detected standby memory match. + * If the amounts of memory do not match, + * check the memory setup of your guest virtual machine and ensure that + * the standby memory start and end + * address is aligned to the Linux hotplug memory block size. + * On Linux, issue "cat /sys/devices/system/memory/block_size_bytes" + * to find the hotplug memory block size value in hexadecimal notation. + * On z/VM, query your memory setup with "vmcp q v store". + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/sclp_config +++ linux-azure-4.11.0/Documentation/kmsg/s390/sclp_config @@ -0,0 +1,15 @@ +/*? + * Text: "CPU capability may have changed\n" + * Severity: Informational + * Description: + * The capability of the CPUs in the configuration may have been upgraded + * or downgraded. This message may also appear if the capability of the + * CPUs in the configuration did not change. + * For details see the STORE SYSTEM INFORMATION description in the + * "Principles of Operation." + * User action: + * The user can examine /proc/sysinfo for CPU capability values. + */ +/*? Text: "Open for Business request failed with response code 0x%04x\n" */ +/*? Text: "SCLP receiver did not register to receive Configuration Management Data Events.\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/sclp_cpi +++ linux-azure-4.11.0/Documentation/kmsg/s390/sclp_cpi @@ -0,0 +1,3 @@ +/*? Text: "request failed (status=0x%02x)\n" */ +/*? Text: "request failed with response code 0x%x\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/sclp_ocf +++ linux-azure-4.11.0/Documentation/kmsg/s390/sclp_ocf @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/sclp_sdias +++ linux-azure-4.11.0/Documentation/kmsg/s390/sclp_sdias @@ -0,0 +1,4 @@ +/*? Text: "sclp_send failed for get_nr_blocks\n" */ +/*? Text: "SCLP error: %x\n" */ +/*? Text: "sclp_send failed: %x\n" */ +/*? Text: "Error from SCLP while copying hsa. Event status = %x\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/scm_block +++ linux-azure-4.11.0/Documentation/kmsg/s390/scm_block @@ -0,0 +1,51 @@ +/*? + * Text: "%lx: The capabilities of the SCM increment changed\n" + * Severity: Informational + * Parameter: + * @1: start address of the SCM increment + * Description: + * A configuration change is in progress for the storage class memory (SCM) + * increment. + * User action: + * Verify that the capability of the SCM increment is as intended; for + * example, with lsscm. + */ + +/*? + * Text: "An I/O operation to SCM failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * An error occurred during I/O to storage class memory (SCM). The operation + * was repeated, but the maximum number of retries was exceeded before the + * request could be fulfilled. + * User action: + * Contact your support organization. + */ + +/*? + * Text: "%lx: Write access to the SCM increment is suspended\n" + * Severity: Informational + * Parameter: + * @1: start address of the SCM increment + * Description: + * A concurrent firmware upgrade is in progress. For the duration of the + * upgrade, write access to the storage class memory (SCM) increment has been + * suspended. + * User action: + * None. + */ + +/*? + * Text: "%lx: Write access to the SCM increment is restored\n" + * Severity: Informational + * Parameter: + * @1: start address of the SCM increment + * Description: + * Write access to the storage class memory (SCM) increment was restored + * after a temporary suspension during a concurrent firmware upgrade. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/setup +++ linux-azure-4.11.0/Documentation/kmsg/s390/setup @@ -0,0 +1,165 @@ +/*? + * Text: "The initial RAM disk does not fit into the memory\n" + * Severity: Error + * Description: + * The load address and the size of the initial RAM disk specify a memory + * area that is not available. + * User action: + * Lower the load address of the initial RAM disk, reduce the size of the + * initial RAM disk, or increase the size of the system memory to make the + * initial RAM disk fit into the memory. + */ + +/*? + * Text: "The maximum memory size is %luMB\n" + * Severity: Notice + * Parameter: + * @1: size in MB + * Description: + * The system memory size cannot exceed the amount of memory that is + * provided by the real or virtual hardware. It can be further reduced + * through an upper memory address limit that is specified with the + * mem= kernel parameter. + * User action: + * None. + */ + +/*? + * Text: "Linux is running as a z/VM guest operating system in 31-bit mode\n" + * Severity: Informational + * Description: + * The 31-bit Linux kernel detected that it is running as a guest operating + * system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Linux is running natively in 31-bit mode\n" + * Severity: Informational + * Description: + * The 31-bit Linux kernel detected that it is running on an IBM mainframe, + * either as the sole operating system in an LPAR or as the sole operating + * system on the entire mainframe. The Linux kernel is not running as a + * guest operating system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "The hardware system has IEEE compatible floating point units\n" + * Severity: Informational + * Description: + * The Linux kernel detected that it is running on a hardware system with + * CPUs that have IEEE compatible floating point units. + * User action: + * None. + */ + +/*? + * Text: "The hardware system has no IEEE compatible floating point units\n" + * Severity: Informational + * Description: + * The Linux kernel detected that it is running on a hardware system with + * CPUs that do not have IEEE compatible floating point units. + * User action: + * None. + */ + +/*? + * Text: "Linux is running as a z/VM guest operating system in 64-bit mode\n" + * Severity: Informational + * Description: + * The 64-bit Linux kernel detected that it is running as a guest operating + * system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Linux is running under KVM in 64-bit mode\n" + * Severity: Informational + * Description: + * The 64-bit Linux kernel detected that it is running as a guest operating + * system of the KVM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Linux is running natively in 64-bit mode\n" + * Severity: Informational + * Description: + * The 64-bit Linux kernel detected that it is running on an IBM mainframe, + * either as the sole operating system in an LPAR or as the sole operating + * system on the entire mainframe. The Linux kernel is not running as a + * guest operating system of the z/VM hypervisor. + * User action: + * None. + */ + +/*? + * Text: "Defining the Linux kernel NSS failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The Linux kernel could not define the named saved system (NSS) with + * the z/VM CP DEFSYS command. The return code represents the numeric + * portion of the CP DEFSYS error message. + * User action: + * For return code 1, the z/VM guest virtual machine is not authorized + * to define named saved systems. + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP DEFSYS command (typically privilege class E). + * For other return codes, see the help and message documentation for + * the CP DEFSYS command. + */ + +/*? + * Text: "Saving the Linux kernel NSS failed with rc=%d\n" + * Severity: Error + * Parameter: + * @1: return code + * Description: + * The Linux kernel could not save the named saved system (NSS) with + * the z/VM CP SAVESYS command. The return code represents the numeric + * portion of the CP SAVESYS error message. + * User action: + * For return code 1, the z/VM guest virtual machine is not authorized + * to save named saved systems. + * Ensure that the z/VM guest virtual machine is authorized to issue + * the CP SAVESYS command (typically privilege class E). + * For other return codes, see the help and message documentation for + * the CP SAVESYS command. + */ + +/*? + * Text: "crashkernel reservation failed: %s\n" + * Severity: Informational + * Parameter: + * @1: reason string + * Description: + * The memory reservation for the kdump "crashkernel" parameter was not + * successful. The Linux kernel was either not able to find a free memory + * area or an invalid area has been defined. The reason string describes the + * cause of the failure in more detail. + * User action: + * Increase the memory footprint of your virtual machine or adjust the values + * for the "crashkernel" kernel parameter. Then boot your Linux system again. + */ + +/*? + * Text: "Reserving %lluMB of memory at %lluMB for crashkernel (System RAM: %luMB)\n" + * Severity: Informational + * Parameter: + * @1: amount of reserved memory + * @2: storage location of reserved memory + * @3: amount of system RAM + * Description: + * The memory reservation for the kdump "crashkernel" parameter was successful + * and a kdump kernel can now be loaded with the kexec tool. + * User action: + * None. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/smsgiucv +++ linux-azure-4.11.0/Documentation/kmsg/s390/smsgiucv @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/smsgiucv_app +++ linux-azure-4.11.0/Documentation/kmsg/s390/smsgiucv_app @@ -0,0 +1 @@ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/tape +++ linux-azure-4.11.0/Documentation/kmsg/s390/tape @@ -0,0 +1,63 @@ +/*? + * Text: "%s: A tape unit was detached while in use\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A tape unit has been detached from the I/O configuration while a tape + * was being accessed. This typically results in I/O error messages and + * potentially in damaged data on the tape. + * User action: + * Check the output of the application that accesses the tape device. + * If this problem occurred during a write-type operation, consider repeating + * the operation after bringing the tape device back online. + */ + +/*? + * Text: "%s: A tape cartridge has been mounted\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the tape device + * Description: + * A tape cartridge has been inserted into the tape unit. The tape in the + * tape unit is ready to be accessed. + * User action: + * None. + */ + +/*? + * Text: "%s: The tape cartridge has been successfully unloaded\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape cartridge has been unloaded from the tape unit. Insert a tape + * cartridge before accessing the tape device. + * User action: + * None. + */ + +/*? + * Text: "A cartridge is loaded in tape device %s, refusing to suspend\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * Description: + * A request to suspend a tape device currently loaded with a cartridge is + * rejected. + * User action: + * Unload the tape device. Then try to suspend the system again. + */ + +/*? + * Text: "Tape device %s is busy, refusing to suspend\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * Description: + * A request to suspend a tape device being currently in use is rejected. + * User action: + * Terminate applications performing tape operations + * and then try to suspend the system again. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/tape_34xx +++ linux-azure-4.11.0/Documentation/kmsg/s390/tape_34xx @@ -0,0 +1,418 @@ +/*? + * Text: "%s: An unexpected condition %d occurred in tape error recovery\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * @2: number + * Description: + * The control unit has reported an error condition that is not recognized by + * the error recovery process of the tape device driver. + * User action: + * Report this problem and the condition number from the message to your + * support organization. + */ + +/*? + * Text: "%s: A data overrun occurred between the control unit and tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A data overrun error has occurred on the connection between the control + * unit and the tape unit. If this problem occurred during a write-type + * operation, the integrity of the data on the tape might be compromised. + * User action: + * Use a faster connection. If this problem occurred during a write-type + * operation, consider repositioning the tape and repeating the operation. + */ + +/*? + * Text: "%s: The block ID sequence on the tape is incorrect\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The control unit has detected an incorrect block ID sequence on the tape. + * This problem typically indicates that the data on the tape is damaged. + * User action: + * If this problem occurred during a write-type operation reposition the tape + * and repeat the operation. + */ + +/*? + * Text: "%s: A read error occurred that cannot be recovered\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A read error has occurred that cannot be recovered. The current tape might + * be damaged. + * User action: + * None. + */ + +/*? + * Text: "%s: A write error on the tape cannot be recovered\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A write error has occurred that could not be recovered by the automatic + * error recovery process. + * User action: + * Use a different tape cartridge. + */ + +/*? + * Text: "%s: Writing the ID-mark failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The ID-mark at the beginning of tape could not be written. The tape medium + * might be write-protected. + * User action: + * Try a different tape cartridge. Ensure that the write-protection on the + * cartridge is switched off. + */ + +/*? + * Text: "%s: Reading the tape beyond the end of the recorded area failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A read-type operation failed because it extended beyond the end of the + * recorded area on the tape medium. + * User action: + * None. + */ + +/*? + * Text: "%s: The tape contains an incorrect block ID sequence\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The control unit has detected an incorrect block ID sequence on the tape. + * This problem typically indicates that the data on the tape is damaged. + * User action: + * If this problem occurred during a write-type operation reposition the tape + * and repeat the operation. + */ + +/*? + * Text: "%s: A path equipment check occurred for the tape device\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A path equipment check has occurred. This check indicates problems with the + * connection between the mainframe system and the tape control unit. + * User action: + * Ensure that the cable connections between the mainframe system and the + * control unit are securely in place and not damaged. + */ + +/*? + * Text: "%s: The tape unit cannot process the tape format\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * Either the tape unit is not able to read the format ID mark, or the + * specified format is not supported by the tape unit. + * User action: + * If you do not need the data recorded on the current tape, use a different + * tape or write a new format ID mark at the beginning of the tape. Be aware + * that writing a new ID mark leads to a loss of all data that has been + * recorded on the tape. If you need the data on the current tape, use a tape + * unit that supports the tape format. + */ + +/*? + * Text: "%s: The tape medium is write-protected\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A write-type operation failed because the tape medium is write-protected. + * User action: + * Eject the tape cartridge, switch off the write protection on the cartridge, + * insert the cartridge, and try the operation again. + */ + +/*? + * Text: "%s: The tape does not have the required tape tension\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape does not have the required tape tension. + * User action: + * Rewind and reposition the tape, then repeat the operation. + */ + +/*? + * Text: "%s: The tape unit failed to load the cartridge\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An error has occurred while loading the tape cartridge. + * User action: + * Unload the cartridge and load it again. + */ + +/*? + * Text: "%s: Automatic unloading of the tape cartridge failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit failed to unload the cartridge. + * User action: + * Unload the cartridge manually by using the eject button on the tape unit. + */ + +/*? + * Text: "%s: An equipment check has occurred on the tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * Possible reasons for the check condition are a unit adapter error, a buffer + * error on the lower interface, an unusable internal path, or an error that + * has occurred while loading the cartridge. + * User action: + * Examine the tape unit and the cartridge loader. Consult the tape unit + * documentation for details. + */ + +/*? + * Text: "%s: The tape information states an incorrect length\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape is shorter than stated at the beginning of the tape data. A + * possible reason for this problem is that the tape might have been physically + * truncated. Data written to the tape might be incomplete or damaged. + * User action: + * If this problem occurred during a write-type operation, consider repeating + * the operation with a different tape cartridge. + */ + +/*? + * Text: "%s: The tape unit is not ready\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit is online but not ready. + * User action: + * Turn the ready switch on the tape unit to the ready position and try the + * operation again. + */ + +/*? + * Text: "%s: The tape medium has been rewound or unloaded manually\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit rewind button, unload button, or both have been used to + * rewind or unload the tape cartridge. A tape cartridge other than the + * intended cartridge might have been inserted or the tape medium might not + * be at the expected position. + * User action: + * Verify that the correct tape cartridge has been inserted and that the tape + * medium is at the required position before continuing to work with the tape. + */ + +/*? + * Text: "%s: The tape subsystem is running in degraded mode\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape subsystem is not operating at its maximum performance. + * User action: + * Contact your service representative for the tape unit and report this + * problem. + */ + +/*? + * Text: "%s: The tape unit is already assigned\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit is already assigned to another channel path. + * User action: + * Free the tape unit from the operating system instance to which it is + * currently assigned then try again. + */ + +/*? + * Text: "%s: The tape unit is not online\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit is not online to the tape device driver. + * User action: + * Ensure that the tape unit is operational and that the cable connections + * between the control unit and the tape unit are securely in place and not + * damaged. + */ + +/*? + * Text: "%s: The control unit has fenced access to the tape volume\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The control unit fences further access to the current tape volume. The data + * integrity on the tape volume might have been compromised. + * User action: + * Rewind and unload the tape cartridge. + */ + +/*? + * Text: "%s: A parity error occurred on the tape bus\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * A data parity check error occurred on the bus. Data that was read or written + * while the error occurred is not valid. + * User action: + * Reposition the tape and repeat the read-type or write-type operation. + */ + +/*? + * Text: "%s: I/O error recovery failed on the tape control unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An I/O error occurred that cannot be recovered by the automatic error + * recovery process of the tape control unit. The application that operates + * the tape unit will receive a return value of -EIO which indicates an + * I/O error. The data on the tape might be damaged. + * User action: + * If this problem occurred during a write-type operation, consider + * repositioning the tape and repeating the operation. + */ + +/*? + * Text: "%s: The tape unit requires a firmware update\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit requires firmware patches from the tape control unit but the + * required patches are not available on the control unit. + * User action: + * Make the require patches available on the control unit then reposition the + * tape and retry the operation. For details about obtaining and installing + * firmware updates see the control unit documentation. + */ + +/*? + * Text: "%s: The maximum block size for buffered mode is exceeded\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The block to be written is larger than allowed for the buffered mode. + * User action: + * Use a smaller block size. + */ + +/*? + * Text: "%s: A channel interface error cannot be recovered\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An error has occurred on the channel interface. This error cannot + * be recovered by the control unit error recovery process. + * User action: + * See the documentation of the control unit. + */ + +/*? + * Text: "%s: A channel protocol error occurred\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * An error was detected in the channel protocol. + * User action: + * Reposition the tape and try the operation again. + */ + +/*? + * Text: "%s: The tape unit does not support the compaction algorithm\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit cannot read the current tape. The data on the tape has been + * compressed with an algorithm that is not supported by the tape unit. + * User action: + * Use a tape unit that supports the compaction algorithm used for the + * current tape. + */ + +/*? + * Text: "%s: The tape unit does not support tape format 3480-2 XF\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit does not support tapes recorded in the 3480-2 XF format. + * User action: + * If you do not need the data recorded on the current tape, rewind the tape + * and overwrite it with a supported format. If you need the data on the + * current tape, use a tape unit that supports the tape format. + */ + +/*? + * Text: "%s: The tape unit does not support format 3480 XF\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit does not support tapes recorded in the 3480 XF format. + * User action: + * If you do not need the data recorded on the current tape, rewind the tape + * and overwrite it with a supported format. If you need the data on the + * current tape, use a tape unit that supports the tape format. + */ + +/*? + * Text: "%s: The tape unit does not support the current tape length\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The length of the tape in the cartridge is incompatible with the tape unit. + * User action: + * Either use a different tape unit or use a tape with a supported length. + */ + +/*? + * Text: "%s: The tape unit does not support the tape length\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The length of the tape in the cartridge is incompatible with the tape + * unit. + * User action: + * Either use a different tape unit or use a tape with a supported length. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/tape_3590 +++ linux-azure-4.11.0/Documentation/kmsg/s390/tape_3590 @@ -0,0 +1,183 @@ +/*? + * Text: "%s: The tape medium must be loaded into a different tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape device has indicated an error condition that requires loading + * the tape cartridge into a different tape unit to recover. + * User action: + * Unload the cartridge and use a different tape unit to retry the operation. + */ + +/*? + * Text: "%s: Tape media information: exception %s, service %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: exception + * @3: service + * Description: + * This is an operating system independent tape medium information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: Device subsystem information: exception %s, service %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: exception + * @3: required service action + * Description: + * This is an operating system independent device subsystem information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: I/O subsystem information: exception %s, service %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: exception + * @3: required service action + * Description: + * This is an operating system independent I/O subsystem information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit has issued sense message %s\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: sense message code + * Description: + * The tape unit has issued an operating system independent sense message. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit has issued an unknown sense message code 0x%x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: code + * Description: + * The tape device driver has received an unknown sense message from the + * tape unit. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: MIM SEV=%i, MC=%02x, ES=%x/%x, RC=%02x-%04x-%02x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: SEV + * @3: message code + * @4: exception + * @5: required service action + * @6: refcode + * @7: mid + * @8: fid + * Description: + * This is an operating system independent information message that was + * issued by the tape unit. The information in the message is intended for + * the IBM customer engineer. + * User action: + * See to the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: IOSIM SEV=%i, DEVTYPE=3590/%02x, MC=%02x, ES=%x/%x, REF=0x%04x-0x%04x-0x%04x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: SEV + * @3: model + * @4: message code + * @5: exception + * @6: required service action + * @7: refcode1 + * @8: refcode2 + * @9: refcode3 + * Description: + * This is an operating system independent I/O subsystem information message + * that was issued by the tape unit. The information in the message is + * intended for the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: DEVSIM SEV=%i, DEVTYPE=3590/%02x, MC=%02x, ES=%x/%x, REF=0x%04x-0x%04x-0x%04x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: SEV + * @3: model + * @4: message code + * @5: exception + * @6: required service action + * @7: refcode1 + * @8: refcode2 + * @9: refcode3 + * Description: + * This is an operating system independent device subsystem information message + * issued by the tape unit. The information in the message is intended for + * the IBM customer engineer. + * User action: + * See the documentation for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit has issued an unknown sense message code %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * @2: code + * Description: + * The tape device has issued a sense message, that is unknown to the device + * driver. + * User action: + * Use the message code printed as hexadecimal value and see the documentation + * for the tape unit for further information. + */ + +/*? + * Text: "%s: The tape unit failed to obtain the encryption key from EKM\n" + * Severity: Error + * Parameter: + * @1: bus ID of the tape device + * Description: + * The tape unit was unable to retrieve the encryption key required to decode + * the data on the tape from the enterprise key manager (EKM). + * User action: + * See the EKM and tape unit documentation for information about how to enable + * the tape unit to retrieve the encryption key. + */ + +/*? + * Text: "%s: A different host has privileged access to the tape unit\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the tape device + * Description: + * You cannot access the tape unit because a different operating system + * instance has privileged access to the unit. + * User action: + * Unload the current cartridge to solve this problem. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/time +++ linux-azure-4.11.0/Documentation/kmsg/s390/time @@ -0,0 +1,36 @@ +/*? + * Text: "The ETR interface has adjusted the clock by %li microseconds\n" + * Severity: Notice + * Parameter: + * @1: number of microseconds + * Description: + * The external time reference (ETR) interface has synchronized the system + * clock with the external reference and set it to a new value. The time + * difference between the old and new clock value has been passed to the + * network time protocol (NTP) as a single shot adjustment. + * User action: + * None. + */ + +/*? + * Text: "The real or virtual hardware system does not provide an ETR interface\n" + * Severity: Warning + * Description: + * The 'etr=' parameter has been passed on the kernel parameter line for + * a Linux instance that does not have access to the external time reference + * (ETR) facility. + * User action: + * To avoid this warning remove the 'etr=' kernel parameter. + */ + +/*? + * Text: "The real or virtual hardware system does not provide an STP interface\n" + * Severity: Warning + * Description: + * The 'stp=' parameter has been passed on the kernel parameter line for + * a Linux instance that does not have access to the server time protocol + * (STP) facility. + * User action: + * To avoid this warning remove the 'stp=' kernel parameter. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/vmlogrdr +++ linux-azure-4.11.0/Documentation/kmsg/s390/vmlogrdr @@ -0,0 +1,19 @@ +/*? Text: "vmlogrdr: failed to start recording automatically\n" */ +/*? Text: "vmlogrdr: connection severed with reason %i\n" */ +/*? Text: "vmlogrdr: iucv connection to %s failed with rc %i \n" */ +/*? Text: "vmlogrdr: failed to stop recording automatically\n" */ +/*? Text: "not running under VM, driver not loaded.\n" */ + +/*? + * Text: "vmlogrdr: device %s is busy. Refuse to suspend.\n" + * Severity: Error + * Parameter: + * @1: device name + * Description: + * Suspending vmlogrdr devices that are in uses is not supported. + * A request to suspend such a device is refused. + * User action: + * Close all applications that use any of the vmlogrdr devices + * and then try to suspend the system again. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/vmur +++ linux-azure-4.11.0/Documentation/kmsg/s390/vmur @@ -0,0 +1,48 @@ +/*? + * Text: "The %s cannot be loaded without z/VM\n" + * Severity: Error + * Parameter: + * @1: z/VM virtual unit record device driver + * Description: + * The z/VM virtual unit record device driver provides Linux with access to + * z/VM virtual unit record devices like punch card readers, card punches, and + * line printers. On Linux instances that run in environments other than the + * z/VM hypervisor, the device driver does not provide any useful function and + * the corresponding vmur module cannot be loaded. + * User action: + * Load the vmur module only on Linux instances that run as guest operating + * systems of the z/VM hypervisor. If the z/VM virtual unit record device + * has been compiled into the kernel, ignore this message. + */ + +/*? + * Text: "Kernel function alloc_chrdev_region failed with error code %d\n" + * Severity: Error + * Parameter: + * @1: error code according to errno definitions + * Description: + * The z/VM virtual unit record device driver (vmur) needs to register a range + * of character device minor numbers from 0x0000 to 0xffff. + * This registration failed, probably because of memory constraints. + * User action: + * Free some memory and reload the vmur module. If the z/VM virtual unit + * record device driver has been compiled into the kernel reboot Linux. + * Consider assigning more memory to your LPAR or z/VM guest virtual machine. + */ + +/*? + * Text: "Unit record device %s is busy, %s refusing to suspend.\n" + * Severity: Error + * Parameter: + * @1: bus ID of the unit record device + * @1: z/VM virtual unit record device driver + * Description: + * Linux cannot be suspended while a unit record device is in use. + * User action: + * Stop all applications that work on z/VM spool file queues, for example, the + * vmur tool. Then try again to suspend Linux. + */ + +/*? Text: "%s loaded.\n" */ +/*? Text: "%s unloaded.\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/xpram +++ linux-azure-4.11.0/Documentation/kmsg/s390/xpram @@ -0,0 +1,74 @@ +/*? + * Text: "%d is not a valid number of XPRAM devices\n" + * Severity: Error + * Parameter: + * @1: number of partitions + * Description: + * The number of XPRAM partitions specified for the 'devs' module parameter + * or with the 'xpram.parts' kernel parameter must be an integer in the + * range 1 to 32. The XPRAM device driver created a maximum of 32 partitions + * that are probably not configured as intended. + * User action: + * If the XPRAM device driver has been compiled as a separate module, + * unload the module and load it again with a correct value for the 'devs' + * module parameter. If the XPRAM device driver has been compiled + * into the kernel, correct the 'xpram.parts' parameter in the kernel + * command line and restart Linux. + */ + +/*? + * Text: "Not enough expanded memory available\n" + * Severity: Error + * Description: + * The amount of expanded memory required to set up your XPRAM partitions + * depends on the 'sizes' parameter specified for the xpram module or on + * the specifications for the 'xpram.parts' parameter if the XPRAM device + * driver has been compiled into the kernel. Your + * current specification exceed the amount of available expanded memory. + * Your XPRAM partitions are probably not configured as intended. + * User action: + * If the XPRAM device driver has been compiled as a separate module, + * unload the xpram module and load it again with an appropriate value + * for the 'sizes' module parameter. If the XPRAM device driver has been + * compiled into the kernel, adjust the 'xpram.parts' parameter in the + * kernel command line and restart Linux. If you need more than the + * available expanded memory, increase the expanded memory allocation for + * your virtual hardware or LPAR. + */ + +/*? + * Text: "No expanded memory available\n" + * Severity: Error + * Description: + * The XPRAM device driver has been loaded in a Linux instance that runs + * in an LPAR or virtual hardware without expanded memory. + * No XPRAM partitions are created. + * User action: + * Allocate expanded memory for your LPAR or virtual hardware or do not + * load the xpram module. You can ignore this message, if you do not want + * to create XPRAM partitions. + */ + +/*? + * Text: "Resuming the system failed: %s\n" + * Severity: Error + * Parameter: + * @1: cause of the failure + * Description: + * A system cannot be resumed if the expanded memory setup changes + * after hibernation. Possible reasons for the failure are: + * - Expanded memory was removed after hibernation. + * - Size of the expanded memory changed after hibernation. + * The system is stopped with a kernel panic. + * User action: + * Reboot Linux. + */ + +/*? Text: " number of devices (partitions): %d \n" */ +/*? Text: " size of partition %d: %u kB\n" */ +/*? Text: " size of partition %d to be set automatically\n" */ +/*? Text: " memory needed (for sized partitions): %lu kB\n" */ +/*? Text: " partitions to be sized automatically: %d\n" */ +/*? Text: " automatically determined partition size: %lu kB\n" */ +/*? Text: " %u pages expanded memory found (%lu KB).\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/zcrypt +++ linux-azure-4.11.0/Documentation/kmsg/s390/zcrypt @@ -0,0 +1,22 @@ +/*? + * Text: "Cryptographic device %02x.%04x failed and was set offline\n" + * Severity: Error + * Parameter: + * @1: AP device ID + * @2: AP queue + * Description: + * A cryptographic device failed to process a cryptographic request. + * The cryptographic device driver could not correct the error and + * set the device offline. The application that issued the + * request received an indication that the request has failed. + * User action: + * Use the lszcrypt command to confirm that the cryptographic + * hardware is still configured to your LPAR or z/VM guest virtual + * machine. If the device is available to your Linux instance the + * command output contains a line that begins with 'card', + * where is the two-digit decimal number in the message text. + * After ensuring that the device is available, use the chzcrypt command to + * set it online again. + * If the error persists, contact your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/zdump +++ linux-azure-4.11.0/Documentation/kmsg/s390/zdump @@ -0,0 +1,27 @@ +/*? + * Text: "The 32-bit dump tool cannot be used for a 64-bit system\n" + * Severity: Alert + * Description: + * The dump process ends without creating a system dump. + * User action: + * Use a 64-bit dump tool to obtain a system dump for 64-bit Linux instance. + */ +/*? + * Text: "The 64-bit dump tool cannot be used for a 32-bit system\n" + * Severity: Alert + * Description: + * The dump process ends without creating a system dump. + * User action: + * Use a 32-bit dump tool to obtain a system dump for 32-bit Linux instance. + */ +/*? + * Text: "The dump process started for a 64-bit operating system\n" + * Severity: Alert + * Description: + * The SCSI dump process started to create a dump for a 64-bit operating + * system instance. + * User action: + * None. + */ +/*? Text: "0x%x is an unknown architecture.\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/zfcp +++ linux-azure-4.11.0/Documentation/kmsg/s390/zfcp @@ -0,0 +1,709 @@ +/*? + * Text: "%s is not a valid SCSI device\n" + * Severity: Error + * Parameter: + * @1: device specification + * Description: + * The specification for an initial SCSI device provided with the 'zfcp.device' + * kernel parameter or with the 'device' module parameter is syntactically + * incorrect. The specified SCSI device could not be attached to the Linux + * system. + * User action: + * Correct the value for the 'zfcp.device' or 'device' parameter and reboot + * Linux. See "Device Drivers, Features, and Commands" for information about + * the syntax. + */ + +/*? + * Text: "The zfcp device driver could not register with the common I/O layer\n" + * Severity: Error + * Description: + * The device driver initialization failed. A possible cause of this problem is + * memory constraints. + * User action: + * Free some memory and try again to load the zfcp device driver. If the zfcp + * device driver has been compiled into the kernel, reboot Linux. Consider + * assigning more memory to your LPAR or z/VM guest virtual machine. If the + * problem persists, contact your support organization. + */ + +/*? + * Text: "%s: Setting up data structures for the FCP adapter failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The zfcp device driver could not allocate data structures for an FCP adapter. + * A possible reason for this problem is memory constraints. + * User action: + * Set the FCP adapter offline or detach it from the Linux system, free some + * memory and set the FCP adapter online again or attach it again. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: The FCP device is operational again\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * An FCP device has been unavailable because it had been detached from the + * Linux system or because the corresponding CHPID was offline. The FCP device + * is now available again and the zfcp device driver resumes all operations to + * the FCP device. + * User action: + * None. + */ + +/*? + * Text: "%s: The CHPID for the FCP device is offline\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The CHPID for an FCP device has been set offline, either logically in Linux + * or on the hardware. + * User action: + * Find out which CHPID corresponds to the FCP device, for example, with the + * lscss command. Check if the CHPID has been set logically offline in sysfs. + * Write 'on' to the CHPID's status attribute to set it online. If the CHPID is + * online in sysfs, find out if it has been varied offline through a hardware + * management interface, for example the service element (SE). + */ + +/*? + * Text: "%s: The FCP device has been detached\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * An FCP device is no longer available to Linux. + * User action: + * Ensure that the FCP adapter is operational and attached to the LPAR or z/VM + * virtual machine. + */ + +/*? + * Text: "%s: The FCP device did not respond within the specified time\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The common I/O layer waited for a response from the FCP adapter but + * no response was received within the specified time limit. This might + * indicate a hardware problem. + * User action: + * Consult your hardware administrator. If this problem persists, + * gather Linux debug data, collect the FCP adapter hardware logs, and + * report the problem to your support organization. + */ + +/*? + * Text: "%s: Registering the FCP device with the SCSI stack failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter could not be registered with the Linux SCSI + * stack. A possible reason for this problem is memory constraints. + * User action: + * Set the FCP adapter offline or detach it from the Linux system, free some + * memory and set the FCP adapter online again or attach it again. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: ERP cannot recover an error on the FCP device\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * An error occurred on an FCP device. The error recovery procedure (ERP) + * could not resolve the error. The FCP device driver cannot use the FCP device. + * User action: + * Check for previous error messages for the same FCP device to find the + * cause of the problem. + */ + +/*? + * Text: "%s: Creating an ERP thread for the FCP device failed.\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The zfcp device driver could not set up error recovery procedure (ERP) + * processing for the FCP device. The FCP device is not available for use + * in Linux. + * User action: + * Free some memory and try again to load the zfcp device driver. If the zfcp + * device driver has been compiled into the kernel, reboot Linux. Consider + * assigning more memory to your LPAR or z/VM guest virtual machine. If the + * problem persists, contact your support organization. + */ + +/*? + * Text: "%s: ERP failed for LUN 0x%016Lx on port 0x%016Lx\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: LUN + * @3: WWPN + * Description: + * An error occurred on the SCSI device at the specified LUN. The error recovery + * procedure (ERP) could not resolve the error. The SCSI device is not + * available. + * User action: + * Verify that the LUN is correct. Check the fibre channel fabric for errors + * related to the specified WWPN and LUN, the storage server, and Linux. + */ + +/*? + * Text: "%s: ERP failed for remote port 0x%016Lx\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: WWPN + * Description: + * An error occurred on a remote port. The error recovery procedure (ERP) + * could not resolve the error. The port is not available. + * User action: + * Verify that the WWPN is correct and check the fibre channel fabric for + * errors related to the WWPN. + */ + +/*? + * Text: "%s: Registering port 0x%016Lx failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: WWPN + * Description: + * The Linux kernel could not allocate enough memory to register the + * remote port with the indicated WWPN with the SCSI stack. The remote + * port is not available. + * User action: + * Free some memory and trigger the rescan for ports. + */ + +/*? + * Text: "%s: A QDIO problem occurred\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * QDIO reported a problem to the zfcp device driver. The zfcp device driver + * tries to recover this problem. + * User action: + * Check for related error messages. If this problem occurs frequently, gather + * Linux debug data and contact your support organization. + */ + +/*? + * Text: "%s: Setting up the QDIO connection to the FCP adapter failed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The zfcp device driver failed to establish a QDIO connection with the FCP + * adapter. + * User action: + * Set the FCP adapter offline or detach it from the Linux system, free some + * memory and set the FCP adapter online again or attach it again. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: The FCP adapter reported a problem that cannot be recovered\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter has a problem that cannot be recovered by the zfcp device + * driver. The zfcp device driver stopped using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: There is a wrap plug instead of a fibre channel cable\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter is not physically connected to the fibre channel fabric. + * User action: + * Remove the wrap plug from the FCP adapter and connect the adapter with the + * fibre channel fabric. + */ + +/*? + * Text: "%s: FCP device not operational because of an unsupported FC class\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter hardware does not support the fibre channel service class + * requested by the zfcp device driver. This problem indicates a program error + * in the zfcp device driver. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: 0x%Lx is an ambiguous request identifier\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: request ID + * Description: + * The FCP adapter reported that it received the same request ID twice. This is + * an error. The zfcp device driver stopped using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: QTCB version 0x%x not supported by FCP adapter (0x%x to 0x%x)\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: requested version + * @3: lowest supported version + * @4: highest supported version + * Description: + * See message text. + * The queue transfer control block (QTCB) version requested by the zfcp device + * driver is not supported by the FCP adapter hardware. + * User action: + * If the requested version is higher than the highest version supported by the + * hardware, install more recent firmware on the FCP adapter. If the requested + * version is lower then the lowest version supported by the hardware, upgrade + * to a Linux level with a more recent zfcp device driver. + */ + +/*? + * Text: "%s: The FCP adapter could not log in to the fibre channel fabric\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The fibre channel switch rejected the login request from the FCP adapter. + * User action: + * Check the fibre channel fabric or switch logs for possible errors. + */ + +/*? + * Text: "%s: The FCP device is suspended because of a firmware update\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP device is not available while a firmware update is in progress. This + * problem is temporary. The FCP device will resume operations when the + * firmware update is completed. + * User action: + * Wait 10 seconds and try the operation again. + */ + +/*? + * Text: "%s: All NPIV ports on the FCP adapter have been assigned\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The number of N_Port ID Virtualization (NPIV) ports that can be assigned + * on an FCP adapter is limited. Once assigned, NPIV ports are not released + * automatically but have to be released explicitly through the support + * element (SE). + * User action: + * Identify NPIV ports that have been assigned but are no longer in use and + * release them from the SE. + */ + +/*? + * Text: "%s: The link between the FCP adapter and the FC fabric is down\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter is not usable. Specific error information is not available. + * User action: + * Check the cabling and the fibre channel fabric configuration. If this + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: The QTCB type is not supported by the FCP adapter\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The queue transfer control block (QTCB) type requested by the zfcp device + * driver is not supported by the FCP adapter hardware. + * User action: + * Install the latest firmware on your FCP adapter hardware. If this does not + * resolve the problem, upgrade to a Linux level with a more recent zfcp device + * driver. If the problem persists, contact your support organization. + */ + +/*? + * Text: "%s: The error threshold for checksum statistics has been exceeded\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter has reported a large number of bit errors. This might + * indicate a problem with the physical components of the fibre channel fabric. + * Details about the errors have been written to the HBA trace for the FCP + * adapter. + * User action: + * Check for problems in the fibre channel fabric and ensure that all cables + * are properly plugged. + */ + +/*? + * Text: "%s: The local link has been restored\n" + * Severity: Informational + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * A problem with the connection between the FCP adapter and the adjacent node + * on the fibre channel fabric has been resolved. The FCP adapter is now + * available again. + * User action: + * None. + */ + +/*? + * Text: "%s: The mode table on the FCP adapter has been damaged\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * This is an FCP adapter hardware problem. + * User action: + * Report this problem with FCP hardware logs to IBM support. + */ + +/*? + * Text: "%s: The adjacent fibre channel node does not support FCP\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The fibre channel switch or storage system that is connected to the FCP + * channel does not support the fibre channel protocol (FCP). The zfcp + * device driver stopped using the FCP device. + * User action: + * Check the adjacent fibre channel node. + */ + +/*? + * Text: "%s: The FCP adapter does not recognize the command 0x%x\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: command + * Description: + * A command code that was sent from the zfcp device driver to the FCP adapter + * is not valid. The zfcp device driver stopped using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: There is no light signal from the local fibre channel cable\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * There is no signal on the fibre channel cable that connects the FCP adapter + * to the fibre channel fabric. + * User action: + * Ensure that the cable is in place and connected properly to the FCP adapter + * and to the adjacent fibre channel switch or storage system. + */ + +/*? + * Text: "%s: The WWPN assignment file on the FCP adapter has been damaged\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * This is an FCP adapter hardware problem. + * User action: + * Report this problem with FCP hardware logs to IBM support. + */ + +/*? + * Text: "%s: The FCP device detected a WWPN that is duplicate or not valid\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * This condition indicates an error in the FCP adapter hardware or in the z/VM + * hypervisor. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to IBM support. + */ + +/*? + * Text: "%s: The fibre channel fabric does not support NPIV\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP adapter requires N_Port ID Virtualization (NPIV) from the adjacent + * fibre channel node. Either the FCP adapter is connected to a fibre channel + * switch that does not support NPIV or the FCP adapter tries to use NPIV in a + * point-to-point setup. The connection is not operational. + * User action: + * Verify that NPIV is correctly used for this connection. Check the FCP adapter + * configuration and the fibre channel switch configuration. If necessary, + * update the fibre channel switch firmware. + */ + +/*? + * Text: "%s: The FCP adapter cannot support more NPIV ports\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * N_Port ID Virtualization (NPIV) ports consume physical resources on the FCP + * adapter. The FCP adapter resources are exhausted. The connection is not + * operational. + * User action: + * Analyze the number of available NPIV ports and which operating system + * instances use them. If necessary, reconfigure your setup to move some + * NPIV ports to an FCP adapter with free resources. + */ + +/*? + * Text: "%s: The adjacent switch cannot support more NPIV ports\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * N_Port ID Virtualization (NPIV) ports consume physical resources. The + * resources of the fibre channel switch that is connected to the FCP adapter + * are exhausted. The connection is not operational. + * User action: + * Analyze the number of available NPIV ports on the adjacent fibre channel + * switch and how they are used. If necessary, reconfigure your fibre channel + * fabric to accommodate the required NPIV ports. + */ + +/*? + * Text: "%s: 0x%x is not a valid transfer protocol status\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: status information + * Description: + * The transfer protocol status information reported by the FCP adapter is not + * a valid status for the zfcp device driver. The zfcp device driver stopped + * using the FCP device. + * User action: + * Gather Linux debug data, collect the FCP adapter hardware logs, and report + * this problem to your support organization. + */ + +/*? + * Text: "%s: Unknown or unsupported arbitrated loop fibre channel topology detected\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The FCP device is connected to a fibre channel arbitrated loop or the FCP adapter + * reported an unknown fibre channel topology. The zfcp device driver supports + * point-to-point connections and switched fibre channel fabrics but not arbitrated + * loop topologies. The FCP device cannot be used. + * User action: + * Check the fibre channel setup and ensure that only supported topologies are + * connected to the FCP adapter. + */ + +/*? + * Text: "%s: FCP adapter maximum QTCB size (%d bytes) is too small\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: maximum supported size + * @3: requested QTCB size + * Description: + * The queue transfer control block (QTCB) size requested by the zfcp + * device driver is not supported by the FCP adapter hardware. + * User action: + * Update the firmware on your FCP adapter hardware to the latest + * available level and update the Linux kernel to the latest supported + * level. If the problem persists, contact your support organization. + */ + +/*? + * Text: "%s: The FCP adapter only supports newer control block versions\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The protocol supported by the FCP adapter is not compatible with the zfcp + * device driver. + * User action: + * Upgrade your Linux kernel to a level that includes a zfcp device driver + * with support for the control block version required by your FCP adapter. + */ + +/*? + * Text: "%s: The FCP adapter only supports older control block versions\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * Description: + * The protocol supported by the FCP adapter is not compatible with the zfcp + * device driver. + * User action: + * Install the latest firmware on your FCP adapter. + */ + +/*? + * Text: "%s: Not enough FCP adapter resources to open remote port 0x%016Lx\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: WWPN + * Description: + * Each port that is opened consumes physical resources of the FCP adapter to + * which it is attached. These resources are exhausted and the specified port + * cannot be opened. + * User action: + * Reduce the total number of remote ports that are attached to the + * FCP adapter. + */ + +/*? + * Text: "%s: LUN 0x%Lx on port 0x%Lx is already in use by CSS%d, MIF Image ID %x\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: LUN + * @3: remote port WWPN + * @4: channel subsystem ID + * @5: MIF Image ID of the LPAR + * Description: + * The SCSI device at the indicated LUN is already in use by another system. + * Only one system at a time can use the SCSI device. + * User action: + * Ensure that the other system stops using the device before trying to use it. + */ + +/*? + * Text: "%s: No handle is available for LUN 0x%016Lx on port 0x%016Lx\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: LUN + * @3: WWPN + * Description: + * The FCP adapter can only open a limited number of SCSI devices. This limit + * has been reached and the SCSI device at the indicated LUN cannot be opened. + * User action: + * For FCP subchannels running in non-NPIV mode, check all SCSI + * devices opened through the FCP adapter and close some of them. For + * FCP subchannels running in NPIV mode, verify the SAN zoning and + * host connections on the storage systems. Ensure that the zoning and + * host connections only allow access to the required LUNs. As a + * workaround, disable the automatic LUN scanning by setting the + * zfcp.allow_lun_scan kernel parameter or the allow_lun_scan module + * parameter to 0. + */ + +/*? + * Text: "%s: Incorrect direction %d, LUN 0x%016Lx on port 0x%016Lx closed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: value in direction field + * @3: LUN + * @4: WWPN + * Description: + * The direction field in a SCSI request contains an incorrect value. The zfcp + * device driver closed down the SCSI device at the indicated LUN. + * User action: + * Gather Linux debug data and report this problem to your support organization. + */ + +/*? + * Text: "%s: Incorrect CDB length %d, LUN 0x%016Lx on port 0x%016Lx closed\n" + * Severity: Error + * Parameter: + * @1: bus ID of the zfcp device + * @2: value in length field + * @3: LUN + * @4: WWPN + * Description: + * The control-data-block (CDB) length field in a SCSI request is not valid or + * too large for the FCP adapter. The zfcp device driver closed down the SCSI + * device at the indicated LUN. + * User action: + * Gather Linux debug data and report this problem to your support organization. + */ + +/*? + * Text: "%s: Opening WKA port 0x%x failed\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: destination ID of the WKA port + * Description: + * The FCP adapter rejected a request to open the specified + * well-known address (WKA) port. No retry is possible. + * User action: + * Verify the setup and check if the maximum number of remote ports + * used through this adapter is below the maximum allowed. If the + * problem persists, gather Linux debug data, collect the FCP adapter + * hardware logs, and report the problem to your support organization. + */ + +/*? + * Text: "%s: The name server reported %d words residual data\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: number of words in residual data + * Description: + * The fibre channel name server sent too much information about remote ports. + * The zfcp device driver did not receive sufficient information to attach all + * available remote ports in the SAN. + * User action: + * Verify that you are running the latest firmware level on the FCP + * adapter. Check your SAN setup and consider reducing the number of ports + * visible to the FCP adapter by using more restrictive zoning in the SAN. + */ + +/*? + * Text: "%s: A port opened with WWPN 0x%016Lx returned data that identifies it as WWPN 0x%016Lx\n" + * Severity: Warning + * Parameter: + * @1: bus ID of the zfcp device + * @2: expected WWPN + * @3: reported WWPN + * Description: + * A remote port was opened successfully, but it reported an + * unexpected WWPN in the returned port login (PLOGI) data. This + * condition might have been caused by a change applied to the SAN + * configuration while the port was being opened. + * User action: + * If this condition is only temporary and access to the remote port + * is possible, no action is required. If the condition persists, + * identify the storage system with the specified WWPN and contact the + * support organization of the storage system. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/s390/zpci +++ linux-azure-4.11.0/Documentation/kmsg/s390/zpci @@ -0,0 +1,42 @@ +/*? + * Text: "%s: Event 0x%x reconfigured PCI function 0x%x\n" + * Severity: Informational + * Parameter: + * @1: device name of the function + * @2: PCI event code + * @3: function ID + * Description: + * The availability of a PCI function has changed. + * Possible reasons for the change include PCI configuration actions on the + * Hardware Management Console or hypervisor. + * For shared PCI functions, the function might also have been reserved or + * released by another system. + * If the device name of a function is shown as 'n/a', the device registration + * with the PCI device driver has not completed. + * The function ID identifies the function to the I/O configuration (IOCDS). + * The PCI event code can be useful diagnostic information for your support + * organization. + * User action: + * None. + */ + +/*? + * Text: "%s: Event 0x%x reports an error for PCI function 0x%x\n" + * Severity: Error + * Parameter: + * @1: device name of the function + * @2: PCI event code + * @3: function ID + * Description: + * A PCI function entered an error state from which it cannot recover + * automatically. + * User action: + * Trigger a recovery action by writing '1' to the 'recover' sysfs attribute + * of the PCI function. + * In sysfs, PCI functions are represented as /sys/bus/pci/devices/, + * where is the device name of the function. + * If the device name of a function is shown as 'n/a', the device + * registration with the PCI device driver has not completed. + * If the problem persists, contact your support organization. + */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ --- linux-azure-4.11.0.orig/Documentation/kmsg/sbp_target +++ linux-azure-4.11.0/Documentation/kmsg/sbp_target @@ -0,0 +1,49 @@ +/*? Text: "ABORT TASK SET not implemented\n" */ +/*? Text: "ABORT TASK not implemented\n" */ +/*? Text: "Cannot change the directory_id on an active target.\n" */ +/*? Text: "Cannot enable a target with no LUNs!\n" */ +/*? Text: "Could not update Config ROM\n" */ +/*? Text: "Ignoring ORB_POINTER write while active.\n" */ +/*? Text: "LOGICAL UNIT RESET not implemented\n" */ +/*? Text: "Node ACL not found for %s\n" */ +/*? Text: "Only one TPG per Unit is possible.\n" */ +/*? Text: "QUERY LOGINS not implemented\n" */ +/*? Text: "Reconnect timer expired for node: %016llx\n" */ +/*? Text: "SET PASSWORD not implemented\n" */ +/*? Text: "TARGET RESET not implemented\n" */ +/*? Text: "Unable to allocate struct sbp_nacl\n" */ +/*? Text: "Unable to allocate struct sbp_tpg\n" */ +/*? Text: "Unable to allocate struct sbp_tport\n" */ +/*? Text: "Waiting for reconnect from node: %016llx\n" */ +/*? Text: "cannot find login: %d\n" */ +/*? Text: "failed to allocate login descriptor\n" */ +/*? Text: "failed to allocate login response block\n" */ +/*? Text: "failed to allocate session descriptor\n" */ +/*? Text: "failed to init se_session\n" */ +/*? Text: "failed to map command block handler: %d\n" */ +/*? Text: "failed to read peer GUID: %d\n" */ +/*? Text: "ignoring management request while busy\n" */ +/*? Text: "ignoring request from foreign node (%x != %x)\n" */ +/*? Text: "ignoring request with wrong generation\n" */ +/*? Text: "initiator already logged-in\n" */ +/*? Text: "login to unknown LUN: %d\n" */ +/*? Text: "logout from different node ID\n" */ +/*? Text: "max number of logins reached\n" */ +/*? Text: "mgt_agent LOGIN to LUN %d from %016llx\n" */ +/*? Text: "mgt_agent LOGOUT from LUN %d session %d\n" */ +/*? Text: "mgt_agent RECONNECT from %016llx\n" */ +/*? Text: "mgt_agent RECONNECT login GUID doesn't match\n" */ +/*? Text: "mgt_agent RECONNECT unknown login ID\n" */ +/*? Text: "mgt_orb bad request\n" */ +/*? Text: "netif_stop_queue() cannot be called before register_netdev()\n" */ +/*? Text: "refusing exclusive login with other active logins\n" */ +/*? Text: "refusing login while another exclusive login present\n" */ +/*? Text: "sbp_run_transaction: page size ignored\n" */ +/*? Text: "sbp_send_sense: unknown sense format: 0x%x\n" */ +/*? Text: "target_fabric_configfs_init() failed\n" */ +/*? Text: "target_fabric_configfs_register() failed for SBP\n" */ +/*? Text: "unknown management function 0x%x\n" */ +/*? Text: "unlink LUN: failed to update unit directory\n" */ +/*? Text: "flen=%u proglen=%u pass=%u image=%pK from=%s pid=%d\n" */ +/*? Text: "%s selects TX queue %d, but real number of TX queues is %d\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ \ No newline at end of file --- linux-azure-4.11.0.orig/Documentation/kmsg/zram +++ linux-azure-4.11.0/Documentation/kmsg/zram @@ -0,0 +1,34 @@ +/*? Text: "Error allocating compressor buffer space\n" */ +/*? Text: "Error allocating memory for compressed page: %u, size=%zu\n" */ +/*? Text: "Error creating memory pool\n" */ +/*? Text: "num_devices not specified. Using default: 1\n" */ +/*? Text: "Error allocating compressor working memory!\n" */ +/*? Text: "Error allocating zram address table\n" */ +/*? Text: "Unable to get major number\n" */ +/*? Text: "Compression failed! err=%d\n" */ +/*? Text: "Decompression failed! err=%d, page=%u\n" */ +/*? Text: "There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1%% of the size of the disk when not in use so a huge zram is wasteful.\n\tMemory Size: %zu kB\n\tSize you selected: %llu kB\nContinuing anyway ...\n" */ +/*? Text: "disk size not provided. You can use disksize_kb module param to specify size.\nUsing default: (%u%% of RAM).\n" */ +/*? Text: "Error creating sysfs group" */ +/*? Text: "Error allocating memory for incompressible page: %u\n" */ +/*? Text: "Creating %u devices ...\n" */ +/*? Text: "Initialization failed: err=%d\n" */ +/*? Text: "Error allocating disk queue for device %d\n" */ +/*? Text: "Error allocating disk structure for device %d\n" */ +/*? Text: "Invalid value for num_devices: %u\n" */ +/*? Text: "Error allocating temp memory!\n" */ +/*? Text: "Unable to allocate temp memory\n" */ +/*? Text: "Created %u device(s) ...\n" */ +/*? Text: "There is little point creating a zram of greater than twice the size of memory since we expect a 2:1 compression ratio. Note that zram uses about 0.1%% of the size of the disk when not in use so a huge zram is wasteful.\n\tMemory Size: %lu kB\n\tSize you selected: %llu kB\nContinuing anyway ...\n" */ +/*? Text: "Cannot change disksize for initialized device\n" */ +/*? Text: "Can't change algorithm for initialized device\n" */ +/*? Text: "Cannot initialise %s compressing backend\n" */ +/*? Text: "Cannot change max compression streams\n" */ +/*? Text: "Destroyed %u device(s)\n" */ +/*? Text: "Created %u device(s)\n" */ +/*? Text: "Unable to register zram-control class\n" */ +/*? Text: "Removed device: %s\n" */ +/*? Text: "Added device: %s\n" */ +/*? Text: "Error creating sysfs group for device %d\n" */ +/*? Text: "Error allocating memory for compressed page: %u, size=%u\n" */ +/*? Text: "%s: %d output lines suppressed due to ratelimiting\n" */ \ No newline at end of file --- linux-azure-4.11.0.orig/Documentation/networking/netvsc.txt +++ linux-azure-4.11.0/Documentation/networking/netvsc.txt @@ -0,0 +1,63 @@ +Hyper-V network driver +====================== + +Compatibility +============= + +This driver is compatible with Windows Server 2012 R2, 2016 and +Windows 10. + +Features +======== + + Checksum offload + ---------------- + The netvsc driver supports checksum offload as long as the + Hyper-V host version does. Windows Server 2016 and Azure + support checksum offload for TCP and UDP for both IPv4 and + IPv6. Windows Server 2012 only supports checksum offload for TCP. + + Receive Side Scaling + -------------------- + Hyper-V supports receive side scaling. For TCP, packets are + distributed among available queues based on IP address and port + number. Current versions of Hyper-V host, only distribute UDP + packets based on the IP source and destination address. + The port number is not used as part of the hash value for UDP. + Fragmented IP packets are not distributed between queues; + all fragmented packets arrive on the first channel. + + Generic Receive Offload, aka GRO + -------------------------------- + The driver supports GRO and it is enabled by default. GRO coalesces + like packets and significantly reduces CPU usage under heavy Rx + load. + + SR-IOV support + -------------- + Hyper-V supports SR-IOV as a hardware acceleration option. If SR-IOV + is enabled in both the vSwitch and the guest configuration, then the + Virtual Function (VF) device is passed to the guest as a PCI + device. In this case, both a synthetic (netvsc) and VF device are + visible in the guest OS and both NIC's have the same MAC address. + + The VF is enslaved by netvsc device. The netvsc driver will transparently + switch the data path to the VF when it is available and up. + Network state (addresses, firewall, etc) should be applied only to the + netvsc device; the slave device should not be accessed directly in + most cases. The exceptions are if some special queue discipline or + flow direction is desired, these should be applied directly to the + VF slave device. + + Receive Buffer + -------------- + Packets are received into a receive area which is created when device + is probed. The receive area is broken into MTU sized chunks and each may + contain one or more packets. The number of receive sections may be changed + via ethtool Rx ring parameters. + + There is a similar send buffer which is used to aggregate packets for sending. + The send area is broken into chunks of 6144 bytes, each of section may + contain one or more packets. The send buffer is an optimization, the driver + will use slower method to handle very large packets or if the send buffer + area is exhausted. --- linux-azure-4.11.0.orig/Documentation/perf/qcom_l3_pmu.txt +++ linux-azure-4.11.0/Documentation/perf/qcom_l3_pmu.txt @@ -0,0 +1,25 @@ +Qualcomm Datacenter Technologies L3 Cache Performance Monitoring Unit (PMU) +=========================================================================== + +This driver supports the L3 cache PMUs found in Qualcomm Datacenter Technologies +Centriq SoCs. The L3 cache on these SOCs is composed of multiple slices, shared +by all cores within a socket. Each slice is exposed as a separate uncore perf +PMU with device name l3cache__. User space is responsible +for aggregating across slices. + +The driver provides a description of its available events and configuration +options in sysfs, see /sys/devices/l3cache*. Given that these are uncore PMUs +the driver also exposes a "cpumask" sysfs attribute which contains a mask +consisting of one CPU per socket which will be used to handle all the PMU +events on that socket. + +The hardware implements 32bit event counters and has a flat 8bit event space +exposed via the "event" format attribute. In addition to the 32bit physical +counters the driver supports virtual 64bit hardware counters by using hardware +counter chaining. This feature is exposed via the "lc" (long counter) format +flag. E.g.: + + perf stat -e l3cache_0_0/read-miss,lc/ + +Given that these are uncore PMUs the driver does not support sampling, therefore +"perf record" will not work. Per-task perf sessions are not supported. --- linux-azure-4.11.0.orig/Documentation/powerpc/cxl.txt +++ linux-azure-4.11.0/Documentation/powerpc/cxl.txt @@ -21,7 +21,7 @@ Hardware overview ================= - POWER8 FPGA + POWER8/9 FPGA +----------+ +---------+ | | | | | CPU | | AFU | @@ -34,7 +34,7 @@ | | CAPP |<------>| | +---+------+ PCIE +---------+ - The POWER8 chip has a Coherently Attached Processor Proxy (CAPP) + The POWER8/9 chip has a Coherently Attached Processor Proxy (CAPP) unit which is part of the PCIe Host Bridge (PHB). This is managed by Linux by calls into OPAL. Linux doesn't directly program the CAPP. @@ -59,6 +59,17 @@ the fault. The context to which this fault is serviced is based on who owns that acceleration function. + POWER8 <-----> PSL Version 8 is compliant to the CAIA Version 1.0. + POWER9 <-----> PSL Version 9 is compliant to the CAIA Version 2.0. + This PSL Version 9 provides new features such as: + * Interaction with the nest MMU on the P9 chip. + * Native DMA support. + * Supports sending ASB_Notify messages for host thread wakeup. + * Supports Atomic operations. + * .... + + Cards with a PSL9 won't work on a POWER8 system and cards with a + PSL8 won't work on a POWER9 system. AFU Modes ========= --- linux-azure-4.11.0.orig/Documentation/powerpc/cxlflash.txt +++ linux-azure-4.11.0/Documentation/powerpc/cxlflash.txt @@ -124,8 +124,8 @@ http://github.com/open-power/capiflash -CXL Flash Driver IOCTLs -======================= +CXL Flash Driver LUN IOCTLs +=========================== Users, such as the block library, that wish to interface with a flash device (LUN) via user space access need to use the services provided @@ -239,6 +239,11 @@ resource handle that is provided is already referencing provisioned storage. This is reflected by the last LBA being a non-zero value. + When a LUN is accessible from more than one port, this ioctl will + return with the DK_CXLFLASH_ALL_PORTS_ACTIVE return flag set. This + provides the user with a hint that I/O can be retried in the event + of an I/O error as the LUN can be reached over multiple paths. + DK_CXLFLASH_VLUN_RESIZE ----------------------- This ioctl is responsible for resizing a previously created virtual @@ -252,6 +257,12 @@ operating in the virtual mode and used to program a LUN translation table that the AFU references when provided with a resource handle. + This ioctl can return -EAGAIN if an AFU sync operation takes too long. + In addition to returning a failure to user, cxlflash will also schedule + an asynchronous AFU reset. Should the user choose to retry the operation, + it is expected to succeed. If this ioctl fails with -EAGAIN, the user + can either retry the operation or treat it as a failure. + DK_CXLFLASH_RELEASE ------------------- This ioctl is responsible for releasing a previously obtained @@ -304,6 +315,12 @@ clone. This is to avoid a stale entry in the file descriptor table of the child process. + This ioctl can return -EAGAIN if an AFU sync operation takes too long. + In addition to returning a failure to user, cxlflash will also schedule + an asynchronous AFU reset. Should the user choose to retry the operation, + it is expected to succeed. If this ioctl fails with -EAGAIN, the user + can either retry the operation or treat it as a failure. + DK_CXLFLASH_VERIFY ------------------ This ioctl is used to detect various changes such as the capacity of @@ -350,3 +367,63 @@ exclusive user space access (superpipe). In case a LUN is visible across multiple ports and adapters, this ioctl is used to uniquely identify each LUN by its World Wide Node Name (WWNN). + + +CXL Flash Driver Host IOCTLs +============================ + + Each host adapter instance that is supported by the cxlflash driver + has a special character device associated with it to enable a set of + host management function. These character devices are hosted in a + class dedicated for cxlflash and can be accessed via /dev/cxlflash/*. + + Applications can be written to perform various functions using the + host ioctl APIs below. + + The structure definitions for these IOCTLs are available in: + uapi/scsi/cxlflash_ioctl.h + +HT_CXLFLASH_LUN_PROVISION +------------------------- + This ioctl is used to create and delete persistent LUNs on cxlflash + devices that lack an external LUN management interface. It is only + valid when used with AFUs that support the LUN provision capability. + + When sufficient space is available, LUNs can be created by specifying + the target port to host the LUN and a desired size in 4K blocks. Upon + success, the LUN ID and WWID of the created LUN will be returned and + the SCSI bus can be scanned to detect the change in LUN topology. Note + that partial allocations are not supported. Should a creation fail due + to a space issue, the target port can be queried for its current LUN + geometry. + + To remove a LUN, the device must first be disassociated from the Linux + SCSI subsystem. The LUN deletion can then be initiated by specifying a + target port and LUN ID. Upon success, the LUN geometry associated with + the port will be updated to reflect new number of provisioned LUNs and + available capacity. + + To query the LUN geometry of a port, the target port is specified and + upon success, the following information is presented: + + - Maximum number of provisioned LUNs allowed for the port + - Current number of provisioned LUNs for the port + - Maximum total capacity of provisioned LUNs for the port (4K blocks) + - Current total capacity of provisioned LUNs for the port (4K blocks) + + With this information, the number of available LUNs and capacity can be + can be calculated. + +HT_CXLFLASH_AFU_DEBUG +--------------------- + This ioctl is used to debug AFUs by supporting a command pass-through + interface. It is only valid when used with AFUs that support the AFU + debug capability. + + With exception of buffer management, AFU debug commands are opaque to + cxlflash and treated as pass-through. For debug commands that do require + data transfer, the user supplies an adequately sized data buffer and must + specify the data transfer direction with respect to the host. There is a + maximum transfer size of 256K imposed. Note that partial read completions + are not supported - when errors are experienced with a host read data + transfer, the data buffer is not copied back to the user. --- linux-azure-4.11.0.orig/Documentation/printk-formats.txt +++ linux-azure-4.11.0/Documentation/printk-formats.txt @@ -324,6 +324,12 @@ Passed by reference. +Kernel messages: + + %pj 123456 + + For generating the jhash of a string truncated to six digits + If you add other %p extensions, please extend lib/test_printf.c with one or more test cases, if at all feasible. --- linux-azure-4.11.0.orig/Documentation/virtual/kvm/devices/vfio.txt +++ linux-azure-4.11.0/Documentation/virtual/kvm/devices/vfio.txt @@ -16,7 +16,21 @@ KVM_DEV_VFIO_GROUP attributes: KVM_DEV_VFIO_GROUP_ADD: Add a VFIO group to VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. KVM_DEV_VFIO_GROUP_DEL: Remove a VFIO group from VFIO-KVM device tracking + kvm_device_attr.addr points to an int32_t file descriptor + for the VFIO group. + KVM_DEV_VFIO_GROUP_SET_SPAPR_TCE: attaches a guest visible TCE table + allocated by sPAPR KVM. + kvm_device_attr.addr points to a struct: -For each, kvm_device_attr.addr points to an int32_t file descriptor -for the VFIO group. + struct kvm_vfio_spapr_tce { + __s32 groupfd; + __s32 tablefd; + }; + + where + @groupfd is a file descriptor for a VFIO group; + @tablefd is a file descriptor for a TCE table allocated via + KVM_CREATE_SPAPR_TCE. --- linux-azure-4.11.0.orig/Documentation/vm/ksm.txt +++ linux-azure-4.11.0/Documentation/vm/ksm.txt @@ -98,6 +98,50 @@ it is only effective for pages merged after the change. Default: 0 (normal KSM behaviour as in earlier releases) +max_page_sharing - Maximum sharing allowed for each KSM page. This + enforces a deduplication limit to avoid the virtual + memory rmap lists to grow too large. The minimum + value is 2 as a newly created KSM page will have at + least two sharers. The rmap walk has O(N) + complexity where N is the number of rmap_items + (i.e. virtual mappings) that are sharing the page, + which is in turn capped by max_page_sharing. So + this effectively spread the the linear O(N) + computational complexity from rmap walk context + over different KSM pages. The ksmd walk over the + stable_node "chains" is also O(N), but N is the + number of stable_node "dups", not the number of + rmap_items, so it has not a significant impact on + ksmd performance. In practice the best stable_node + "dup" candidate will be kept and found at the head + of the "dups" list. The higher this value the + faster KSM will merge the memory (because there + will be fewer stable_node dups queued into the + stable_node chain->hlist to check for pruning) and + the higher the deduplication factor will be, but + the slowest the worst case rmap walk could be for + any given KSM page. Slowing down the rmap_walk + means there will be higher latency for certain + virtual memory operations happening during + swapping, compaction, NUMA balancing and page + migration, in turn decreasing responsiveness for + the caller of those virtual memory operations. The + scheduler latency of other tasks not involved with + the VM operations doing the rmap walk is not + affected by this parameter as the rmap walks are + always schedule friendly themselves. + +stable_node_chains_prune_millisecs - How frequently to walk the whole + list of stable_node "dups" linked in the + stable_node "chains" in order to prune stale + stable_nodes. Smaller milllisecs values will free + up the KSM metadata with lower latency, but they + will make ksmd use more CPU during the scan. This + only applies to the stable_node chains so it's a + noop if not a single KSM page hit the + max_page_sharing yet (there would be no stable_node + chains in such case). + The effectiveness of KSM and MADV_MERGEABLE is shown in /sys/kernel/mm/ksm/: pages_shared - how many shared pages are being used @@ -106,10 +150,29 @@ pages_volatile - how many pages changing too fast to be placed in a tree full_scans - how many times all mergeable areas have been scanned +stable_node_chains - number of stable node chains allocated, this is + effectively the number of KSM pages that hit the + max_page_sharing limit +stable_node_dups - number of stable node dups queued into the + stable_node chains + A high ratio of pages_sharing to pages_shared indicates good sharing, but a high ratio of pages_unshared to pages_sharing indicates wasted effort. pages_volatile embraces several different kinds of activity, but a high proportion there would also indicate poor use of madvise MADV_MERGEABLE. +The maximum possible page_sharing/page_shared ratio is limited by the +max_page_sharing tunable. To increase the ratio max_page_sharing must +be increased accordingly. + +The stable_node_dups/stable_node_chains ratio is also affected by the +max_page_sharing tunable, and an high ratio may indicate fragmentation +in the stable_node dups, which could be solved by introducing +fragmentation algorithms in ksmd which would refile rmap_items from +one stable_node dup to another stable_node dup, in order to freeup +stable_node "dups" with few rmap_items in them, but that may increase +the ksmd CPU usage and possibly slowdown the readonly computations on +the KSM pages of the applications. + Izik Eidus, Hugh Dickins, 17 Nov 2009 --- linux-azure-4.11.0.orig/MAINTAINERS +++ linux-azure-4.11.0/MAINTAINERS @@ -2319,6 +2319,19 @@ F: include/uapi/linux/audit.h F: kernel/audit* +AUFS (advanced multi layered unification filesystem) FILESYSTEM +M: "J. R. Okajima" +L: linux-unionfs@vger.kernel.org +L: aufs-users@lists.sourceforge.net (members only) +W: http://aufs.sourceforge.net +T: git://github.com/sfjro/aufs4-linux.git +S: Supported +F: Documentation/filesystems/aufs/ +F: Documentation/ABI/testing/debugfs-aufs +F: Documentation/ABI/testing/sysfs-aufs +F: fs/aufs/ +F: include/uapi/linux/aufs_type.h + AUXILIARY DISPLAY DRIVERS M: Miguel Ojeda Sandonis W: http://miguelojeda.es/auxdisplay.htm @@ -5959,6 +5972,14 @@ F: net/802/hippi.c F: drivers/net/hippi/ +HISILICON LPC BUS DRIVER +M: Zhichang Yuan +L: linux-arm-kernel@lists.infradead.org +W: http://www.hisilicon.com +S: Maintained +F: drivers/bus/hisi_lpc.c +F: Documentation/devicetree/bindings/arm/hisilicon/hisilicon-low-pin-count.txt + HISILICON NETWORK SUBSYSTEM DRIVER M: Yisen Zhuang M: Salil Mehta @@ -6065,7 +6086,9 @@ M: Stephen Hemminger L: devel@linuxdriverproject.org S: Maintained +F: Documentation/networking/netvsc.txt F: arch/x86/include/asm/mshyperv.h +F: arch/x86/include/asm/trace/hyperv.h F: arch/x86/include/uapi/asm/hyperv.h F: arch/x86/kernel/cpu/mshyperv.c F: arch/x86/hyperv @@ -6077,6 +6100,7 @@ F: drivers/scsi/storvsc_drv.c F: drivers/uio/uio_hv_generic.c F: drivers/video/fbdev/hyperv_fb.c +F: net/vmw_vsock/hyperv_transport.c F: include/linux/hyperv.h F: tools/hv/ F: Documentation/ABI/stable/sysfs-bus-vmbus --- linux-azure-4.11.0.orig/Makefile +++ linux-azure-4.11.0/Makefile @@ -1,6 +1,6 @@ VERSION = 4 PATCHLEVEL = 11 -SUBLEVEL = 0 +SUBLEVEL = 12 EXTRAVERSION = NAME = Fearless Coyote @@ -182,6 +182,20 @@ KBUILD_CHECKSRC = 0 endif +# Call message checker as part of the C compilation +# +# Use 'make D=1' to enable checking +# Use 'make D=2' to create the message catalog + +ifdef D + ifeq ("$(origin D)", "command line") + KBUILD_KMSG_CHECK = $(D) + endif +endif +ifndef KBUILD_KMSG_CHECK + KBUILD_KMSG_CHECK = 0 +endif + # Use make M=dir to specify directory of external module to build # Old syntax make ... SUBDIRS=$PWD is still supported # Setting the environment variable KBUILD_EXTMOD take precedence @@ -365,6 +379,7 @@ CHECKFLAGS := -D__linux__ -Dlinux -D__STDC__ -Dunix -D__unix__ \ -Wbitwise -Wno-return-void $(CF) +KMSG_CHECK = $(srctree)/scripts/kmsg-doc NOSTDINC_FLAGS = CFLAGS_MODULE = AFLAGS_MODULE = @@ -375,6 +390,12 @@ CFLAGS_GCOV := -fprofile-arcs -ftest-coverage -fno-tree-loop-im $(call cc-disable-warning,maybe-uninitialized,) CFLAGS_KCOV := $(call cc-option,-fsanitize-coverage=trace-pc,) +# Prefer linux-backports-modules +ifneq ($(KBUILD_SRC),) +ifneq ($(shell if test -e $(KBUILD_OUTPUT)/ubuntu-build; then echo yes; fi),yes) +UBUNTUINCLUDE := -I/usr/src/linux-headers-lbm-$(KERNELRELEASE) +endif +endif # Use USERINCLUDE when you must reference the UAPI directories only. USERINCLUDE := \ @@ -387,6 +408,7 @@ # Use LINUXINCLUDE when you must reference the include/ directory. # Needed to be compatible with the O= option LINUXINCLUDE := \ + $(UBUNTUINCLUDE) \ -I$(srctree)/arch/$(hdr-arch)/include \ -I$(objtree)/arch/$(hdr-arch)/include/generated/uapi \ -I$(objtree)/arch/$(hdr-arch)/include/generated \ @@ -395,6 +417,9 @@ LINUXINCLUDE += $(filter-out $(LINUXINCLUDE),$(USERINCLUDE)) +# UBUNTU: Include our third party driver stuff too +LINUXINCLUDE += -Iubuntu/include $(if $(KBUILD_SRC),-I$(srctree)/ubuntu/include) + KBUILD_CPPFLAGS := -D__KERNEL__ KBUILD_CFLAGS := -Wall -Wundef -Wstrict-prototypes -Wno-trigraphs \ @@ -424,6 +449,7 @@ export KBUILD_CPPFLAGS NOSTDINC_FLAGS LINUXINCLUDE OBJCOPYFLAGS LDFLAGS export KBUILD_CFLAGS CFLAGS_KERNEL CFLAGS_MODULE CFLAGS_GCOV CFLAGS_KCOV CFLAGS_KASAN CFLAGS_UBSAN export KBUILD_AFLAGS AFLAGS_KERNEL AFLAGS_MODULE +export KBUILD_KMSG_CHECK KMSG_CHECK export KBUILD_AFLAGS_MODULE KBUILD_CFLAGS_MODULE KBUILD_LDFLAGS_MODULE export KBUILD_AFLAGS_KERNEL KBUILD_CFLAGS_KERNEL export KBUILD_ARFLAGS @@ -563,7 +589,7 @@ # Objects we will link into vmlinux / subdirs we need to visit init-y := init/ -drivers-y := drivers/ sound/ firmware/ +drivers-y := drivers/ sound/ firmware/ ubuntu/ net-y := net/ libs-y := lib/ core-y := usr/ @@ -622,6 +648,12 @@ # Defaults to vmlinux, but the arch makefile usually adds further targets all: vmlinux +# force no-pie for distro compilers that enable pie by default +KBUILD_CFLAGS += $(call cc-option, -fno-pie) +KBUILD_CFLAGS += $(call cc-option, -no-pie) +KBUILD_AFLAGS += $(call cc-option, -fno-pie) +KBUILD_CPPFLAGS += $(call cc-option, -fno-pie) + # The arch Makefile can set ARCH_{CPP,A,C}FLAGS to override the default # values of the respective KBUILD_* variables ARCH_CPPFLAGS := @@ -1150,6 +1182,7 @@ $(error Headers not exportable for the $(SRCARCH) architecture)) $(Q)$(MAKE) $(hdr-inst)=include/uapi $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi/asm $(hdr-dst) + $(Q)$(MAKE) $(hdr-inst)=ubuntu/include dst=include oldheaders= PHONY += headers_check_all headers_check_all: headers_install_all @@ -1159,6 +1192,7 @@ headers_check: headers_install $(Q)$(MAKE) $(hdr-inst)=include/uapi HDRCHECK=1 $(Q)$(MAKE) $(hdr-inst)=arch/$(hdr-arch)/include/uapi/asm $(hdr-dst) HDRCHECK=1 + $(Q)$(MAKE) $(hdr-inst)=ubuntu/include dst=include oldheaders= HDRCHECK=1 # --------------------------------------------------------------------------- # Kernel selftest --- linux-azure-4.11.0.orig/arch/alpha/kernel/osf_sys.c +++ linux-azure-4.11.0/arch/alpha/kernel/osf_sys.c @@ -1199,8 +1199,10 @@ if (!access_ok(VERIFY_WRITE, ur, sizeof(*ur))) return -EFAULT; - err = 0; - err |= put_user(status, ustatus); + err = put_user(status, ustatus); + if (ret < 0) + return err ? err : ret; + err |= __put_user(r.ru_utime.tv_sec, &ur->ru_utime.tv_sec); err |= __put_user(r.ru_utime.tv_usec, &ur->ru_utime.tv_usec); err |= __put_user(r.ru_stime.tv_sec, &ur->ru_stime.tv_sec); --- linux-azure-4.11.0.orig/arch/arc/mm/mmap.c +++ linux-azure-4.11.0/arch/arc/mm/mmap.c @@ -65,7 +65,7 @@ vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/arm/Kconfig +++ linux-azure-4.11.0/arch/arm/Kconfig @@ -2205,6 +2205,7 @@ source "drivers/Kconfig" source "drivers/firmware/Kconfig" +source "ubuntu/Kconfig" source "fs/Kconfig" --- linux-azure-4.11.0.orig/arch/arm/boot/dts/am335x-sl50.dts +++ linux-azure-4.11.0/arch/arm/boot/dts/am335x-sl50.dts @@ -220,7 +220,7 @@ mmc1_pins: pinmux_mmc1_pins { pinctrl-single,pins = < - AM33XX_IOPAD(0x960, PIN_INPUT | MUX_MODE7) /* spi0_cs1.gpio0_6 */ + AM33XX_IOPAD(0x96c, PIN_INPUT | MUX_MODE7) /* uart0_rtsn.gpio1_9 */ >; }; @@ -280,10 +280,6 @@ AM33XX_IOPAD(0x834, PIN_INPUT_PULLUP | MUX_MODE7) /* nKbdReset - gpmc_ad13.gpio1_13 */ AM33XX_IOPAD(0x838, PIN_INPUT_PULLUP | MUX_MODE7) /* nDispReset - gpmc_ad14.gpio1_14 */ AM33XX_IOPAD(0x844, PIN_INPUT_PULLUP | MUX_MODE7) /* USB1_enPower - gpmc_a1.gpio1_17 */ - /* AVR Programming - SPI Bus (bit bang) - Screen and Keyboard */ - AM33XX_IOPAD(0x954, PIN_INPUT_PULLUP | MUX_MODE7) /* Kbd/Disp/BattMOSI spi0_d0.gpio0_3 */ - AM33XX_IOPAD(0x958, PIN_INPUT_PULLUP | MUX_MODE7) /* Kbd/Disp/BattMISO spi0_d1.gpio0_4 */ - AM33XX_IOPAD(0x950, PIN_INPUT_PULLUP | MUX_MODE7) /* Kbd/Disp/BattSCLK spi0_clk.gpio0_2 */ /* PDI Bus - Battery system */ AM33XX_IOPAD(0x840, PIN_INPUT_PULLUP | MUX_MODE7) /* nBattReset gpmc_a0.gpio1_16 */ AM33XX_IOPAD(0x83c, PIN_INPUT_PULLUP | MUX_MODE7) /* BattPDIData gpmc_ad15.gpio1_15 */ @@ -384,7 +380,7 @@ pinctrl-names = "default"; pinctrl-0 = <&mmc1_pins>; bus-width = <4>; - cd-gpios = <&gpio0 6 GPIO_ACTIVE_LOW>; + cd-gpios = <&gpio1 9 GPIO_ACTIVE_LOW>; vmmc-supply = <&vmmcsd_fixed>; }; --- linux-azure-4.11.0.orig/arch/arm/boot/dts/at91-sama5d3_xplained.dts +++ linux-azure-4.11.0/arch/arm/boot/dts/at91-sama5d3_xplained.dts @@ -162,9 +162,10 @@ }; adc0: adc@f8018000 { + atmel,adc-vref = <3300>; + atmel,adc-channels-used = <0xfe>; pinctrl-0 = < &pinctrl_adc0_adtrg - &pinctrl_adc0_ad0 &pinctrl_adc0_ad1 &pinctrl_adc0_ad2 &pinctrl_adc0_ad3 @@ -172,8 +173,6 @@ &pinctrl_adc0_ad5 &pinctrl_adc0_ad6 &pinctrl_adc0_ad7 - &pinctrl_adc0_ad8 - &pinctrl_adc0_ad9 >; status = "okay"; }; --- linux-azure-4.11.0.orig/arch/arm/boot/dts/imx6sx-sdb.dts +++ linux-azure-4.11.0/arch/arm/boot/dts/imx6sx-sdb.dts @@ -12,23 +12,6 @@ model = "Freescale i.MX6 SoloX SDB RevB Board"; }; -&cpu0 { - operating-points = < - /* kHz uV */ - 996000 1250000 - 792000 1175000 - 396000 1175000 - 198000 1175000 - >; - fsl,soc-operating-points = < - /* ARM kHz SOC uV */ - 996000 1250000 - 792000 1175000 - 396000 1175000 - 198000 1175000 - >; -}; - &i2c1 { clock-frequency = <100000>; pinctrl-names = "default"; --- linux-azure-4.11.0.orig/arch/arm/boot/dts/keystone-k2l-netcp.dtsi +++ linux-azure-4.11.0/arch/arm/boot/dts/keystone-k2l-netcp.dtsi @@ -137,8 +137,8 @@ /* NetCP address range */ ranges = <0 0x26000000 0x1000000>; - clocks = <&clkpa>, <&clkcpgmac>, <&chipclk12>, <&clkosr>; - clock-names = "pa_clk", "ethss_clk", "cpts", "osr_clk"; + clocks = <&clkpa>, <&clkcpgmac>, <&chipclk12>; + clock-names = "pa_clk", "ethss_clk", "cpts"; dma-coherent; ti,navigator-dmas = <&dma_gbe 0>, --- linux-azure-4.11.0.orig/arch/arm/boot/dts/keystone-k2l.dtsi +++ linux-azure-4.11.0/arch/arm/boot/dts/keystone-k2l.dtsi @@ -232,6 +232,14 @@ }; }; + osr: sram@70000000 { + compatible = "mmio-sram"; + reg = <0x70000000 0x10000>; + #address-cells = <1>; + #size-cells = <1>; + clocks = <&clkosr>; + }; + dspgpio0: keystone_dsp_gpio@02620240 { compatible = "ti,keystone-dsp-gpio"; gpio-controller; --- linux-azure-4.11.0.orig/arch/arm/include/asm/device.h +++ linux-azure-4.11.0/arch/arm/include/asm/device.h @@ -16,6 +16,9 @@ #ifdef CONFIG_ARM_DMA_USE_IOMMU struct dma_iommu_mapping *mapping; #endif +#ifdef CONFIG_XEN + const struct dma_map_ops *dev_dma_ops; +#endif bool dma_coherent; }; --- linux-azure-4.11.0.orig/arch/arm/include/asm/dma-mapping.h +++ linux-azure-4.11.0/arch/arm/include/asm/dma-mapping.h @@ -16,19 +16,9 @@ extern const struct dma_map_ops arm_dma_ops; extern const struct dma_map_ops arm_coherent_dma_ops; -static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) -{ - if (dev && dev->dma_ops) - return dev->dma_ops; - return &arm_dma_ops; -} - static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { - if (xen_initial_domain()) - return xen_dma_ops; - else - return __generic_dma_ops(NULL); + return &arm_dma_ops; } #define HAVE_ARCH_DMA_SUPPORTED 1 --- linux-azure-4.11.0.orig/arch/arm/include/asm/efi.h +++ linux-azure-4.11.0/arch/arm/include/asm/efi.h @@ -85,6 +85,18 @@ */ #define ZIMAGE_OFFSET_LIMIT SZ_128M #define MIN_ZIMAGE_OFFSET MAX_UNCOMP_KERNEL_SIZE -#define MAX_FDT_OFFSET ZIMAGE_OFFSET_LIMIT + +/* on ARM, the FDT should be located in the first 128 MB of RAM */ +static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) +{ + return dram_base + ZIMAGE_OFFSET_LIMIT; +} + +/* on ARM, the initrd should be loaded in a lowmem region */ +static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, + unsigned long image_addr) +{ + return dram_base + SZ_512M; +} #endif /* _ASM_ARM_EFI_H */ --- linux-azure-4.11.0.orig/arch/arm/include/asm/elf.h +++ linux-azure-4.11.0/arch/arm/include/asm/elf.h @@ -112,12 +112,8 @@ #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +/* This is the base location for PIE (ET_DYN with INTERP) loads. */ +#define ELF_ET_DYN_BASE 0x400000UL /* When the program starts, a1 contains a pointer to a function to be registered with atexit, as per the SVR4 ABI. A value of 0 means we --- linux-azure-4.11.0.orig/arch/arm/include/asm/fixmap.h +++ linux-azure-4.11.0/arch/arm/include/asm/fixmap.h @@ -41,7 +41,7 @@ #define FIXMAP_PAGE_COMMON (L_PTE_YOUNG | L_PTE_PRESENT | L_PTE_XN | L_PTE_DIRTY) -#define FIXMAP_PAGE_NORMAL (FIXMAP_PAGE_COMMON | L_PTE_MT_WRITEBACK) +#define FIXMAP_PAGE_NORMAL (pgprot_kernel | L_PTE_XN) #define FIXMAP_PAGE_RO (FIXMAP_PAGE_NORMAL | L_PTE_RDONLY) /* Used by set_fixmap_(io|nocache), both meant for mapping a device */ --- linux-azure-4.11.0.orig/arch/arm/include/asm/kvm_coproc.h +++ linux-azure-4.11.0/arch/arm/include/asm/kvm_coproc.h @@ -31,7 +31,8 @@ int kvm_handle_cp10_id(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp_0_13_access(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp14_load_store(struct kvm_vcpu *vcpu, struct kvm_run *run); -int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run); +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run); int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run); --- linux-azure-4.11.0.orig/arch/arm/include/asm/module.h +++ linux-azure-4.11.0/arch/arm/include/asm/module.h @@ -18,13 +18,18 @@ }; #endif +struct mod_plt_sec { + struct elf32_shdr *plt; + int plt_count; +}; + struct mod_arch_specific { #ifdef CONFIG_ARM_UNWIND struct unwind_table *unwind[ARM_SEC_MAX]; #endif #ifdef CONFIG_ARM_MODULE_PLTS - struct elf32_shdr *plt; - int plt_count; + struct mod_plt_sec core; + struct mod_plt_sec init; #endif }; --- linux-azure-4.11.0.orig/arch/arm/kernel/module-plts.c +++ linux-azure-4.11.0/arch/arm/kernel/module-plts.c @@ -1,5 +1,5 @@ /* - * Copyright (C) 2014 Linaro Ltd. + * Copyright (C) 2014-2017 Linaro Ltd. * * This program is free software; you can redistribute it and/or modify * it under the terms of the GNU General Public License version 2 as @@ -31,9 +31,17 @@ u32 lit[PLT_ENT_COUNT]; }; +static bool in_init(const struct module *mod, unsigned long loc) +{ + return loc - (u32)mod->init_layout.base < mod->init_layout.size; +} + u32 get_module_plt(struct module *mod, unsigned long loc, Elf32_Addr val) { - struct plt_entries *plt = (struct plt_entries *)mod->arch.plt->sh_addr; + struct mod_plt_sec *pltsec = !in_init(mod, loc) ? &mod->arch.core : + &mod->arch.init; + + struct plt_entries *plt = (struct plt_entries *)pltsec->plt->sh_addr; int idx = 0; /* @@ -41,9 +49,9 @@ * relocations are sorted, this will be the last entry we allocated. * (if one exists). */ - if (mod->arch.plt_count > 0) { - plt += (mod->arch.plt_count - 1) / PLT_ENT_COUNT; - idx = (mod->arch.plt_count - 1) % PLT_ENT_COUNT; + if (pltsec->plt_count > 0) { + plt += (pltsec->plt_count - 1) / PLT_ENT_COUNT; + idx = (pltsec->plt_count - 1) % PLT_ENT_COUNT; if (plt->lit[idx] == val) return (u32)&plt->ldr[idx]; @@ -53,8 +61,8 @@ plt++; } - mod->arch.plt_count++; - BUG_ON(mod->arch.plt_count * PLT_ENT_SIZE > mod->arch.plt->sh_size); + pltsec->plt_count++; + BUG_ON(pltsec->plt_count * PLT_ENT_SIZE > pltsec->plt->sh_size); if (!idx) /* Populate a new set of entries */ @@ -129,7 +137,7 @@ /* Count how many PLT entries we may need */ static unsigned int count_plts(const Elf32_Sym *syms, Elf32_Addr base, - const Elf32_Rel *rel, int num) + const Elf32_Rel *rel, int num, Elf32_Word dstidx) { unsigned int ret = 0; const Elf32_Sym *s; @@ -144,13 +152,17 @@ case R_ARM_THM_JUMP24: /* * We only have to consider branch targets that resolve - * to undefined symbols. This is not simply a heuristic, - * it is a fundamental limitation, since the PLT itself - * is part of the module, and needs to be within range - * as well, so modules can never grow beyond that limit. + * to symbols that are defined in a different section. + * This is not simply a heuristic, it is a fundamental + * limitation, since there is no guaranteed way to emit + * PLT entries sufficiently close to the branch if the + * section size exceeds the range of a branch + * instruction. So ignore relocations against defined + * symbols if they live in the same section as the + * relocation target. */ s = syms + ELF32_R_SYM(rel[i].r_info); - if (s->st_shndx != SHN_UNDEF) + if (s->st_shndx == dstidx) break; /* @@ -161,7 +173,12 @@ * So we need to support them, but there is no need to * take them into consideration when trying to optimize * this code. So let's only check for duplicates when - * the addend is zero. + * the addend is zero. (Note that calls into the core + * module via init PLT entries could involve section + * relative symbol references with non-zero addends, for + * which we may end up emitting duplicates, but the init + * PLT is released along with the rest of the .init + * region as soon as module loading completes.) */ if (!is_zero_addend_relocation(base, rel + i) || !duplicate_rel(base, rel, i)) @@ -174,7 +191,8 @@ int module_frob_arch_sections(Elf_Ehdr *ehdr, Elf_Shdr *sechdrs, char *secstrings, struct module *mod) { - unsigned long plts = 0; + unsigned long core_plts = 0; + unsigned long init_plts = 0; Elf32_Shdr *s, *sechdrs_end = sechdrs + ehdr->e_shnum; Elf32_Sym *syms = NULL; @@ -184,13 +202,15 @@ */ for (s = sechdrs; s < sechdrs_end; ++s) { if (strcmp(".plt", secstrings + s->sh_name) == 0) - mod->arch.plt = s; + mod->arch.core.plt = s; + else if (strcmp(".init.plt", secstrings + s->sh_name) == 0) + mod->arch.init.plt = s; else if (s->sh_type == SHT_SYMTAB) syms = (Elf32_Sym *)s->sh_addr; } - if (!mod->arch.plt) { - pr_err("%s: module PLT section missing\n", mod->name); + if (!mod->arch.core.plt || !mod->arch.init.plt) { + pr_err("%s: module PLT section(s) missing\n", mod->name); return -ENOEXEC; } if (!syms) { @@ -213,16 +233,29 @@ /* sort by type and symbol index */ sort(rels, numrels, sizeof(Elf32_Rel), cmp_rel, NULL); - plts += count_plts(syms, dstsec->sh_addr, rels, numrels); + if (strncmp(secstrings + dstsec->sh_name, ".init", 5) != 0) + core_plts += count_plts(syms, dstsec->sh_addr, rels, + numrels, s->sh_info); + else + init_plts += count_plts(syms, dstsec->sh_addr, rels, + numrels, s->sh_info); } - mod->arch.plt->sh_type = SHT_NOBITS; - mod->arch.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; - mod->arch.plt->sh_addralign = L1_CACHE_BYTES; - mod->arch.plt->sh_size = round_up(plts * PLT_ENT_SIZE, - sizeof(struct plt_entries)); - mod->arch.plt_count = 0; + mod->arch.core.plt->sh_type = SHT_NOBITS; + mod->arch.core.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.core.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.core.plt->sh_size = round_up(core_plts * PLT_ENT_SIZE, + sizeof(struct plt_entries)); + mod->arch.core.plt_count = 0; + + mod->arch.init.plt->sh_type = SHT_NOBITS; + mod->arch.init.plt->sh_flags = SHF_EXECINSTR | SHF_ALLOC; + mod->arch.init.plt->sh_addralign = L1_CACHE_BYTES; + mod->arch.init.plt->sh_size = round_up(init_plts * PLT_ENT_SIZE, + sizeof(struct plt_entries)); + mod->arch.init.plt_count = 0; - pr_debug("%s: plt=%x\n", __func__, mod->arch.plt->sh_size); + pr_debug("%s: plt=%x, init.plt=%x\n", __func__, + mod->arch.core.plt->sh_size, mod->arch.init.plt->sh_size); return 0; } --- linux-azure-4.11.0.orig/arch/arm/kernel/module.lds +++ linux-azure-4.11.0/arch/arm/kernel/module.lds @@ -1,3 +1,4 @@ SECTIONS { .plt : { BYTE(0) } + .init.plt : { BYTE(0) } } --- linux-azure-4.11.0.orig/arch/arm/kernel/setup.c +++ linux-azure-4.11.0/arch/arm/kernel/setup.c @@ -80,7 +80,7 @@ extern void init_default_cache_policy(unsigned long); extern void paging_init(const struct machine_desc *desc); -extern void early_paging_init(const struct machine_desc *); +extern void early_mm_init(const struct machine_desc *); extern void adjust_lowmem_bounds(void); extern enum reboot_mode reboot_mode; extern void setup_dma_zone(const struct machine_desc *desc); @@ -1088,7 +1088,7 @@ parse_early_param(); #ifdef CONFIG_MMU - early_paging_init(mdesc); + early_mm_init(mdesc); #endif setup_dma_zone(mdesc); xen_early_init(); --- linux-azure-4.11.0.orig/arch/arm/kernel/topology.c +++ linux-azure-4.11.0/arch/arm/kernel/topology.c @@ -57,7 +57,6 @@ per_cpu(cpu_scale, cpu) = capacity; } -#ifdef CONFIG_PROC_SYSCTL static ssize_t cpu_capacity_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -114,7 +113,6 @@ return 0; } subsys_initcall(register_cpu_capacity_sysctl); -#endif #ifdef CONFIG_OF struct cpu_efficiency { --- linux-azure-4.11.0.orig/arch/arm/kvm/coproc.c +++ linux-azure-4.11.0/arch/arm/kvm/coproc.c @@ -93,12 +93,6 @@ return 1; } -int kvm_handle_cp14_access(struct kvm_vcpu *vcpu, struct kvm_run *run) -{ - kvm_inject_undefined(vcpu); - return 1; -} - static void reset_mpidr(struct kvm_vcpu *vcpu, const struct coproc_reg *r) { /* @@ -514,12 +508,7 @@ return 1; } -/** - * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +static struct coproc_params decode_64bit_hsr(struct kvm_vcpu *vcpu) { struct coproc_params params; @@ -533,9 +522,38 @@ params.Rt2 = (kvm_vcpu_get_hsr(vcpu) >> 10) & 0xf; params.CRm = 0; + return params; +} + +/** + * kvm_handle_cp15_64 -- handles a mrrc/mcrr trap on a guest CP15 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp15_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_64bit_hsr(vcpu); + return emulate_cp15(vcpu, ¶ms); } +/** + * kvm_handle_cp14_64 -- handles a mrrc/mcrr trap on a guest CP14 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp14_64(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_64bit_hsr(vcpu); + + /* raz_wi cp14 */ + pm_fake(vcpu, ¶ms, NULL); + + /* handled */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 1; +} + static void reset_coproc_regs(struct kvm_vcpu *vcpu, const struct coproc_reg *table, size_t num) { @@ -546,12 +564,7 @@ table[i].reset(vcpu, &table[i]); } -/** - * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access - * @vcpu: The VCPU pointer - * @run: The kvm_run struct - */ -int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +static struct coproc_params decode_32bit_hsr(struct kvm_vcpu *vcpu) { struct coproc_params params; @@ -565,9 +578,37 @@ params.Op2 = (kvm_vcpu_get_hsr(vcpu) >> 17) & 0x7; params.Rt2 = 0; + return params; +} + +/** + * kvm_handle_cp15_32 -- handles a mrc/mcr trap on a guest CP15 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp15_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_32bit_hsr(vcpu); return emulate_cp15(vcpu, ¶ms); } +/** + * kvm_handle_cp14_32 -- handles a mrc/mcr trap on a guest CP14 access + * @vcpu: The VCPU pointer + * @run: The kvm_run struct + */ +int kvm_handle_cp14_32(struct kvm_vcpu *vcpu, struct kvm_run *run) +{ + struct coproc_params params = decode_32bit_hsr(vcpu); + + /* raz_wi cp14 */ + pm_fake(vcpu, ¶ms, NULL); + + /* handled */ + kvm_skip_instr(vcpu, kvm_vcpu_trap_il_is32bit(vcpu)); + return 1; +} + /****************************************************************************** * Userspace API *****************************************************************************/ --- linux-azure-4.11.0.orig/arch/arm/kvm/handle_exit.c +++ linux-azure-4.11.0/arch/arm/kvm/handle_exit.c @@ -95,9 +95,9 @@ [HSR_EC_WFI] = kvm_handle_wfx, [HSR_EC_CP15_32] = kvm_handle_cp15_32, [HSR_EC_CP15_64] = kvm_handle_cp15_64, - [HSR_EC_CP14_MR] = kvm_handle_cp14_access, + [HSR_EC_CP14_MR] = kvm_handle_cp14_32, [HSR_EC_CP14_LS] = kvm_handle_cp14_load_store, - [HSR_EC_CP14_64] = kvm_handle_cp14_access, + [HSR_EC_CP14_64] = kvm_handle_cp14_64, [HSR_EC_CP_0_13] = kvm_handle_cp_0_13_access, [HSR_EC_CP10_ID] = kvm_handle_cp10_id, [HSR_EC_HVC] = handle_hvc, --- linux-azure-4.11.0.orig/arch/arm/kvm/hyp/Makefile +++ linux-azure-4.11.0/arch/arm/kvm/hyp/Makefile @@ -2,6 +2,8 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # +ccflags-y += -fno-stack-protector + KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o --- linux-azure-4.11.0.orig/arch/arm/kvm/hyp/switch.c +++ linux-azure-4.11.0/arch/arm/kvm/hyp/switch.c @@ -48,7 +48,9 @@ write_sysreg(HSTR_T(15), HSTR); write_sysreg(HCPTR_TTA | HCPTR_TCP(10) | HCPTR_TCP(11), HCPTR); val = read_sysreg(HDCR); - write_sysreg(val | HDCR_TPM | HDCR_TPMCR, HDCR); + val |= HDCR_TPM | HDCR_TPMCR; /* trap performance monitors */ + val |= HDCR_TDRA | HDCR_TDOSA | HDCR_TDA; /* trap debug regs */ + write_sysreg(val, HDCR); } static void __hyp_text __deactivate_traps(struct kvm_vcpu *vcpu) --- linux-azure-4.11.0.orig/arch/arm/kvm/init.S +++ linux-azure-4.11.0/arch/arm/kvm/init.S @@ -95,7 +95,6 @@ @ - Write permission implies XN: disabled @ - Instruction cache: enabled @ - Data/Unified cache: enabled - @ - Memory alignment checks: enabled @ - MMU: enabled (this code must be run from an identity mapping) mrc p15, 4, r0, c1, c0, 0 @ HSCR ldr r2, =HSCTLR_MASK @@ -103,8 +102,8 @@ mrc p15, 0, r1, c1, c0, 0 @ SCTLR ldr r2, =(HSCTLR_EE | HSCTLR_FI | HSCTLR_I | HSCTLR_C) and r1, r1, r2 - ARM( ldr r2, =(HSCTLR_M | HSCTLR_A) ) - THUMB( ldr r2, =(HSCTLR_M | HSCTLR_A | HSCTLR_TE) ) + ARM( ldr r2, =(HSCTLR_M) ) + THUMB( ldr r2, =(HSCTLR_M | HSCTLR_TE) ) orr r1, r1, r2 orr r0, r0, r1 mcr p15, 4, r0, c1, c0, 0 @ HSCR --- linux-azure-4.11.0.orig/arch/arm/kvm/mmu.c +++ linux-azure-4.11.0/arch/arm/kvm/mmu.c @@ -295,6 +295,13 @@ assert_spin_locked(&kvm->mmu_lock); pgd = kvm->arch.pgd + stage2_pgd_index(addr); do { + /* + * Make sure the page table is still active, as another thread + * could have possibly freed the page table, while we released + * the lock. + */ + if (!READ_ONCE(kvm->arch.pgd)) + break; next = stage2_pgd_addr_end(addr, end); if (!stage2_pgd_none(*pgd)) unmap_stage2_puds(kvm, pgd, addr, next); @@ -829,22 +836,22 @@ * Walks the level-1 page table pointed to by kvm->arch.pgd and frees all * underlying level-2 and level-3 tables before freeing the actual level-1 table * and setting the struct pointer to NULL. - * - * Note we don't need locking here as this is only called when the VM is - * destroyed, which can only be done once. */ void kvm_free_stage2_pgd(struct kvm *kvm) { - if (kvm->arch.pgd == NULL) - return; + void *pgd = NULL; spin_lock(&kvm->mmu_lock); - unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + if (kvm->arch.pgd) { + unmap_stage2_range(kvm, 0, KVM_PHYS_SIZE); + pgd = READ_ONCE(kvm->arch.pgd); + kvm->arch.pgd = NULL; + } spin_unlock(&kvm->mmu_lock); /* Free the HW pgd, one page at a time */ - free_pages_exact(kvm->arch.pgd, S2_PGD_SIZE); - kvm->arch.pgd = NULL; + if (pgd) + free_pages_exact(pgd, S2_PGD_SIZE); } static pud_t *stage2_get_pud(struct kvm *kvm, struct kvm_mmu_memory_cache *cache, @@ -872,6 +879,9 @@ pmd_t *pmd; pud = stage2_get_pud(kvm, cache, addr); + if (!pud) + return NULL; + if (stage2_pud_none(*pud)) { if (!cache) return NULL; @@ -1170,11 +1180,13 @@ * large. Otherwise, we may see kernel panics with * CONFIG_DETECT_HUNG_TASK, CONFIG_LOCKUP_DETECTOR, * CONFIG_LOCKDEP. Additionally, holding the lock too long - * will also starve other vCPUs. + * will also starve other vCPUs. We have to also make sure + * that the page tables are not freed while we released + * the lock. */ - if (need_resched() || spin_needbreak(&kvm->mmu_lock)) - cond_resched_lock(&kvm->mmu_lock); - + cond_resched_lock(&kvm->mmu_lock); + if (!READ_ONCE(kvm->arch.pgd)) + break; next = stage2_pgd_addr_end(addr, end); if (stage2_pgd_present(*pgd)) stage2_wp_puds(pgd, addr, next); --- linux-azure-4.11.0.orig/arch/arm/kvm/psci.c +++ linux-azure-4.11.0/arch/arm/kvm/psci.c @@ -208,9 +208,10 @@ static int kvm_psci_0_2_call(struct kvm_vcpu *vcpu) { - int ret = 1; + struct kvm *kvm = vcpu->kvm; unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); unsigned long val; + int ret = 1; switch (psci_fn) { case PSCI_0_2_FN_PSCI_VERSION: @@ -230,7 +231,9 @@ break; case PSCI_0_2_FN_CPU_ON: case PSCI_0_2_FN64_CPU_ON: + mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); break; case PSCI_0_2_FN_AFFINITY_INFO: case PSCI_0_2_FN64_AFFINITY_INFO: @@ -279,6 +282,7 @@ static int kvm_psci_0_1_call(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long psci_fn = vcpu_get_reg(vcpu, 0) & ~((u32) 0); unsigned long val; @@ -288,7 +292,9 @@ val = PSCI_RET_SUCCESS; break; case KVM_PSCI_FN_CPU_ON: + mutex_lock(&kvm->lock); val = kvm_psci_vcpu_on(vcpu); + mutex_unlock(&kvm->lock); break; default: val = PSCI_RET_NOT_SUPPORTED; --- linux-azure-4.11.0.orig/arch/arm/mach-davinci/pm.c +++ linux-azure-4.11.0/arch/arm/mach-davinci/pm.c @@ -154,7 +154,8 @@ davinci_sram_suspend = sram_alloc(davinci_cpu_suspend_sz, NULL); if (!davinci_sram_suspend) { pr_err("PM: cannot allocate SRAM memory\n"); - return -ENOMEM; + ret = -ENOMEM; + goto no_sram_mem; } davinci_sram_push(davinci_sram_suspend, davinci_cpu_suspend, @@ -162,6 +163,10 @@ suspend_set_ops(&davinci_pm_ops); + return 0; + +no_sram_mem: + iounmap(pm_config.ddrpsc_reg_base); no_ddrpsc_mem: iounmap(pm_config.ddrpll_reg_base); no_ddrpll_mem: --- linux-azure-4.11.0.orig/arch/arm/mach-highbank/Makefile +++ linux-azure-4.11.0/arch/arm/mach-highbank/Makefile @@ -1,3 +1,5 @@ +KBUILD_CFLAGS += -I$(srctree)/arch/arm/mach-highbank/include + obj-y := highbank.o system.o smc.o plus_sec := $(call as-instr,.arch_extension sec,+sec) --- linux-azure-4.11.0.orig/arch/arm/mm/dma-mapping.c +++ linux-azure-4.11.0/arch/arm/mm/dma-mapping.c @@ -2414,6 +2414,13 @@ dma_ops = arm_get_dma_map_ops(coherent); set_dma_ops(dev, dma_ops); + +#ifdef CONFIG_XEN + if (xen_initial_domain()) { + dev->archdata.dev_dma_ops = dev->dma_ops; + dev->dma_ops = xen_dma_ops; + } +#endif } void arch_teardown_dma_ops(struct device *dev) --- linux-azure-4.11.0.orig/arch/arm/mm/mmap.c +++ linux-azure-4.11.0/arch/arm/mm/mmap.c @@ -90,7 +90,7 @@ vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -141,7 +141,7 @@ addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/arm/mm/mmu.c +++ linux-azure-4.11.0/arch/arm/mm/mmu.c @@ -414,6 +414,11 @@ FIXADDR_END); BUG_ON(idx >= __end_of_fixed_addresses); + /* we only support device mappings until pgprot_kernel has been set */ + if (WARN_ON(pgprot_val(prot) != pgprot_val(FIXMAP_PAGE_IO) && + pgprot_val(pgprot_kernel) == 0)) + return; + if (pgprot_val(prot)) set_pte_at(NULL, vaddr, pte, pfn_pte(phys >> PAGE_SHIFT, prot)); @@ -1211,15 +1216,15 @@ high_memory = __va(arm_lowmem_limit - 1) + 1; + if (!memblock_limit) + memblock_limit = arm_lowmem_limit; + /* * Round the memblock limit down to a pmd size. This * helps to ensure that we will allocate memory from the * last full pmd, which should be mapped. */ - if (memblock_limit) - memblock_limit = round_down(memblock_limit, PMD_SIZE); - if (!memblock_limit) - memblock_limit = arm_lowmem_limit; + memblock_limit = round_down(memblock_limit, PMD_SIZE); if (!IS_ENABLED(CONFIG_HIGHMEM) || cache_is_vipt_aliasing()) { if (memblock_end_of_DRAM() > arm_lowmem_limit) { @@ -1492,7 +1497,7 @@ * early_paging_init() recreates boot time page table setup, allowing machines * to switch over to a high (>4G) address space on LPAE systems */ -void __init early_paging_init(const struct machine_desc *mdesc) +static void __init early_paging_init(const struct machine_desc *mdesc) { pgtables_remap *lpae_pgtables_remap; unsigned long pa_pgd; @@ -1560,7 +1565,7 @@ #else -void __init early_paging_init(const struct machine_desc *mdesc) +static void __init early_paging_init(const struct machine_desc *mdesc) { long long offset; @@ -1616,7 +1621,6 @@ { void *zero_page; - build_mem_type_table(); prepare_page_table(); map_lowmem(); memblock_set_current_limit(arm_lowmem_limit); @@ -1636,3 +1640,9 @@ empty_zero_page = virt_to_page(zero_page); __flush_dcache_page(NULL, empty_zero_page); } + +void __init early_mm_init(const struct machine_desc *mdesc) +{ + build_mem_type_table(); + early_paging_init(mdesc); +} --- linux-azure-4.11.0.orig/arch/arm/mm/proc-v7m.S +++ linux-azure-4.11.0/arch/arm/mm/proc-v7m.S @@ -147,10 +147,10 @@ @ Configure caches (if implemented) teq r8, #0 - stmneia r12, {r0-r6, lr} @ v7m_invalidate_l1 touches r0-r6 + stmneia sp, {r0-r6, lr} @ v7m_invalidate_l1 touches r0-r6 blne v7m_invalidate_l1 teq r8, #0 @ re-evalutae condition - ldmneia r12, {r0-r6, lr} + ldmneia sp, {r0-r6, lr} @ Configure the System Control Register to ensure 8-byte stack alignment @ Note the STKALIGN bit is either RW or RAO. --- linux-azure-4.11.0.orig/arch/arm64/Kconfig +++ linux-azure-4.11.0/arch/arm64/Kconfig @@ -2,6 +2,7 @@ def_bool y select ACPI_CCA_REQUIRED if ACPI select ACPI_GENERIC_GSI if ACPI + select ACPI_GTDT if ACPI select ACPI_REDUCED_HARDWARE_ONLY if ACPI select ACPI_MCFG if ACPI select ACPI_SPCR_TABLE if ACPI @@ -244,6 +245,9 @@ config ARCH_SUPPORTS_UPROBES def_bool y +config ARCH_PROC_KCORE_TEXT + def_bool y + source "init/Kconfig" source "kernel/Kconfig.freezer" @@ -480,6 +484,17 @@ If unsure, say Y. +config CAVIUM_ERRATUM_30115 + bool "Cavium erratum 30115: Guest may disable interrupts in host" + default y + help + On ThunderX T88 pass 1.x through 2.2, T81 pass 1.0 through + 1.2, and T83 Pass 1.0, KVM guest execution may disable + interrupts in host. Trapping both GICv3 group-0 and group-1 + accesses sidesteps the issue. + + If unsure, say Y. + config QCOM_FALKOR_ERRATUM_1003 bool "Falkor E1003: Incorrect translation due to ASID change" default y @@ -736,6 +751,17 @@ but it is independent of the system firmware. And like a reboot you can start any kernel with it, not just Linux. +config CRASH_DUMP + bool "Build kdump crash kernel" + help + Generate crash dump after being started by kexec. This should + be normally only set in special crash dump kernels which are + loaded in the main kernel with kexec-tools into a specially + reserved region and then later executed after a crash by + kdump/kexec. + + For more details see Documentation/kdump/kdump.txt + config XEN_DOM0 def_bool y depends on XEN @@ -751,6 +777,7 @@ config FORCE_MAX_ZONEORDER int default "14" if (ARM64_64K_PAGES && TRANSPARENT_HUGEPAGE) + default "13" if (ARCH_THUNDER && ARM64_4K_PAGES) default "12" if (ARM64_16K_PAGES && TRANSPARENT_HUGEPAGE) default "11" help @@ -1108,6 +1135,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "drivers/firmware/Kconfig" source "drivers/acpi/Kconfig" --- linux-azure-4.11.0.orig/arch/arm64/boot/dts/hisilicon/hi6220.dtsi +++ linux-azure-4.11.0/arch/arm64/boot/dts/hisilicon/hi6220.dtsi @@ -774,6 +774,7 @@ clocks = <&sys_ctrl 2>, <&sys_ctrl 1>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC0>; + reset-names = "reset"; bus-width = <0x8>; vmmc-supply = <&ldo19>; pinctrl-names = "default"; @@ -797,6 +798,7 @@ clocks = <&sys_ctrl 4>, <&sys_ctrl 3>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC1>; + reset-names = "reset"; vqmmc-supply = <&ldo7>; vmmc-supply = <&ldo10>; bus-width = <0x4>; @@ -815,6 +817,7 @@ clocks = <&sys_ctrl HI6220_MMC2_CIUCLK>, <&sys_ctrl HI6220_MMC2_CLK>; clock-names = "ciu", "biu"; resets = <&sys_ctrl PERIPH_RSTDIS0_MMC2>; + reset-names = "reset"; bus-width = <0x4>; broken-cd; pinctrl-names = "default", "idle"; --- linux-azure-4.11.0.orig/arch/arm64/boot/dts/hisilicon/hip06-d03.dts +++ linux-azure-4.11.0/arch/arm64/boot/dts/hisilicon/hip06-d03.dts @@ -52,3 +52,7 @@ &usb_ehci { status = "ok"; }; + +&ipmi0 { + status = "ok"; +}; --- linux-azure-4.11.0.orig/arch/arm64/boot/dts/hisilicon/hip06.dtsi +++ linux-azure-4.11.0/arch/arm64/boot/dts/hisilicon/hip06.dtsi @@ -318,6 +318,20 @@ #size-cells = <2>; ranges; + isa@a01b0000 { + compatible = "hisilicon,hip06-lpc"; + #size-cells = <1>; + #address-cells = <2>; + reg = <0x0 0xa01b0000 0x0 0x1000>; + + ipmi0: bt@e4 { + compatible = "ipmi-bt"; + device_type = "ipmi"; + reg = <0x01 0xe4 0x04>; + status = "disabled"; + }; + }; + refclk: refclk { compatible = "fixed-clock"; clock-frequency = <50000000>; --- linux-azure-4.11.0.orig/arch/arm64/boot/dts/hisilicon/hip07-d05.dts +++ linux-azure-4.11.0/arch/arm64/boot/dts/hisilicon/hip07-d05.dts @@ -64,3 +64,7 @@ &usb_ehci { status = "ok"; }; + +&ipmi0 { + status = "ok"; +}; --- linux-azure-4.11.0.orig/arch/arm64/boot/dts/hisilicon/hip07.dtsi +++ linux-azure-4.11.0/arch/arm64/boot/dts/hisilicon/hip07.dtsi @@ -1028,6 +1028,20 @@ #size-cells = <2>; ranges; + isa@a01b0000 { + compatible = "hisilicon,hip07-lpc"; + #size-cells = <1>; + #address-cells = <2>; + reg = <0x0 0xa01b0000 0x0 0x1000>; + + ipmi0: bt@e4 { + compatible = "ipmi-bt"; + device_type = "ipmi"; + reg = <0x01 0xe4 0x04>; + status = "disabled"; + }; + }; + uart0: uart@602b0000 { compatible = "arm,sbsa-uart"; reg = <0x0 0x602b0000 0x0 0x1000>; --- linux-azure-4.11.0.orig/arch/arm64/boot/dts/marvell/armada-37xx.dtsi +++ linux-azure-4.11.0/arch/arm64/boot/dts/marvell/armada-37xx.dtsi @@ -75,14 +75,10 @@ timer { compatible = "arm,armv8-timer"; - interrupts = , - , - , - ; + interrupts = , + , + , + ; }; soc { --- linux-azure-4.11.0.orig/arch/arm64/include/asm/acpi.h +++ linux-azure-4.11.0/arch/arm64/include/asm/acpi.h @@ -23,9 +23,9 @@ #define ACPI_MADT_GICC_LENGTH \ (acpi_gbl_FADT.header.revision < 6 ? 76 : 80) -#define BAD_MADT_GICC_ENTRY(entry, end) \ - (!(entry) || (unsigned long)(entry) + sizeof(*(entry)) > (end) || \ - (entry)->header.length != ACPI_MADT_GICC_LENGTH) +#define BAD_MADT_GICC_ENTRY(entry, end) \ + (!(entry) || (entry)->header.length != ACPI_MADT_GICC_LENGTH || \ + (unsigned long)(entry) + ACPI_MADT_GICC_LENGTH > (end)) /* Basic configuration for ACPI */ #ifdef CONFIG_ACPI @@ -85,6 +85,8 @@ return true; } +struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu); + static inline void arch_fix_phys_package_id(int num, u32 slot) { } void __init acpi_init_cpus(void); --- linux-azure-4.11.0.orig/arch/arm64/include/asm/arch_gicv3.h +++ linux-azure-4.11.0/arch/arm64/include/asm/arch_gicv3.h @@ -20,16 +20,24 @@ #include +#define ICC_IAR0_EL1 sys_reg(3, 0, 12, 8, 0) +#define ICC_EOIR0_EL1 sys_reg(3, 0, 12, 8, 1) +#define ICC_HPPIR0_EL1 sys_reg(3, 0, 12, 8, 2) +#define ICC_BPR0_EL1 sys_reg(3, 0, 12, 8, 3) +#define ICC_AP0Rn_EL1(n) sys_reg(3, 0, 12, 8, 4 | n) +#define ICC_AP1Rn_EL1(n) sys_reg(3, 0, 12, 9, n) #define ICC_EOIR1_EL1 sys_reg(3, 0, 12, 12, 1) #define ICC_DIR_EL1 sys_reg(3, 0, 12, 11, 1) +#define ICC_RPR_EL1 sys_reg(3, 0, 12, 11, 3) #define ICC_IAR1_EL1 sys_reg(3, 0, 12, 12, 0) #define ICC_SGI1R_EL1 sys_reg(3, 0, 12, 11, 5) #define ICC_PMR_EL1 sys_reg(3, 0, 4, 6, 0) #define ICC_CTLR_EL1 sys_reg(3, 0, 12, 12, 4) #define ICC_SRE_EL1 sys_reg(3, 0, 12, 12, 5) +#define ICC_GRPEN0_EL1 sys_reg(3, 0, 12, 12, 6) #define ICC_GRPEN1_EL1 sys_reg(3, 0, 12, 12, 7) #define ICC_BPR1_EL1 sys_reg(3, 0, 12, 12, 3) - +#define ICC_HPPIR1_EL1 sys_reg(3, 0, 12, 12, 2) #define ICC_SRE_EL2 sys_reg(3, 4, 12, 9, 5) /* --- linux-azure-4.11.0.orig/arch/arm64/include/asm/arch_timer.h +++ linux-azure-4.11.0/arch/arm64/include/asm/arch_timer.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -37,24 +38,45 @@ #define needs_unstable_timer_counter_workaround() false #endif +enum arch_timer_erratum_match_type { + ate_match_dt, + ate_match_global_cap_id, + ate_match_local_cap_id, + ate_match_acpi_oem_info, +}; + +struct clock_event_device; struct arch_timer_erratum_workaround { - const char *id; /* Indicate the Erratum ID */ + enum arch_timer_erratum_match_type match_type; + const void *id; + const char *desc; u32 (*read_cntp_tval_el0)(void); u32 (*read_cntv_tval_el0)(void); u64 (*read_cntvct_el0)(void); + int (*set_next_event_phys)(unsigned long, struct clock_event_device *); + int (*set_next_event_virt)(unsigned long, struct clock_event_device *); }; -extern const struct arch_timer_erratum_workaround *timer_unstable_counter_workaround; +DECLARE_PER_CPU(const struct arch_timer_erratum_workaround *, + timer_unstable_counter_workaround); -#define arch_timer_reg_read_stable(reg) \ -({ \ - u64 _val; \ - if (needs_unstable_timer_counter_workaround()) \ - _val = timer_unstable_counter_workaround->read_##reg();\ - else \ - _val = read_sysreg(reg); \ - _val; \ +#define arch_timer_reg_read_stable(reg) \ +({ \ + u64 _val; \ + if (needs_unstable_timer_counter_workaround()) { \ + const struct arch_timer_erratum_workaround *wa; \ + preempt_disable(); \ + wa = __this_cpu_read(timer_unstable_counter_workaround); \ + if (wa && wa->read_##reg) \ + _val = wa->read_##reg(); \ + else \ + _val = read_sysreg(reg); \ + preempt_enable(); \ + } else { \ + _val = read_sysreg(reg); \ + } \ + _val; \ }) /* --- linux-azure-4.11.0.orig/arch/arm64/include/asm/asm-uaccess.h +++ linux-azure-4.11.0/arch/arm64/include/asm/asm-uaccess.h @@ -62,4 +62,13 @@ alternative_else_nop_endif .endm +/* + * Remove the address tag from a virtual address, if present. + */ + .macro clear_address_tag, dst, addr + tst \addr, #(1 << 55) + bic \dst, \addr, #(0xff << 56) + csel \dst, \dst, \addr, eq + .endm + #endif --- linux-azure-4.11.0.orig/arch/arm64/include/asm/barrier.h +++ linux-azure-4.11.0/arch/arm64/include/asm/barrier.h @@ -42,25 +42,35 @@ #define __smp_rmb() dmb(ishld) #define __smp_wmb() dmb(ishst) -#define __smp_store_release(p, v) \ +#define __smp_store_release(p, v) \ do { \ + union { typeof(*p) __val; char __c[1]; } __u = \ + { .__val = (__force typeof(*p)) (v) }; \ compiletime_assert_atomic_type(*p); \ switch (sizeof(*p)) { \ case 1: \ asm volatile ("stlrb %w1, %0" \ - : "=Q" (*p) : "r" (v) : "memory"); \ + : "=Q" (*p) \ + : "r" (*(__u8 *)__u.__c) \ + : "memory"); \ break; \ case 2: \ asm volatile ("stlrh %w1, %0" \ - : "=Q" (*p) : "r" (v) : "memory"); \ + : "=Q" (*p) \ + : "r" (*(__u16 *)__u.__c) \ + : "memory"); \ break; \ case 4: \ asm volatile ("stlr %w1, %0" \ - : "=Q" (*p) : "r" (v) : "memory"); \ + : "=Q" (*p) \ + : "r" (*(__u32 *)__u.__c) \ + : "memory"); \ break; \ case 8: \ asm volatile ("stlr %1, %0" \ - : "=Q" (*p) : "r" (v) : "memory"); \ + : "=Q" (*p) \ + : "r" (*(__u64 *)__u.__c) \ + : "memory"); \ break; \ } \ } while (0) --- linux-azure-4.11.0.orig/arch/arm64/include/asm/cacheflush.h +++ linux-azure-4.11.0/arch/arm64/include/asm/cacheflush.h @@ -154,5 +154,6 @@ int set_memory_rw(unsigned long addr, int numpages); int set_memory_x(unsigned long addr, int numpages); int set_memory_nx(unsigned long addr, int numpages); +int set_memory_valid(unsigned long addr, unsigned long size, int enable); #endif --- linux-azure-4.11.0.orig/arch/arm64/include/asm/cmpxchg.h +++ linux-azure-4.11.0/arch/arm64/include/asm/cmpxchg.h @@ -46,7 +46,7 @@ " swp" #acq_lse #rel #sz "\t%" #w "3, %" #w "0, %2\n" \ __nops(3) \ " " #nop_lse) \ - : "=&r" (ret), "=&r" (tmp), "+Q" (*(u8 *)ptr) \ + : "=&r" (ret), "=&r" (tmp), "+Q" (*(unsigned long *)ptr) \ : "r" (x) \ : cl); \ \ --- linux-azure-4.11.0.orig/arch/arm64/include/asm/cpucaps.h +++ linux-azure-4.11.0/arch/arm64/include/asm/cpucaps.h @@ -37,7 +37,9 @@ #define ARM64_HAS_NO_FPSIMD 16 #define ARM64_WORKAROUND_REPEAT_TLBI 17 #define ARM64_WORKAROUND_QCOM_FALKOR_E1003 18 +#define ARM64_WORKAROUND_858921 19 +#define ARM64_WORKAROUND_CAVIUM_30115 20 -#define ARM64_NCAPS 19 +#define ARM64_NCAPS 21 #endif /* __ASM_CPUCAPS_H */ --- linux-azure-4.11.0.orig/arch/arm64/include/asm/cputype.h +++ linux-azure-4.11.0/arch/arm64/include/asm/cputype.h @@ -80,11 +80,13 @@ #define ARM_CPU_PART_FOUNDATION 0xD00 #define ARM_CPU_PART_CORTEX_A57 0xD07 #define ARM_CPU_PART_CORTEX_A53 0xD03 +#define ARM_CPU_PART_CORTEX_A73 0xD09 #define APM_CPU_PART_POTENZA 0x000 #define CAVIUM_CPU_PART_THUNDERX 0x0A1 #define CAVIUM_CPU_PART_THUNDERX_81XX 0x0A2 +#define CAVIUM_CPU_PART_THUNDERX_83XX 0x0A3 #define BRCM_CPU_PART_VULCAN 0x516 @@ -92,8 +94,10 @@ #define MIDR_CORTEX_A53 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A53) #define MIDR_CORTEX_A57 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A57) +#define MIDR_CORTEX_A73 MIDR_CPU_MODEL(ARM_CPU_IMP_ARM, ARM_CPU_PART_CORTEX_A73) #define MIDR_THUNDERX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX) #define MIDR_THUNDERX_81XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_81XX) +#define MIDR_THUNDERX_83XX MIDR_CPU_MODEL(ARM_CPU_IMP_CAVIUM, CAVIUM_CPU_PART_THUNDERX_83XX) #define MIDR_QCOM_FALKOR_V1 MIDR_CPU_MODEL(ARM_CPU_IMP_QCOM, QCOM_CPU_PART_FALKOR_V1) #ifndef __ASSEMBLY__ --- linux-azure-4.11.0.orig/arch/arm64/include/asm/device.h +++ linux-azure-4.11.0/arch/arm64/include/asm/device.h @@ -20,6 +20,9 @@ #ifdef CONFIG_IOMMU_API void *iommu; /* private IOMMU data */ #endif +#ifdef CONFIG_XEN + const struct dma_map_ops *dev_dma_ops; +#endif bool dma_coherent; }; --- linux-azure-4.11.0.orig/arch/arm64/include/asm/dma-mapping.h +++ linux-azure-4.11.0/arch/arm64/include/asm/dma-mapping.h @@ -27,11 +27,8 @@ #define DMA_ERROR_CODE (~(dma_addr_t)0) extern const struct dma_map_ops dummy_dma_ops; -static inline const struct dma_map_ops *__generic_dma_ops(struct device *dev) +static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) { - if (dev && dev->dma_ops) - return dev->dma_ops; - /* * We expect no ISA devices, and all other DMA masters are expected to * have someone call arch_setup_dma_ops at device creation time. @@ -39,14 +36,6 @@ return &dummy_dma_ops; } -static inline const struct dma_map_ops *get_arch_dma_ops(struct bus_type *bus) -{ - if (xen_initial_domain()) - return xen_dma_ops; - else - return __generic_dma_ops(NULL); -} - void arch_setup_dma_ops(struct device *dev, u64 dma_base, u64 size, const struct iommu_ops *iommu, bool coherent); #define arch_setup_dma_ops arch_setup_dma_ops --- linux-azure-4.11.0.orig/arch/arm64/include/asm/efi.h +++ linux-azure-4.11.0/arch/arm64/include/asm/efi.h @@ -1,6 +1,7 @@ #ifndef _ASM_EFI_H #define _ASM_EFI_H +#include #include #include #include @@ -46,7 +47,23 @@ * 2MiB so we know it won't cross a 2MiB boundary. */ #define EFI_FDT_ALIGN SZ_2M /* used by allocate_new_fdt_and_exit_boot() */ -#define MAX_FDT_OFFSET SZ_512M + +/* on arm64, the FDT may be located anywhere in system RAM */ +static inline unsigned long efi_get_max_fdt_addr(unsigned long dram_base) +{ + return ULONG_MAX; +} + +/* + * On arm64, we have to ensure that the initrd ends up in the linear region, + * which is a 1 GB aligned region of size '1UL << (VA_BITS - 1)' that is + * guaranteed to cover the kernel Image. + */ +static inline unsigned long efi_get_max_initrd_addr(unsigned long dram_base, + unsigned long image_addr) +{ + return (image_addr & ~(SZ_1G - 1UL)) + (1UL << (VA_BITS - 1)); +} #define efi_call_early(f, ...) sys_table_arg->boottime->f(__VA_ARGS__) #define __efi_call_early(f, ...) f(__VA_ARGS__) --- linux-azure-4.11.0.orig/arch/arm64/include/asm/elf.h +++ linux-azure-4.11.0/arch/arm64/include/asm/elf.h @@ -113,12 +113,11 @@ #define ELF_EXEC_PAGESIZE PAGE_SIZE /* - * This is the location that an ET_DYN program is loaded if exec'ed. Typical - * use of this is to invoke "./ld.so someprog" to test out a new version of - * the loader. We need to make sure that it is out of the way of the program - * that it will "exec", and that there is sufficient room for the brk. + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. */ -#define ELF_ET_DYN_BASE (2 * TASK_SIZE_64 / 3) +#define ELF_ET_DYN_BASE 0x100000000UL #ifndef __ASSEMBLY__ @@ -142,6 +141,7 @@ ({ \ clear_bit(TIF_32BIT, ¤t->mm->context.flags); \ clear_thread_flag(TIF_32BIT); \ + current->personality &= ~READ_IMPLIES_EXEC; \ }) /* update AT_VECTOR_SIZE_ARCH if the number of NEW_AUX_ENT entries changes */ @@ -173,7 +173,8 @@ #ifdef CONFIG_COMPAT -#define COMPAT_ELF_ET_DYN_BASE (2 * TASK_SIZE_32 / 3) +/* PIE load location for compat arm. Must match ARM ELF_ET_DYN_BASE. */ +#define COMPAT_ELF_ET_DYN_BASE 0x000400000UL /* AArch32 registers. */ #define COMPAT_ELF_NGREG 18 @@ -187,6 +188,11 @@ ((x)->e_flags & EF_ARM_EABI_MASK)) #define compat_start_thread compat_start_thread +/* + * Unlike the native SET_PERSONALITY macro, the compat version inherits + * READ_IMPLIES_EXEC across a fork() since this is the behaviour on + * arch/arm/. + */ #define COMPAT_SET_PERSONALITY(ex) \ ({ \ set_bit(TIF_32BIT, ¤t->mm->context.flags); \ --- linux-azure-4.11.0.orig/arch/arm64/include/asm/esr.h +++ linux-azure-4.11.0/arch/arm64/include/asm/esr.h @@ -19,6 +19,7 @@ #define __ASM_ESR_H #include +#include #define ESR_ELx_EC_UNKNOWN (0x00) #define ESR_ELx_EC_WFx (0x01) @@ -175,6 +176,35 @@ #define ESR_ELx_SYS64_ISS_SYS_CTR_READ (ESR_ELx_SYS64_ISS_SYS_CTR | \ ESR_ELx_SYS64_ISS_DIR_READ) +#define ESR_ELx_SYS64_ISS_SYS_CNTVCT (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 2, 14, 0) | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define ESR_ELx_SYS64_ISS_SYS_CNTFRQ (ESR_ELx_SYS64_ISS_SYS_VAL(3, 3, 0, 14, 0) | \ + ESR_ELx_SYS64_ISS_DIR_READ) + +#define esr_sys64_to_sysreg(e) \ + sys_reg((((e) & ESR_ELx_SYS64_ISS_OP0_MASK) >> \ + ESR_ELx_SYS64_ISS_OP0_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + +#define esr_cp15_to_sysreg(e) \ + sys_reg(3, \ + (((e) & ESR_ELx_SYS64_ISS_OP1_MASK) >> \ + ESR_ELx_SYS64_ISS_OP1_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRN_MASK) >> \ + ESR_ELx_SYS64_ISS_CRN_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_CRM_MASK) >> \ + ESR_ELx_SYS64_ISS_CRM_SHIFT), \ + (((e) & ESR_ELx_SYS64_ISS_OP2_MASK) >> \ + ESR_ELx_SYS64_ISS_OP2_SHIFT)) + #ifndef __ASSEMBLY__ #include --- linux-azure-4.11.0.orig/arch/arm64/include/asm/hardirq.h +++ linux-azure-4.11.0/arch/arm64/include/asm/hardirq.h @@ -20,7 +20,7 @@ #include #include -#define NR_IPI 6 +#define NR_IPI 7 typedef struct { unsigned int __softirq_pending; --- linux-azure-4.11.0.orig/arch/arm64/include/asm/kexec.h +++ linux-azure-4.11.0/arch/arm64/include/asm/kexec.h @@ -40,9 +40,59 @@ static inline void crash_setup_regs(struct pt_regs *newregs, struct pt_regs *oldregs) { - /* Empty routine needed to avoid build errors. */ + if (oldregs) { + memcpy(newregs, oldregs, sizeof(*newregs)); + } else { + u64 tmp1, tmp2; + + __asm__ __volatile__ ( + "stp x0, x1, [%2, #16 * 0]\n" + "stp x2, x3, [%2, #16 * 1]\n" + "stp x4, x5, [%2, #16 * 2]\n" + "stp x6, x7, [%2, #16 * 3]\n" + "stp x8, x9, [%2, #16 * 4]\n" + "stp x10, x11, [%2, #16 * 5]\n" + "stp x12, x13, [%2, #16 * 6]\n" + "stp x14, x15, [%2, #16 * 7]\n" + "stp x16, x17, [%2, #16 * 8]\n" + "stp x18, x19, [%2, #16 * 9]\n" + "stp x20, x21, [%2, #16 * 10]\n" + "stp x22, x23, [%2, #16 * 11]\n" + "stp x24, x25, [%2, #16 * 12]\n" + "stp x26, x27, [%2, #16 * 13]\n" + "stp x28, x29, [%2, #16 * 14]\n" + "mov %0, sp\n" + "stp x30, %0, [%2, #16 * 15]\n" + + "/* faked current PSTATE */\n" + "mrs %0, CurrentEL\n" + "mrs %1, SPSEL\n" + "orr %0, %0, %1\n" + "mrs %1, DAIF\n" + "orr %0, %0, %1\n" + "mrs %1, NZCV\n" + "orr %0, %0, %1\n" + /* pc */ + "adr %1, 1f\n" + "1:\n" + "stp %1, %0, [%2, #16 * 16]\n" + : "=&r" (tmp1), "=&r" (tmp2) + : "r" (newregs) + : "memory" + ); + } } +#if defined(CONFIG_KEXEC_CORE) && defined(CONFIG_HIBERNATION) +extern bool crash_is_nosave(unsigned long pfn); +extern void crash_prepare_suspend(void); +extern void crash_post_resume(void); +#else +static inline bool crash_is_nosave(unsigned long pfn) {return false; } +static inline void crash_prepare_suspend(void) {} +static inline void crash_post_resume(void) {} +#endif + #endif /* __ASSEMBLY__ */ #endif --- linux-azure-4.11.0.orig/arch/arm64/include/asm/kvm_emulate.h +++ linux-azure-4.11.0/arch/arm64/include/asm/kvm_emulate.h @@ -240,6 +240,12 @@ return kvm_vcpu_get_hsr(vcpu) & ESR_ELx_FSC_TYPE; } +static inline int kvm_vcpu_sys_get_rt(struct kvm_vcpu *vcpu) +{ + u32 esr = kvm_vcpu_get_hsr(vcpu); + return (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; +} + static inline unsigned long kvm_vcpu_get_mpidr_aff(struct kvm_vcpu *vcpu) { return vcpu_sys_reg(vcpu, MPIDR_EL1) & MPIDR_HWID_BITMASK; --- linux-azure-4.11.0.orig/arch/arm64/include/asm/kvm_hyp.h +++ linux-azure-4.11.0/arch/arm64/include/asm/kvm_hyp.h @@ -127,6 +127,7 @@ void __vgic_v3_save_state(struct kvm_vcpu *vcpu); void __vgic_v3_restore_state(struct kvm_vcpu *vcpu); +int __vgic_v3_perform_cpuif_access(struct kvm_vcpu *vcpu); void __timer_save_state(struct kvm_vcpu *vcpu); void __timer_restore_state(struct kvm_vcpu *vcpu); --- linux-azure-4.11.0.orig/arch/arm64/include/asm/smp.h +++ linux-azure-4.11.0/arch/arm64/include/asm/smp.h @@ -148,6 +148,9 @@ */ bool cpus_are_stuck_in_kernel(void); +extern void smp_send_crash_stop(void); +extern bool smp_crash_stop_failed(void); + #endif /* ifndef __ASSEMBLY__ */ #endif /* ifndef __ASM_SMP_H */ --- linux-azure-4.11.0.orig/arch/arm64/include/asm/sysreg.h +++ linux-azure-4.11.0/arch/arm64/include/asm/sysreg.h @@ -138,6 +138,10 @@ #define SCTLR_ELx_A (1 << 1) #define SCTLR_ELx_M 1 +#define SCTLR_EL2_RES1 ((1 << 4) | (1 << 5) | (1 << 11) | (1 << 16) | \ + (1 << 16) | (1 << 18) | (1 << 22) | (1 << 23) | \ + (1 << 28) | (1 << 29)) + #define SCTLR_ELx_FLAGS (SCTLR_ELx_M | SCTLR_ELx_A | SCTLR_ELx_C | \ SCTLR_ELx_SA | SCTLR_ELx_I) --- linux-azure-4.11.0.orig/arch/arm64/include/asm/uaccess.h +++ linux-azure-4.11.0/arch/arm64/include/asm/uaccess.h @@ -95,20 +95,21 @@ */ #define __range_ok(addr, size) \ ({ \ + unsigned long __addr = (unsigned long __force)(addr); \ unsigned long flag, roksum; \ __chk_user_ptr(addr); \ asm("adds %1, %1, %3; ccmp %1, %4, #2, cc; cset %0, ls" \ : "=&r" (flag), "=&r" (roksum) \ - : "1" (addr), "Ir" (size), \ + : "1" (__addr), "Ir" (size), \ "r" (current_thread_info()->addr_limit) \ : "cc"); \ flag; \ }) /* - * When dealing with data aborts or instruction traps we may end up with - * a tagged userland pointer. Clear the tag to get a sane pointer to pass - * on to access_ok(), for instance. + * When dealing with data aborts, watchpoints, or instruction traps we may end + * up with a tagged userland pointer. Clear the tag to get a sane pointer to + * pass on to access_ok(), for instance. */ #define untagged_addr(addr) sign_extend64(addr, 55) --- linux-azure-4.11.0.orig/arch/arm64/kernel/Makefile +++ linux-azure-4.11.0/arch/arm64/kernel/Makefile @@ -50,6 +50,7 @@ arm64-obj-$(CONFIG_HIBERNATION) += hibernate.o hibernate-asm.o arm64-obj-$(CONFIG_KEXEC) += machine_kexec.o relocate_kernel.o \ cpu-reset.o +arm64-obj-$(CONFIG_CRASH_DUMP) += crash_dump.o obj-y += $(arm64-obj-y) vdso/ probes/ obj-m += $(arm64-obj-m) --- linux-azure-4.11.0.orig/arch/arm64/kernel/armv8_deprecated.c +++ linux-azure-4.11.0/arch/arm64/kernel/armv8_deprecated.c @@ -306,7 +306,8 @@ _ASM_EXTABLE(0b, 4b) \ _ASM_EXTABLE(1b, 4b) \ : "=&r" (res), "+r" (data), "=&r" (temp), "=&r" (temp2) \ - : "r" (addr), "i" (-EAGAIN), "i" (-EFAULT), \ + : "r" ((unsigned long)addr), "i" (-EAGAIN), \ + "i" (-EFAULT), \ "i" (__SWP_LL_SC_LOOPS) \ : "memory"); \ uaccess_disable(); \ --- linux-azure-4.11.0.orig/arch/arm64/kernel/cpu_errata.c +++ linux-azure-4.11.0/arch/arm64/kernel/cpu_errata.c @@ -53,6 +53,13 @@ .midr_range_min = min, \ .midr_range_max = max +#define MIDR_ALL_VERSIONS(model) \ + .def_scope = SCOPE_LOCAL_CPU, \ + .matches = is_affected_midr_range, \ + .midr_model = model, \ + .midr_range_min = 0, \ + .midr_range_max = (MIDR_VARIANT_MASK | MIDR_REVISION_MASK) + const struct arm64_cpu_capabilities arm64_errata[] = { #if defined(CONFIG_ARM64_ERRATUM_826319) || \ defined(CONFIG_ARM64_ERRATUM_827319) || \ @@ -126,6 +133,27 @@ MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x00), }, #endif +#ifdef CONFIG_CAVIUM_ERRATUM_30115 + { + /* Cavium ThunderX, T88 pass 1.x - 2.2 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX, 0x00, + (1 << MIDR_VARIANT_SHIFT) | 2), + }, + { + /* Cavium ThunderX, T81 pass 1.0 - 1.2 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX_81XX, 0x00, 0x02), + }, + { + /* Cavium ThunderX, T83 pass 1.0 */ + .desc = "Cavium erratum 30115", + .capability = ARM64_WORKAROUND_CAVIUM_30115, + MIDR_RANGE(MIDR_THUNDERX_83XX, 0x00, 0x00), + }, +#endif { .desc = "Mismatched cache line size", .capability = ARM64_MISMATCHED_CACHE_LINE_SIZE, @@ -151,6 +179,14 @@ MIDR_CPU_VAR_REV(0, 0)), }, #endif +#ifdef CONFIG_ARM64_ERRATUM_858921 + { + /* Cortex-A73 all versions */ + .desc = "ARM erratum 858921", + .capability = ARM64_WORKAROUND_858921, + MIDR_ALL_VERSIONS(MIDR_CORTEX_A73), + }, +#endif { } }; --- linux-azure-4.11.0.orig/arch/arm64/kernel/cpufeature.c +++ linux-azure-4.11.0/arch/arm64/kernel/cpufeature.c @@ -1090,20 +1090,29 @@ * Check if the current CPU has a given feature capability. * Should be called from non-preemptible context. */ -bool this_cpu_has_cap(unsigned int cap) +static bool __this_cpu_has_cap(const struct arm64_cpu_capabilities *cap_array, + unsigned int cap) { const struct arm64_cpu_capabilities *caps; if (WARN_ON(preemptible())) return false; - for (caps = arm64_features; caps->desc; caps++) + for (caps = cap_array; caps->desc; caps++) if (caps->capability == cap && caps->matches) return caps->matches(caps, SCOPE_LOCAL_CPU); return false; } +extern const struct arm64_cpu_capabilities arm64_errata[]; + +bool this_cpu_has_cap(unsigned int cap) +{ + return (__this_cpu_has_cap(arm64_features, cap) || + __this_cpu_has_cap(arm64_errata, cap)); +} + void __init setup_cpu_features(void) { u32 cwg; --- linux-azure-4.11.0.orig/arch/arm64/kernel/crash_dump.c +++ linux-azure-4.11.0/arch/arm64/kernel/crash_dump.c @@ -0,0 +1,71 @@ +/* + * Routines for doing kexec-based kdump + * + * Copyright (C) 2017 Linaro Limited + * Author: AKASHI Takahiro + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + */ + +#include +#include +#include +#include +#include +#include + +/** + * copy_oldmem_page() - copy one page from old kernel memory + * @pfn: page frame number to be copied + * @buf: buffer where the copied page is placed + * @csize: number of bytes to copy + * @offset: offset in bytes into the page + * @userbuf: if set, @buf is in a user address space + * + * This function copies one page from old kernel memory into buffer pointed by + * @buf. If @buf is in userspace, set @userbuf to %1. Returns number of bytes + * copied or negative error in case of failure. + */ +ssize_t copy_oldmem_page(unsigned long pfn, char *buf, + size_t csize, unsigned long offset, + int userbuf) +{ + void *vaddr; + + if (!csize) + return 0; + + vaddr = memremap(__pfn_to_phys(pfn), PAGE_SIZE, MEMREMAP_WB); + if (!vaddr) + return -ENOMEM; + + if (userbuf) { + if (copy_to_user((char __user *)buf, vaddr + offset, csize)) { + memunmap(vaddr); + return -EFAULT; + } + } else { + memcpy(buf, vaddr + offset, csize); + } + + memunmap(vaddr); + + return csize; +} + +/** + * elfcorehdr_read - read from ELF core header + * @buf: buffer where the data is placed + * @csize: number of bytes to read + * @ppos: address in the memory + * + * This function reads @count bytes from elf core header which exists + * on crash dump kernel's memory. + */ +ssize_t elfcorehdr_read(char *buf, size_t count, u64 *ppos) +{ + memcpy(buf, phys_to_virt((phys_addr_t)*ppos), count); + return count; +} --- linux-azure-4.11.0.orig/arch/arm64/kernel/entry.S +++ linux-azure-4.11.0/arch/arm64/kernel/entry.S @@ -428,12 +428,13 @@ /* * Data abort handling */ - mrs x0, far_el1 + mrs x3, far_el1 enable_dbg // re-enable interrupts if they were enabled in the aborted context tbnz x23, #7, 1f // PSR_I_BIT enable_irq 1: + clear_address_tag x0, x3 mov x2, sp // struct pt_regs bl do_mem_abort @@ -594,7 +595,7 @@ // enable interrupts before calling the main handler enable_dbg_and_irq ct_user_exit - bic x0, x26, #(0xff << 56) + clear_address_tag x0, x26 mov x1, x25 mov x2, sp bl do_mem_abort --- linux-azure-4.11.0.orig/arch/arm64/kernel/hibernate.c +++ linux-azure-4.11.0/arch/arm64/kernel/hibernate.c @@ -28,6 +28,7 @@ #include #include #include +#include #include #include #include @@ -102,7 +103,8 @@ unsigned long nosave_begin_pfn = sym_to_pfn(&__nosave_begin); unsigned long nosave_end_pfn = sym_to_pfn(&__nosave_end - 1); - return (pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn); + return ((pfn >= nosave_begin_pfn) && (pfn <= nosave_end_pfn)) || + crash_is_nosave(pfn); } void notrace save_processor_state(void) @@ -286,6 +288,9 @@ local_dbg_save(flags); if (__cpu_suspend_enter(&state)) { + /* make the crash dump kernel image visible/saveable */ + crash_prepare_suspend(); + sleep_cpu = smp_processor_id(); ret = swsusp_save(); } else { @@ -297,6 +302,9 @@ if (el2_reset_needed()) dcache_clean_range(__hyp_idmap_text_start, __hyp_idmap_text_end); + /* make the crash dump kernel image protected again */ + crash_post_resume(); + /* * Tell the hibernation core that we've just restored * the memory --- linux-azure-4.11.0.orig/arch/arm64/kernel/hw_breakpoint.c +++ linux-azure-4.11.0/arch/arm64/kernel/hw_breakpoint.c @@ -36,6 +36,7 @@ #include #include #include +#include /* Breakpoint currently in use for each BRP. */ static DEFINE_PER_CPU(struct perf_event *, bp_on_reg[ARM_MAX_BRP]); @@ -721,6 +722,8 @@ u64 wp_low, wp_high; u32 lens, lene; + addr = untagged_addr(addr); + lens = __ffs(ctrl->len); lene = __fls(ctrl->len); --- linux-azure-4.11.0.orig/arch/arm64/kernel/machine_kexec.c +++ linux-azure-4.11.0/arch/arm64/kernel/machine_kexec.c @@ -9,12 +9,19 @@ * published by the Free Software Foundation. */ +#include +#include +#include #include +#include #include #include #include +#include +#include #include +#include #include "cpu-reset.h" @@ -22,8 +29,6 @@ extern const unsigned char arm64_relocate_new_kernel[]; extern const unsigned long arm64_relocate_new_kernel_size; -static unsigned long kimage_start; - /** * kexec_image_info - For debugging output. */ @@ -64,8 +69,6 @@ */ int machine_kexec_prepare(struct kimage *kimage) { - kimage_start = kimage->start; - kexec_image_info(kimage); if (kimage->type != KEXEC_TYPE_CRASH && cpus_are_stuck_in_kernel()) { @@ -144,11 +147,15 @@ { phys_addr_t reboot_code_buffer_phys; void *reboot_code_buffer; + bool in_kexec_crash = (kimage == kexec_crash_image); + bool stuck_cpus = cpus_are_stuck_in_kernel(); /* * New cpus may have become stuck_in_kernel after we loaded the image. */ - BUG_ON(cpus_are_stuck_in_kernel() || (num_online_cpus() > 1)); + BUG_ON(!in_kexec_crash && (stuck_cpus || (num_online_cpus() > 1))); + WARN(in_kexec_crash && (stuck_cpus || smp_crash_stop_failed()), + "Some CPUs may be stale, kdump will be unreliable.\n"); reboot_code_buffer_phys = page_to_phys(kimage->control_code_page); reboot_code_buffer = phys_to_virt(reboot_code_buffer_phys); @@ -183,7 +190,7 @@ kexec_list_flush(kimage); /* Flush the new image if already in place. */ - if (kimage->head & IND_DONE) + if ((kimage != kexec_crash_image) && (kimage->head & IND_DONE)) kexec_segment_flush(kimage); pr_info("Bye!\n"); @@ -200,13 +207,158 @@ * relocation is complete. */ - cpu_soft_restart(1, reboot_code_buffer_phys, kimage->head, - kimage_start, 0); + cpu_soft_restart(kimage != kexec_crash_image, + reboot_code_buffer_phys, kimage->head, kimage->start, 0); BUG(); /* Should never get here. */ } +static void machine_kexec_mask_interrupts(void) +{ + unsigned int i; + struct irq_desc *desc; + + for_each_irq_desc(i, desc) { + struct irq_chip *chip; + int ret; + + chip = irq_desc_get_chip(desc); + if (!chip) + continue; + + /* + * First try to remove the active state. If this + * fails, try to EOI the interrupt. + */ + ret = irq_set_irqchip_state(i, IRQCHIP_STATE_ACTIVE, false); + + if (ret && irqd_irq_inprogress(&desc->irq_data) && + chip->irq_eoi) + chip->irq_eoi(&desc->irq_data); + + if (chip->irq_mask) + chip->irq_mask(&desc->irq_data); + + if (chip->irq_disable && !irqd_irq_disabled(&desc->irq_data)) + chip->irq_disable(&desc->irq_data); + } +} + +/** + * machine_crash_shutdown - shutdown non-crashing cpus and save registers + */ void machine_crash_shutdown(struct pt_regs *regs) { - /* Empty routine needed to avoid build errors. */ + local_irq_disable(); + + /* shutdown non-crashing cpus */ + smp_send_crash_stop(); + + /* for crashing cpu */ + crash_save_cpu(regs, smp_processor_id()); + machine_kexec_mask_interrupts(); + + pr_info("Starting crashdump kernel...\n"); +} + +void arch_kexec_protect_crashkres(void) +{ + int i; + + kexec_segment_flush(kexec_crash_image); + + for (i = 0; i < kexec_crash_image->nr_segments; i++) + set_memory_valid( + __phys_to_virt(kexec_crash_image->segment[i].mem), + kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 0); +} + +void arch_kexec_unprotect_crashkres(void) +{ + int i; + + for (i = 0; i < kexec_crash_image->nr_segments; i++) + set_memory_valid( + __phys_to_virt(kexec_crash_image->segment[i].mem), + kexec_crash_image->segment[i].memsz >> PAGE_SHIFT, 1); +} + +#ifdef CONFIG_HIBERNATION +/* + * To preserve the crash dump kernel image, the relevant memory segments + * should be mapped again around the hibernation. + */ +void crash_prepare_suspend(void) +{ + if (kexec_crash_image) + arch_kexec_unprotect_crashkres(); +} + +void crash_post_resume(void) +{ + if (kexec_crash_image) + arch_kexec_protect_crashkres(); +} + +/* + * crash_is_nosave + * + * Return true only if a page is part of reserved memory for crash dump kernel, + * but does not hold any data of loaded kernel image. + * + * Note that all the pages in crash dump kernel memory have been initially + * marked as Reserved in kexec_reserve_crashkres_pages(). + * + * In hibernation, the pages which are Reserved and yet "nosave" are excluded + * from the hibernation iamge. crash_is_nosave() does thich check for crash + * dump kernel and will reduce the total size of hibernation image. + */ + +bool crash_is_nosave(unsigned long pfn) +{ + int i; + phys_addr_t addr; + + if (!crashk_res.end) + return false; + + /* in reserved memory? */ + addr = __pfn_to_phys(pfn); + if ((addr < crashk_res.start) || (crashk_res.end < addr)) + return false; + + if (!kexec_crash_image) + return true; + + /* not part of loaded kernel image? */ + for (i = 0; i < kexec_crash_image->nr_segments; i++) + if (addr >= kexec_crash_image->segment[i].mem && + addr < (kexec_crash_image->segment[i].mem + + kexec_crash_image->segment[i].memsz)) + return false; + + return true; +} + +void crash_free_reserved_phys_range(unsigned long begin, unsigned long end) +{ + unsigned long addr; + struct page *page; + + for (addr = begin; addr < end; addr += PAGE_SIZE) { + page = phys_to_page(addr); + ClearPageReserved(page); + free_reserved_page(page); + } +} +#endif /* CONFIG_HIBERNATION */ + +void arch_crash_save_vmcoreinfo(void) +{ + VMCOREINFO_NUMBER(VA_BITS); + /* Please note VMCOREINFO_NUMBER() uses "%d", not "%x" */ + vmcoreinfo_append_str("NUMBER(kimage_voffset)=0x%llx\n", + kimage_voffset); + vmcoreinfo_append_str("NUMBER(PHYS_OFFSET)=0x%llx\n", + PHYS_OFFSET); } --- linux-azure-4.11.0.orig/arch/arm64/kernel/pci.c +++ linux-azure-4.11.0/arch/arm64/kernel/pci.c @@ -191,8 +191,10 @@ return NULL; root_ops = kzalloc_node(sizeof(*root_ops), GFP_KERNEL, node); - if (!root_ops) + if (!root_ops) { + kfree(ri); return NULL; + } ri->cfg = pci_acpi_setup_ecam_mapping(root); if (!ri->cfg) { --- linux-azure-4.11.0.orig/arch/arm64/kernel/perf_event.c +++ linux-azure-4.11.0/arch/arm64/kernel/perf_event.c @@ -957,11 +957,26 @@ ARMV8_PMU_EVTYPE_EVENT); } +struct armv8pmu_probe_info { + struct arm_pmu *pmu; + bool present; +}; + static void __armv8pmu_probe_pmu(void *info) { - struct arm_pmu *cpu_pmu = info; + struct armv8pmu_probe_info *probe = info; + struct arm_pmu *cpu_pmu = probe->pmu; + u64 dfr0, pmuver; u32 pmceid[2]; + dfr0 = read_sysreg(id_aa64dfr0_el1); + pmuver = cpuid_feature_extract_unsigned_field(dfr0, + ID_AA64DFR0_PMUVER_SHIFT); + if (pmuver != 1) + return; + + probe->present = true; + /* Read the nb of CNTx counters supported from PMNC */ cpu_pmu->num_events = (armv8pmu_pmcr_read() >> ARMV8_PMU_PMCR_N_SHIFT) & ARMV8_PMU_PMCR_N_MASK; @@ -979,13 +994,27 @@ static int armv8pmu_probe_pmu(struct arm_pmu *cpu_pmu) { - return smp_call_function_any(&cpu_pmu->supported_cpus, + struct armv8pmu_probe_info probe = { + .pmu = cpu_pmu, + .present = false, + }; + int ret; + + ret = smp_call_function_any(&cpu_pmu->supported_cpus, __armv8pmu_probe_pmu, - cpu_pmu, 1); + &probe, 1); + if (ret) + return ret; + + return probe.present ? 0 : -ENODEV; } -static void armv8_pmu_init(struct arm_pmu *cpu_pmu) +static int armv8_pmu_init(struct arm_pmu *cpu_pmu) { + int ret = armv8pmu_probe_pmu(cpu_pmu); + if (ret) + return ret; + cpu_pmu->handle_irq = armv8pmu_handle_irq, cpu_pmu->enable = armv8pmu_enable_event, cpu_pmu->disable = armv8pmu_disable_event, @@ -997,78 +1026,104 @@ cpu_pmu->reset = armv8pmu_reset, cpu_pmu->max_period = (1LLU << 32) - 1, cpu_pmu->set_event_filter = armv8pmu_set_event_filter; + + return 0; } static int armv8_pmuv3_init(struct arm_pmu *cpu_pmu) { - armv8_pmu_init(cpu_pmu); + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + cpu_pmu->name = "armv8_pmuv3"; cpu_pmu->map_event = armv8_pmuv3_map_event; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group; - return armv8pmu_probe_pmu(cpu_pmu); + + return 0; } static int armv8_a53_pmu_init(struct arm_pmu *cpu_pmu) { - armv8_pmu_init(cpu_pmu); + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + cpu_pmu->name = "armv8_cortex_a53"; cpu_pmu->map_event = armv8_a53_map_event; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group; - return armv8pmu_probe_pmu(cpu_pmu); + + return 0; } static int armv8_a57_pmu_init(struct arm_pmu *cpu_pmu) { - armv8_pmu_init(cpu_pmu); + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + cpu_pmu->name = "armv8_cortex_a57"; cpu_pmu->map_event = armv8_a57_map_event; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group; - return armv8pmu_probe_pmu(cpu_pmu); + + return 0; } static int armv8_a72_pmu_init(struct arm_pmu *cpu_pmu) { - armv8_pmu_init(cpu_pmu); + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + cpu_pmu->name = "armv8_cortex_a72"; cpu_pmu->map_event = armv8_a57_map_event; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group; - return armv8pmu_probe_pmu(cpu_pmu); + + return 0; } static int armv8_thunder_pmu_init(struct arm_pmu *cpu_pmu) { - armv8_pmu_init(cpu_pmu); + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + cpu_pmu->name = "armv8_cavium_thunder"; cpu_pmu->map_event = armv8_thunder_map_event; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group; - return armv8pmu_probe_pmu(cpu_pmu); + + return 0; } static int armv8_vulcan_pmu_init(struct arm_pmu *cpu_pmu) { - armv8_pmu_init(cpu_pmu); + int ret = armv8_pmu_init(cpu_pmu); + if (ret) + return ret; + cpu_pmu->name = "armv8_brcm_vulcan"; cpu_pmu->map_event = armv8_vulcan_map_event; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_EVENTS] = &armv8_pmuv3_events_attr_group; cpu_pmu->attr_groups[ARMPMU_ATTR_GROUP_FORMATS] = &armv8_pmuv3_format_attr_group; - return armv8pmu_probe_pmu(cpu_pmu); + + return 0; } static const struct of_device_id armv8_pmu_of_device_ids[] = { @@ -1081,24 +1136,9 @@ {}, }; -/* - * Non DT systems have their micro/arch events probed at run-time. - * A fairly complete list of generic events are provided and ones that - * aren't supported by the current PMU are disabled. - */ -static const struct pmu_probe_info armv8_pmu_probe_table[] = { - PMU_PROBE(0, 0, armv8_pmuv3_init), /* enable all defined counters */ - { /* sentinel value */ } -}; - static int armv8_pmu_device_probe(struct platform_device *pdev) { - if (acpi_disabled) - return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, - NULL); - - return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, - armv8_pmu_probe_table); + return arm_pmu_device_probe(pdev, armv8_pmu_of_device_ids, NULL); } static struct platform_driver armv8_pmu_driver = { @@ -1109,4 +1149,11 @@ .probe = armv8_pmu_device_probe, }; -builtin_platform_driver(armv8_pmu_driver); +static int __init armv8_pmu_driver_init(void) +{ + if (acpi_disabled) + return platform_driver_register(&armv8_pmu_driver); + else + return arm_pmu_acpi_probe(armv8_pmuv3_init); +} +device_initcall(armv8_pmu_driver_init) --- linux-azure-4.11.0.orig/arch/arm64/kernel/setup.c +++ linux-azure-4.11.0/arch/arm64/kernel/setup.c @@ -31,7 +31,6 @@ #include #include #include -#include #include #include #include @@ -226,6 +225,12 @@ if (kernel_data.start >= res->start && kernel_data.end <= res->end) request_resource(res, &kernel_data); +#ifdef CONFIG_KEXEC_CORE + /* Userspace will find "Crash kernel" region in /proc/iomem. */ + if (crashk_res.end && crashk_res.start >= res->start && + crashk_res.end <= res->end) + request_resource(res, &crashk_res); +#endif } } --- linux-azure-4.11.0.orig/arch/arm64/kernel/smp.c +++ linux-azure-4.11.0/arch/arm64/kernel/smp.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include @@ -76,6 +77,7 @@ IPI_RESCHEDULE, IPI_CALL_FUNC, IPI_CPU_STOP, + IPI_CPU_CRASH_STOP, IPI_TIMER, IPI_IRQ_WORK, IPI_WAKEUP @@ -518,6 +520,13 @@ static unsigned int cpu_count = 1; #ifdef CONFIG_ACPI +static struct acpi_madt_generic_interrupt cpu_madt_gicc[NR_CPUS]; + +struct acpi_madt_generic_interrupt *acpi_cpu_get_madt_gicc(int cpu) +{ + return &cpu_madt_gicc[cpu]; +} + /* * acpi_map_gic_cpu_interface - parse processor MADT entry * @@ -552,6 +561,7 @@ return; } bootcpu_valid = true; + cpu_madt_gicc[0] = *processor; early_map_cpu_to_node(0, acpi_numa_get_nid(0, hwid)); return; } @@ -562,6 +572,8 @@ /* map the logical cpu id to cpu MPIDR */ cpu_logical_map(cpu_count) = hwid; + cpu_madt_gicc[cpu_count] = *processor; + /* * Set-up the ACPI parking protocol cpu entries * while initializing the cpu_logical_map to @@ -755,6 +767,7 @@ S(IPI_RESCHEDULE, "Rescheduling interrupts"), S(IPI_CALL_FUNC, "Function call interrupts"), S(IPI_CPU_STOP, "CPU stop interrupts"), + S(IPI_CPU_CRASH_STOP, "CPU stop (for crash dump) interrupts"), S(IPI_TIMER, "Timer broadcast interrupts"), S(IPI_IRQ_WORK, "IRQ work interrupts"), S(IPI_WAKEUP, "CPU wake-up interrupts"), @@ -829,6 +842,29 @@ cpu_relax(); } +#ifdef CONFIG_KEXEC_CORE +static atomic_t waiting_for_crash_ipi = ATOMIC_INIT(0); +#endif + +static void ipi_cpu_crash_stop(unsigned int cpu, struct pt_regs *regs) +{ +#ifdef CONFIG_KEXEC_CORE + crash_save_cpu(regs, cpu); + + atomic_dec(&waiting_for_crash_ipi); + + local_irq_disable(); + +#ifdef CONFIG_HOTPLUG_CPU + if (cpu_ops[cpu]->cpu_die) + cpu_ops[cpu]->cpu_die(cpu); +#endif + + /* just in case */ + cpu_park_loop(); +#endif +} + /* * Main handler for inter-processor interrupts */ @@ -859,6 +895,15 @@ irq_exit(); break; + case IPI_CPU_CRASH_STOP: + if (IS_ENABLED(CONFIG_KEXEC_CORE)) { + irq_enter(); + ipi_cpu_crash_stop(cpu, regs); + + unreachable(); + } + break; + #ifdef CONFIG_GENERIC_CLOCKEVENTS_BROADCAST case IPI_TIMER: irq_enter(); @@ -931,6 +976,39 @@ cpumask_pr_args(cpu_online_mask)); } +#ifdef CONFIG_KEXEC_CORE +void smp_send_crash_stop(void) +{ + cpumask_t mask; + unsigned long timeout; + + if (num_online_cpus() == 1) + return; + + cpumask_copy(&mask, cpu_online_mask); + cpumask_clear_cpu(smp_processor_id(), &mask); + + atomic_set(&waiting_for_crash_ipi, num_online_cpus() - 1); + + pr_crit("SMP: stopping secondary CPUs\n"); + smp_cross_call(&mask, IPI_CPU_CRASH_STOP); + + /* Wait up to one second for other CPUs to stop */ + timeout = USEC_PER_SEC; + while ((atomic_read(&waiting_for_crash_ipi) > 0) && timeout--) + udelay(1); + + if (atomic_read(&waiting_for_crash_ipi) > 0) + pr_warning("SMP: failed to stop secondary CPUs %*pbl\n", + cpumask_pr_args(&mask)); +} + +bool smp_crash_stop_failed(void) +{ + return (atomic_read(&waiting_for_crash_ipi) > 0); +} +#endif + /* * not supported here */ --- linux-azure-4.11.0.orig/arch/arm64/kernel/traps.c +++ linux-azure-4.11.0/arch/arm64/kernel/traps.c @@ -443,7 +443,7 @@ } #define __user_cache_maint(insn, address, res) \ - if (untagged_addr(address) >= user_addr_max()) { \ + if (address >= user_addr_max()) { \ res = -EFAULT; \ } else { \ uaccess_ttbr0_enable(); \ @@ -469,7 +469,7 @@ int crm = (esr & ESR_ELx_SYS64_ISS_CRM_MASK) >> ESR_ELx_SYS64_ISS_CRM_SHIFT; int ret = 0; - address = pt_regs_read_reg(regs, rt); + address = untagged_addr(pt_regs_read_reg(regs, rt)); switch (crm) { case ESR_ELx_SYS64_ISS_CRM_DC_CVAU: /* DC CVAU, gets promoted */ @@ -505,6 +505,22 @@ regs->pc += 4; } +static void cntvct_read_handler(unsigned int esr, struct pt_regs *regs) +{ + int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; + + pt_regs_write_reg(regs, rt, arch_counter_get_cntvct()); + regs->pc += 4; +} + +static void cntfrq_read_handler(unsigned int esr, struct pt_regs *regs) +{ + int rt = (esr & ESR_ELx_SYS64_ISS_RT_MASK) >> ESR_ELx_SYS64_ISS_RT_SHIFT; + + pt_regs_write_reg(regs, rt, read_sysreg(cntfrq_el0)); + regs->pc += 4; +} + struct sys64_hook { unsigned int esr_mask; unsigned int esr_val; @@ -523,6 +539,18 @@ .esr_val = ESR_ELx_SYS64_ISS_SYS_CTR_READ, .handler = ctr_read_handler, }, + { + /* Trap read access to CNTVCT_EL0 */ + .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, + .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTVCT, + .handler = cntvct_read_handler, + }, + { + /* Trap read access to CNTFRQ_EL0 */ + .esr_mask = ESR_ELx_SYS64_ISS_SYS_OP_MASK, + .esr_val = ESR_ELx_SYS64_ISS_SYS_CNTFRQ, + .handler = cntfrq_read_handler, + }, {}, }; --- linux-azure-4.11.0.orig/arch/arm64/kernel/vdso.c +++ linux-azure-4.11.0/arch/arm64/kernel/vdso.c @@ -221,10 +221,11 @@ /* tkr_mono.cycle_last == tkr_raw.cycle_last */ vdso_data->cs_cycle_last = tk->tkr_mono.cycle_last; vdso_data->raw_time_sec = tk->raw_time.tv_sec; - vdso_data->raw_time_nsec = tk->raw_time.tv_nsec; + vdso_data->raw_time_nsec = (tk->raw_time.tv_nsec << + tk->tkr_raw.shift) + + tk->tkr_raw.xtime_nsec; vdso_data->xtime_clock_sec = tk->xtime_sec; vdso_data->xtime_clock_nsec = tk->tkr_mono.xtime_nsec; - /* tkr_raw.xtime_nsec == 0 */ vdso_data->cs_mono_mult = tk->tkr_mono.mult; vdso_data->cs_raw_mult = tk->tkr_raw.mult; /* tkr_mono.shift == tkr_raw.shift */ --- linux-azure-4.11.0.orig/arch/arm64/kernel/vdso/gettimeofday.S +++ linux-azure-4.11.0/arch/arm64/kernel/vdso/gettimeofday.S @@ -256,7 +256,6 @@ seqcnt_check fail=monotonic_raw /* All computations are done with left-shifted nsecs. */ - lsl x14, x14, x12 get_nsec_per_sec res=x9 lsl x9, x9, x12 --- linux-azure-4.11.0.orig/arch/arm64/kvm/hyp-init.S +++ linux-azure-4.11.0/arch/arm64/kvm/hyp-init.S @@ -102,10 +102,13 @@ tlbi alle2 dsb sy - mrs x4, sctlr_el2 - and x4, x4, #SCTLR_ELx_EE // preserve endianness of EL2 - ldr x5, =SCTLR_ELx_FLAGS - orr x4, x4, x5 + /* + * Preserve all the RES1 bits while setting the default flags, + * as well as the EE bit on BE. Drop the A flag since the compiler + * is allowed to generate unaligned accesses. + */ + ldr x4, =(SCTLR_EL2_RES1 | (SCTLR_ELx_FLAGS & ~SCTLR_ELx_A)) +CPU_BE( orr x4, x4, #SCTLR_ELx_EE) msr sctlr_el2, x4 isb --- linux-azure-4.11.0.orig/arch/arm64/kvm/hyp/Makefile +++ linux-azure-4.11.0/arch/arm64/kvm/hyp/Makefile @@ -2,6 +2,8 @@ # Makefile for Kernel-based Virtual Machine module, HYP part # +ccflags-y += -fno-stack-protector + KVM=../../../../virt/kvm obj-$(CONFIG_KVM_ARM_HOST) += $(KVM)/arm/hyp/vgic-v2-sr.o --- linux-azure-4.11.0.orig/arch/arm64/kvm/hyp/switch.c +++ linux-azure-4.11.0/arch/arm64/kvm/hyp/switch.c @@ -350,6 +350,20 @@ } } + if (static_branch_unlikely(&vgic_v3_cpuif_trap) && + exit_code == ARM_EXCEPTION_TRAP && + (kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_SYS64 || + kvm_vcpu_trap_get_class(vcpu) == ESR_ELx_EC_CP15_32)) { + int ret = __vgic_v3_perform_cpuif_access(vcpu); + + if (ret == 1) { + __skip_instr(vcpu); + goto again; + } + + /* 0 falls through to be handled out of EL2 */ + } + fp_enabled = __fpsimd_enabled(); __sysreg_save_guest_state(guest_ctxt); --- linux-azure-4.11.0.orig/arch/arm64/kvm/sys_regs.c +++ linux-azure-4.11.0/arch/arm64/kvm/sys_regs.c @@ -55,6 +55,26 @@ * 64bit interface. */ +static bool read_from_write_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + WARN_ONCE(1, "Unexpected sys_reg read to write-only register\n"); + print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + +static bool write_to_read_only(struct kvm_vcpu *vcpu, + struct sys_reg_params *params, + const struct sys_reg_desc *r) +{ + WARN_ONCE(1, "Unexpected sys_reg write to read-only register\n"); + print_sys_reg_instr(params); + kvm_inject_undefined(vcpu); + return false; +} + /* 3 bits per cache level, as per CLIDR, but non-existent caches always 0 */ static u32 cache_levels; @@ -84,7 +104,7 @@ const struct sys_reg_desc *r) { if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); kvm_set_way_flush(vcpu); return true; @@ -126,7 +146,7 @@ const struct sys_reg_desc *r) { if (!p->is_write) - return read_from_write_only(vcpu, p); + return read_from_write_only(vcpu, p, r); vgic_v3_dispatch_sgi(vcpu, p->regval); @@ -1012,9 +1032,33 @@ { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b0000), Op2(0b000), NULL, reset_val, VBAR_EL1, 0 }, + /* ICC_IAR0_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1000), Op2(0b000), + write_to_read_only }, + /* ICC_EOIR0_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1000), Op2(0b001), + read_from_write_only }, + /* ICC_HPPIR0_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1000), Op2(0b010), + write_to_read_only }, + /* ICC_DIR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b001), + read_from_write_only }, + /* ICC_RPR_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b011), + write_to_read_only }, /* ICC_SGI1R_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1011), Op2(0b101), access_gic_sgi }, + /* ICC_IAR1_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b000), + write_to_read_only }, + /* ICC_EOIR1_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b001), + read_from_write_only }, + /* ICC_HPPIR1_EL1 */ + { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b010), + write_to_read_only }, /* ICC_SRE_EL1 */ { Op0(0b11), Op1(0b000), CRn(0b1100), CRm(0b1100), Op2(0b101), access_gic_sre }, @@ -1638,8 +1682,8 @@ { struct sys_reg_params params; u32 hsr = kvm_vcpu_get_hsr(vcpu); - int Rt = (hsr >> 5) & 0xf; - int Rt2 = (hsr >> 10) & 0xf; + int Rt = kvm_vcpu_sys_get_rt(vcpu); + int Rt2 = (hsr >> 10) & 0x1f; params.is_aarch32 = true; params.is_32bit = false; @@ -1690,7 +1734,7 @@ { struct sys_reg_params params; u32 hsr = kvm_vcpu_get_hsr(vcpu); - int Rt = (hsr >> 5) & 0xf; + int Rt = kvm_vcpu_sys_get_rt(vcpu); params.is_aarch32 = true; params.is_32bit = true; @@ -1805,7 +1849,7 @@ { struct sys_reg_params params; unsigned long esr = kvm_vcpu_get_hsr(vcpu); - int Rt = (esr >> 5) & 0x1f; + int Rt = kvm_vcpu_sys_get_rt(vcpu); int ret; trace_kvm_handle_sys_reg(esr); --- linux-azure-4.11.0.orig/arch/arm64/kvm/sys_regs.h +++ linux-azure-4.11.0/arch/arm64/kvm/sys_regs.h @@ -83,24 +83,6 @@ return true; } -static inline bool write_to_read_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) -{ - kvm_debug("sys_reg write to read-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; -} - -static inline bool read_from_write_only(struct kvm_vcpu *vcpu, - const struct sys_reg_params *params) -{ - kvm_debug("sys_reg read to write-only register at: %lx\n", - *vcpu_pc(vcpu)); - print_sys_reg_instr(params); - return false; -} - /* Reset functions */ static inline void reset_unknown(struct kvm_vcpu *vcpu, const struct sys_reg_desc *r) --- linux-azure-4.11.0.orig/arch/arm64/mm/dma-mapping.c +++ linux-azure-4.11.0/arch/arm64/mm/dma-mapping.c @@ -977,4 +977,11 @@ dev->archdata.dma_coherent = coherent; __iommu_setup_dma_ops(dev, dma_base, size, iommu); + +#ifdef CONFIG_XEN + if (xen_initial_domain()) { + dev->archdata.dev_dma_ops = dev->dma_ops; + dev->dma_ops = xen_dma_ops; + } +#endif } --- linux-azure-4.11.0.orig/arch/arm64/mm/init.c +++ linux-azure-4.11.0/arch/arm64/mm/init.c @@ -30,6 +30,7 @@ #include #include #include +#include #include #include #include @@ -37,6 +38,8 @@ #include #include #include +#include +#include #include #include @@ -77,6 +80,142 @@ early_param("initrd", early_initrd); #endif +#ifdef CONFIG_KEXEC_CORE +/* + * reserve_crashkernel() - reserves memory for crash kernel + * + * This function reserves memory area given in "crashkernel=" kernel command + * line parameter. The memory reserved is used by dump capture kernel when + * primary kernel is crashing. + */ +static void __init reserve_crashkernel(void) +{ + unsigned long long crash_base, crash_size; + int ret; + + ret = parse_crashkernel(boot_command_line, memblock_phys_mem_size(), + &crash_size, &crash_base); + /* no crashkernel= or invalid value specified */ + if (ret || !crash_size) + return; + + crash_size = PAGE_ALIGN(crash_size); + + if (crash_base == 0) { + /* Current arm64 boot protocol requires 2MB alignment */ + crash_base = memblock_find_in_range(0, ARCH_LOW_ADDRESS_LIMIT, + crash_size, SZ_2M); + if (crash_base == 0) { + pr_warn("cannot allocate crashkernel (size:0x%llx)\n", + crash_size); + return; + } + } else { + /* User specifies base address explicitly. */ + if (!memblock_is_region_memory(crash_base, crash_size)) { + pr_warn("cannot reserve crashkernel: region is not memory\n"); + return; + } + + if (memblock_is_region_reserved(crash_base, crash_size)) { + pr_warn("cannot reserve crashkernel: region overlaps reserved memory\n"); + return; + } + + if (!IS_ALIGNED(crash_base, SZ_2M)) { + pr_warn("cannot reserve crashkernel: base address is not 2MB aligned\n"); + return; + } + } + memblock_reserve(crash_base, crash_size); + + pr_info("crashkernel reserved: 0x%016llx - 0x%016llx (%lld MB)\n", + crash_base, crash_base + crash_size, crash_size >> 20); + + crashk_res.start = crash_base; + crashk_res.end = crash_base + crash_size - 1; +} + +static void __init kexec_reserve_crashkres_pages(void) +{ +#ifdef CONFIG_HIBERNATION + phys_addr_t addr; + struct page *page; + + if (!crashk_res.end) + return; + + /* + * To reduce the size of hibernation image, all the pages are + * marked as Reserved initially. + */ + for (addr = crashk_res.start; addr < (crashk_res.end + 1); + addr += PAGE_SIZE) { + page = phys_to_page(addr); + SetPageReserved(page); + } +#endif +} +#else +static void __init reserve_crashkernel(void) +{ +} + +static void __init kexec_reserve_crashkres_pages(void) +{ +} +#endif /* CONFIG_KEXEC_CORE */ + +#ifdef CONFIG_CRASH_DUMP +static int __init early_init_dt_scan_elfcorehdr(unsigned long node, + const char *uname, int depth, void *data) +{ + const __be32 *reg; + int len; + + if (depth != 1 || strcmp(uname, "chosen") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,elfcorehdr", &len); + if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells))) + return 1; + + elfcorehdr_addr = dt_mem_next_cell(dt_root_addr_cells, ®); + elfcorehdr_size = dt_mem_next_cell(dt_root_size_cells, ®); + + return 1; +} + +/* + * reserve_elfcorehdr() - reserves memory for elf core header + * + * This function reserves the memory occupied by an elf core header + * described in the device tree. This region contains all the + * information about primary kernel's core image and is used by a dump + * capture kernel to access the system memory on primary kernel. + */ +static void __init reserve_elfcorehdr(void) +{ + of_scan_flat_dt(early_init_dt_scan_elfcorehdr, NULL); + + if (!elfcorehdr_size) + return; + + if (memblock_is_region_reserved(elfcorehdr_addr, elfcorehdr_size)) { + pr_warn("elfcorehdr is overlapped\n"); + return; + } + + memblock_reserve(elfcorehdr_addr, elfcorehdr_size); + + pr_info("Reserving %lldKB of memory at 0x%llx for elfcorehdr\n", + elfcorehdr_size >> 10, elfcorehdr_addr); +} +#else +static void __init reserve_elfcorehdr(void) +{ +} +#endif /* CONFIG_CRASH_DUMP */ /* * Return the maximum physical address for ZONE_DMA (DMA_BIT_MASK(32)). It * currently assumes that for memory starting above 4G, 32-bit devices will @@ -188,10 +327,45 @@ } early_param("mem", early_mem); +static int __init early_init_dt_scan_usablemem(unsigned long node, + const char *uname, int depth, void *data) +{ + struct memblock_region *usablemem = data; + const __be32 *reg; + int len; + + if (depth != 1 || strcmp(uname, "chosen") != 0) + return 0; + + reg = of_get_flat_dt_prop(node, "linux,usable-memory-range", &len); + if (!reg || (len < (dt_root_addr_cells + dt_root_size_cells))) + return 1; + + usablemem->base = dt_mem_next_cell(dt_root_addr_cells, ®); + usablemem->size = dt_mem_next_cell(dt_root_size_cells, ®); + + return 1; +} + +static void __init fdt_enforce_memory_region(void) +{ + struct memblock_region reg = { + .size = 0, + }; + + of_scan_flat_dt(early_init_dt_scan_usablemem, ®); + + if (reg.size) + memblock_cap_memory_range(reg.base, reg.size); +} + void __init arm64_memblock_init(void) { const s64 linear_region_size = -(s64)PAGE_OFFSET; + /* Handle linux,usable-memory-range property */ + fdt_enforce_memory_region(); + /* * Ensure that the linear region takes up exactly half of the kernel * virtual address space. This way, we can distinguish a linear address @@ -297,6 +471,11 @@ arm64_dma_phys_limit = max_zone_dma_phys(); else arm64_dma_phys_limit = PHYS_MASK + 1; + + reserve_crashkernel(); + + reserve_elfcorehdr(); + dma_contiguous_reserve(arm64_dma_phys_limit); memblock_allow_resize(); @@ -416,6 +595,8 @@ /* this will put all unused low memory onto the freelists */ free_all_bootmem(); + kexec_reserve_crashkres_pages(); + mem_init_print_info(NULL); #define MLK(b, t) b, t, ((t) - (b)) >> 10 --- linux-azure-4.11.0.orig/arch/arm64/mm/mmu.c +++ linux-azure-4.11.0/arch/arm64/mm/mmu.c @@ -22,6 +22,8 @@ #include #include #include +#include +#include #include #include #include @@ -332,56 +334,31 @@ NULL, debug_pagealloc_enabled()); } -static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, phys_addr_t end) +static void __init __map_memblock(pgd_t *pgd, phys_addr_t start, + phys_addr_t end, pgprot_t prot, + bool page_mappings_only) +{ + __create_pgd_mapping(pgd, start, __phys_to_virt(start), end - start, + prot, early_pgtable_alloc, + page_mappings_only); +} + +static void __init map_mem(pgd_t *pgd) { phys_addr_t kernel_start = __pa_symbol(_text); phys_addr_t kernel_end = __pa_symbol(__init_begin); + struct memblock_region *reg; /* - * Take care not to create a writable alias for the - * read-only text and rodata sections of the kernel image. - */ - - /* No overlap with the kernel text/rodata */ - if (end < kernel_start || start >= kernel_end) { - __create_pgd_mapping(pgd, start, __phys_to_virt(start), - end - start, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); - return; - } - - /* - * This block overlaps the kernel text/rodata mappings. - * Map the portion(s) which don't overlap. - */ - if (start < kernel_start) - __create_pgd_mapping(pgd, start, - __phys_to_virt(start), - kernel_start - start, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); - if (kernel_end < end) - __create_pgd_mapping(pgd, kernel_end, - __phys_to_virt(kernel_end), - end - kernel_end, PAGE_KERNEL, - early_pgtable_alloc, - debug_pagealloc_enabled()); - - /* - * Map the linear alias of the [_text, __init_begin) interval as - * read-only/non-executable. This makes the contents of the - * region accessible to subsystems such as hibernate, but - * protects it from inadvertent modification or execution. + * Temporarily marked as NOMAP to skip mapping in the next for-loop */ - __create_pgd_mapping(pgd, kernel_start, __phys_to_virt(kernel_start), - kernel_end - kernel_start, PAGE_KERNEL_RO, - early_pgtable_alloc, debug_pagealloc_enabled()); -} + memblock_mark_nomap(kernel_start, kernel_end - kernel_start); -static void __init map_mem(pgd_t *pgd) -{ - struct memblock_region *reg; +#ifdef CONFIG_KEXEC_CORE + if (crashk_res.end) + memblock_mark_nomap(crashk_res.start, + resource_size(&crashk_res)); +#endif /* map all the memory banks */ for_each_memblock(memory, reg) { @@ -393,8 +370,33 @@ if (memblock_is_nomap(reg)) continue; - __map_memblock(pgd, start, end); + __map_memblock(pgd, start, end, + PAGE_KERNEL, debug_pagealloc_enabled()); + } + + /* + * Map the linear alias of the [_text, __init_begin) interval as + * read-only/non-executable. This makes the contents of the + * region accessible to subsystems such as hibernate, but + * protects it from inadvertent modification or execution. + */ + __map_memblock(pgd, kernel_start, kernel_end, + PAGE_KERNEL_RO, debug_pagealloc_enabled()); + memblock_clear_nomap(kernel_start, kernel_end - kernel_start); + +#ifdef CONFIG_KEXEC_CORE + /* + * User page-level mappings here so that we can shrink the region + * in page granularity and put back unused memory to buddy system + * through /sys/kernel/kexec_crash_size interface. + */ + if (crashk_res.end) { + __map_memblock(pgd, crashk_res.start, crashk_res.end + 1, + PAGE_KERNEL, true); + memblock_clear_nomap(crashk_res.start, + resource_size(&crashk_res)); } +#endif } void mark_rodata_ro(void) --- linux-azure-4.11.0.orig/arch/arm64/mm/pageattr.c +++ linux-azure-4.11.0/arch/arm64/mm/pageattr.c @@ -125,20 +125,23 @@ } EXPORT_SYMBOL_GPL(set_memory_x); -#ifdef CONFIG_DEBUG_PAGEALLOC -void __kernel_map_pages(struct page *page, int numpages, int enable) +int set_memory_valid(unsigned long addr, int numpages, int enable) { - unsigned long addr = (unsigned long) page_address(page); - if (enable) - __change_memory_common(addr, PAGE_SIZE * numpages, + return __change_memory_common(addr, PAGE_SIZE * numpages, __pgprot(PTE_VALID), __pgprot(0)); else - __change_memory_common(addr, PAGE_SIZE * numpages, + return __change_memory_common(addr, PAGE_SIZE * numpages, __pgprot(0), __pgprot(PTE_VALID)); } + +#ifdef CONFIG_DEBUG_PAGEALLOC +void __kernel_map_pages(struct page *page, int numpages, int enable) +{ + set_memory_valid((unsigned long)page_address(page), numpages, enable); +} #ifdef CONFIG_HIBERNATION /* * When built with CONFIG_DEBUG_PAGEALLOC and CONFIG_HIBERNATION, this function --- linux-azure-4.11.0.orig/arch/arm64/net/bpf_jit_comp.c +++ linux-azure-4.11.0/arch/arm64/net/bpf_jit_comp.c @@ -252,8 +252,9 @@ */ off = offsetof(struct bpf_array, ptrs); emit_a64_mov_i64(tmp, off, ctx); - emit(A64_LDR64(tmp, r2, tmp), ctx); - emit(A64_LDR64(prg, tmp, r3), ctx); + emit(A64_ADD(1, tmp, r2, tmp), ctx); + emit(A64_LSL(1, prg, r3, 3), ctx); + emit(A64_LDR64(prg, tmp, prg), ctx); emit(A64_CBZ(1, prg, jmp_offset), ctx); /* goto *(prog->bpf_func + prologue_size); */ @@ -779,14 +780,14 @@ int ret; ret = build_insn(insn, ctx); - - if (ctx->image == NULL) - ctx->offset[i] = ctx->idx; - if (ret > 0) { i++; + if (ctx->image == NULL) + ctx->offset[i] = ctx->idx; continue; } + if (ctx->image == NULL) + ctx->offset[i] = ctx->idx; if (ret) return ret; } --- linux-azure-4.11.0.orig/arch/frv/mm/elf-fdpic.c +++ linux-azure-4.11.0/arch/frv/mm/elf-fdpic.c @@ -75,7 +75,7 @@ addr = PAGE_ALIGN(addr); vma = find_vma(current->mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) goto success; } --- linux-azure-4.11.0.orig/arch/metag/include/asm/uaccess.h +++ linux-azure-4.11.0/arch/metag/include/asm/uaccess.h @@ -28,24 +28,32 @@ #define segment_eq(a, b) ((a).seg == (b).seg) -#define __kernel_ok (segment_eq(get_fs(), KERNEL_DS)) -/* - * Explicitly allow NULL pointers here. Parts of the kernel such - * as readv/writev use access_ok to validate pointers, but want - * to allow NULL pointers for various reasons. NULL pointers are - * safe to allow through because the first page is not mappable on - * Meta. - * - * We also wish to avoid letting user code access the system area - * and the kernel half of the address space. - */ -#define __user_bad(addr, size) (((addr) > 0 && (addr) < META_MEMORY_BASE) || \ - ((addr) > PAGE_OFFSET && \ - (addr) < LINCORE_BASE)) - static inline int __access_ok(unsigned long addr, unsigned long size) { - return __kernel_ok || !__user_bad(addr, size); + /* + * Allow access to the user mapped memory area, but not the system area + * before it. The check extends to the top of the address space when + * kernel access is allowed (there's no real reason to user copy to the + * system area in any case). + */ + if (likely(addr >= META_MEMORY_BASE && addr < get_fs().seg && + size <= get_fs().seg - addr)) + return true; + /* + * Explicitly allow NULL pointers here. Parts of the kernel such + * as readv/writev use access_ok to validate pointers, but want + * to allow NULL pointers for various reasons. NULL pointers are + * safe to allow through because the first page is not mappable on + * Meta. + */ + if (!addr) + return true; + /* Allow access to core code memory area... */ + if (addr >= LINCORE_CODE_BASE && addr <= LINCORE_CODE_LIMIT && + size <= LINCORE_CODE_LIMIT + 1 - addr) + return true; + /* ... but no other areas. */ + return false; } #define access_ok(type, addr, size) __access_ok((unsigned long)(addr), \ @@ -186,8 +194,13 @@ extern long __must_check __strncpy_from_user(char *dst, const char __user *src, long count); -#define strncpy_from_user(dst, src, count) __strncpy_from_user(dst, src, count) - +static inline long +strncpy_from_user(char *dst, const char __user *src, long count) +{ + if (!access_ok(VERIFY_READ, src, 1)) + return -EFAULT; + return __strncpy_from_user(dst, src, count); +} /* * Return the size of a string (including the ending 0) * --- linux-azure-4.11.0.orig/arch/mips/Kconfig +++ linux-azure-4.11.0/arch/mips/Kconfig @@ -1373,6 +1373,7 @@ select WEAK_ORDERING select WEAK_REORDERING_BEYOND_LLSC select MIPS_PGD_C0_CONTEXT + select MIPS_L1_CACHE_SHIFT_6 select GPIOLIB help The Loongson 3 processor implements the MIPS64R2 instruction --- linux-azure-4.11.0.orig/arch/mips/boot/Makefile +++ linux-azure-4.11.0/arch/mips/boot/Makefile @@ -128,19 +128,19 @@ -DADDR_BITS=$(ADDR_BITS) \ -DADDR_CELLS=$(itb_addr_cells) -$(obj)/vmlinux.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE +$(obj)/vmlinux.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE $(call if_changed_dep,cpp_its_S,none,vmlinux.bin) -$(obj)/vmlinux.gz.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE +$(obj)/vmlinux.gz.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE $(call if_changed_dep,cpp_its_S,gzip,vmlinux.bin.gz) -$(obj)/vmlinux.bz2.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE +$(obj)/vmlinux.bz2.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE $(call if_changed_dep,cpp_its_S,bzip2,vmlinux.bin.bz2) -$(obj)/vmlinux.lzma.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE +$(obj)/vmlinux.lzma.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE $(call if_changed_dep,cpp_its_S,lzma,vmlinux.bin.lzma) -$(obj)/vmlinux.lzo.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S FORCE +$(obj)/vmlinux.lzo.its: $(srctree)/arch/mips/$(PLATFORM)/vmlinux.its.S $(VMLINUX) FORCE $(call if_changed_dep,cpp_its_S,lzo,vmlinux.bin.lzo) quiet_cmd_itb-image = ITB $@ --- linux-azure-4.11.0.orig/arch/mips/kernel/branch.c +++ linux-azure-4.11.0/arch/mips/kernel/branch.c @@ -804,8 +804,10 @@ break; } /* Compact branch: BNEZC || JIALC */ - if (insn.i_format.rs) + if (!insn.i_format.rs) { + /* JIALC: set $31/ra */ regs->regs[31] = epc + 4; + } regs->cp0_epc += 8; break; #endif --- linux-azure-4.11.0.orig/arch/mips/kernel/entry.S +++ linux-azure-4.11.0/arch/mips/kernel/entry.S @@ -11,6 +11,7 @@ #include #include #include +#include #include #include #include @@ -119,6 +120,7 @@ andi t0, a2, _TIF_NEED_RESCHED # a2 is preloaded with TI_FLAGS beqz t0, work_notifysig work_resched: + TRACE_IRQS_OFF jal schedule local_irq_disable # make sure need_resched and @@ -155,6 +157,7 @@ beqz t0, work_pending # trace bit set? local_irq_enable # could let syscall_trace_leave() # call schedule() instead + TRACE_IRQS_ON move a0, sp jal syscall_trace_leave b resume_userspace --- linux-azure-4.11.0.orig/arch/mips/kernel/head.S +++ linux-azure-4.11.0/arch/mips/kernel/head.S @@ -106,8 +106,8 @@ beq t0, t1, dtb_found #endif li t1, -2 - beq a0, t1, dtb_found move t2, a1 + beq a0, t1, dtb_found li t2, 0 dtb_found: --- linux-azure-4.11.0.orig/arch/mips/kernel/pm-cps.c +++ linux-azure-4.11.0/arch/mips/kernel/pm-cps.c @@ -56,7 +56,6 @@ * state. Actually per-core rather than per-CPU. */ static DEFINE_PER_CPU_ALIGNED(u32*, ready_count); -static DEFINE_PER_CPU_ALIGNED(void*, ready_count_alloc); /* Indicates online CPUs coupled with the current CPU */ static DEFINE_PER_CPU_ALIGNED(cpumask_t, online_coupled); @@ -642,7 +641,6 @@ { enum cps_pm_state state; unsigned core = cpu_data[cpu].core; - unsigned dlinesz = cpu_data[cpu].dcache.linesz; void *entry_fn, *core_rc; for (state = CPS_PM_NC_WAIT; state < CPS_PM_STATE_COUNT; state++) { @@ -662,16 +660,11 @@ } if (!per_cpu(ready_count, core)) { - core_rc = kmalloc(dlinesz * 2, GFP_KERNEL); + core_rc = kmalloc(sizeof(u32), GFP_KERNEL); if (!core_rc) { pr_err("Failed allocate core %u ready_count\n", core); return -ENOMEM; } - per_cpu(ready_count_alloc, core) = core_rc; - - /* Ensure ready_count is aligned to a cacheline boundary */ - core_rc += dlinesz - 1; - core_rc = (void *)((unsigned long)core_rc & ~(dlinesz - 1)); per_cpu(ready_count, core) = core_rc; } --- linux-azure-4.11.0.orig/arch/mips/kernel/process.c +++ linux-azure-4.11.0/arch/mips/kernel/process.c @@ -120,7 +120,6 @@ struct thread_info *ti = task_thread_info(p); struct pt_regs *childregs, *regs = current_pt_regs(); unsigned long childksp; - p->set_child_tid = p->clear_child_tid = NULL; childksp = (unsigned long)task_stack_page(p) + THREAD_SIZE - 32; --- linux-azure-4.11.0.orig/arch/mips/kernel/traps.c +++ linux-azure-4.11.0/arch/mips/kernel/traps.c @@ -201,6 +201,8 @@ { struct pt_regs regs; mm_segment_t old_fs = get_fs(); + + regs.cp0_status = KSU_KERNEL; if (sp) { regs.regs[29] = (unsigned long)sp; regs.regs[31] = 0; --- linux-azure-4.11.0.orig/arch/mips/kvm/tlb.c +++ linux-azure-4.11.0/arch/mips/kvm/tlb.c @@ -147,7 +147,11 @@ int kvm_mips_host_tlb_inv(struct kvm_vcpu *vcpu, unsigned long va, bool user, bool kernel) { - int idx_user, idx_kernel; + /* + * Initialize idx_user and idx_kernel to workaround bogus + * maybe-initialized warning when using GCC 6. + */ + int idx_user = 0, idx_kernel = 0; unsigned long flags, old_entryhi; local_irq_save(flags); --- linux-azure-4.11.0.orig/arch/mips/mm/mmap.c +++ linux-azure-4.11.0/arch/mips/mm/mmap.c @@ -93,7 +93,7 @@ vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/openrisc/kernel/process.c +++ linux-azure-4.11.0/arch/openrisc/kernel/process.c @@ -167,8 +167,6 @@ top_of_kernel_stack = sp; - p->set_child_tid = p->clear_child_tid = NULL; - /* Locate userspace context on stack... */ sp -= STACK_FRAME_OVERHEAD; /* redzone */ sp -= sizeof(struct pt_regs); --- linux-azure-4.11.0.orig/arch/parisc/include/asm/dma-mapping.h +++ linux-azure-4.11.0/arch/parisc/include/asm/dma-mapping.h @@ -20,6 +20,8 @@ ** flush/purge and allocate "regular" cacheable pages for everything. */ +#define DMA_ERROR_CODE (~(dma_addr_t)0) + #ifdef CONFIG_PA11 extern const struct dma_map_ops pcxl_dma_ops; extern const struct dma_map_ops pcx_dma_ops; @@ -54,12 +56,13 @@ break; } } - BUG_ON(!dev->platform_data); return dev->platform_data; } - -#define GET_IOC(dev) (HBA_DATA(parisc_walk_tree(dev))->iommu) - + +#define GET_IOC(dev) ({ \ + void *__pdata = parisc_walk_tree(dev); \ + __pdata ? HBA_DATA(__pdata)->iommu : NULL; \ +}) #ifdef CONFIG_IOMMU_CCIO struct parisc_device; --- linux-azure-4.11.0.orig/arch/parisc/include/asm/mmu_context.h +++ linux-azure-4.11.0/arch/parisc/include/asm/mmu_context.h @@ -49,15 +49,26 @@ mtctl(__space_to_prot(context), 8); } -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, struct task_struct *tsk) +static inline void switch_mm_irqs_off(struct mm_struct *prev, + struct mm_struct *next, struct task_struct *tsk) { - if (prev != next) { mtctl(__pa(next->pgd), 25); load_context(next->context); } } +static inline void switch_mm(struct mm_struct *prev, + struct mm_struct *next, struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} +#define switch_mm_irqs_off switch_mm_irqs_off + #define deactivate_mm(tsk,mm) do { } while (0) static inline void activate_mm(struct mm_struct *prev, struct mm_struct *next) --- linux-azure-4.11.0.orig/arch/parisc/kernel/sys_parisc.c +++ linux-azure-4.11.0/arch/parisc/kernel/sys_parisc.c @@ -90,7 +90,7 @@ unsigned long len, unsigned long pgoff, unsigned long flags) { struct mm_struct *mm = current->mm; - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; unsigned long task_size = TASK_SIZE; int do_color_align, last_mmap; struct vm_unmapped_area_info info; @@ -117,9 +117,10 @@ else addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + vma = find_vma_prev(mm, addr, &prev); if (task_size - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) goto found_addr; } @@ -143,7 +144,7 @@ const unsigned long len, const unsigned long pgoff, const unsigned long flags) { - struct vm_area_struct *vma; + struct vm_area_struct *vma, *prev; struct mm_struct *mm = current->mm; unsigned long addr = addr0; int do_color_align, last_mmap; @@ -177,9 +178,11 @@ addr = COLOR_ALIGN(addr, last_mmap, pgoff); else addr = PAGE_ALIGN(addr); - vma = find_vma(mm, addr); + + vma = find_vma_prev(mm, addr, &prev); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma)) && + (!prev || addr >= vm_end_gap(prev))) goto found_addr; } --- linux-azure-4.11.0.orig/arch/parisc/kernel/syscall_table.S +++ linux-azure-4.11.0/arch/parisc/kernel/syscall_table.S @@ -361,7 +361,7 @@ ENTRY_SAME(ni_syscall) /* 263: reserved for vserver */ ENTRY_SAME(add_key) ENTRY_SAME(request_key) /* 265 */ - ENTRY_SAME(keyctl) + ENTRY_COMP(keyctl) ENTRY_SAME(ioprio_set) ENTRY_SAME(ioprio_get) ENTRY_SAME(inotify_init) --- linux-azure-4.11.0.orig/arch/parisc/mm/fault.c +++ linux-azure-4.11.0/arch/parisc/mm/fault.c @@ -367,7 +367,7 @@ case 15: /* Data TLB miss fault/Data page fault */ /* send SIGSEGV when outside of vma */ if (!vma || - address < vma->vm_start || address > vma->vm_end) { + address < vma->vm_start || address >= vma->vm_end) { si.si_signo = SIGSEGV; si.si_code = SEGV_MAPERR; break; --- linux-azure-4.11.0.orig/arch/powerpc/Kconfig +++ linux-azure-4.11.0/arch/powerpc/Kconfig @@ -1129,6 +1129,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "fs/Kconfig" source "lib/Kconfig" --- linux-azure-4.11.0.orig/arch/powerpc/Makefile +++ linux-azure-4.11.0/arch/powerpc/Makefile @@ -267,7 +267,7 @@ all: zImage # With make 3.82 we cannot mix normal and wildcard targets -BOOT_TARGETS1 := zImage zImage.initrd uImage +BOOT_TARGETS1 := zImage zImage.initrd uImage vmlinux.strip BOOT_TARGETS2 := zImage% dtbImage% treeImage.% cuImage.% simpleImage.% uImage.% PHONY += $(BOOT_TARGETS1) $(BOOT_TARGETS2) --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/bitops.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/bitops.h @@ -55,6 +55,14 @@ #define PPC_BITEXTRACT(bits, ppc_bit, dst_bit) \ ((((bits) >> PPC_BITLSHIFT(ppc_bit)) & 1) << (dst_bit)) +#define PPC_BITLSHIFT32(be) (32 - 1 - (be)) +#define PPC_BIT32(bit) (1UL << PPC_BITLSHIFT32(bit)) +#define PPC_BITMASK32(bs, be) ((PPC_BIT32(bs) - PPC_BIT32(be))|PPC_BIT32(bs)) + +#define PPC_BITLSHIFT8(be) (8 - 1 - (be)) +#define PPC_BIT8(bit) (1UL << PPC_BITLSHIFT8(bit)) +#define PPC_BITMASK8(bs, be) ((PPC_BIT8(bs) - PPC_BIT8(be))|PPC_BIT8(bs)) + #include /* Macro for generating the ***_bits() functions */ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/book3s/64/mmu.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/book3s/64/mmu.h @@ -73,10 +73,16 @@ typedef unsigned long mm_context_id_t; struct spinlock; +/* Maximum possible number of NPUs in a system. */ +#define NV_MAX_NPUS 8 + typedef struct { mm_context_id_t id; u16 user_psize; /* page size index */ + /* NPU NMMU context */ + struct npu_context *npu_context; + #ifdef CONFIG_PPC_MM_SLICES u64 low_slices_psize; /* SLB page size encodings */ unsigned char high_slices_psize[SLICE_ARRAY_SIZE]; --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/cpuidle.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/cpuidle.h @@ -46,6 +46,7 @@ extern u64 pnv_first_deep_stop_state; +unsigned long pnv_cpu_offline(unsigned int cpu); int validate_psscr_val_mask(u64 *psscr_val, u64 *psscr_mask, u32 flags); static inline void report_invalid_psscr_val(u64 psscr_val, int err) { --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/cputhreads.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/cputhreads.h @@ -2,6 +2,7 @@ #define _ASM_POWERPC_CPUTHREADS_H #ifndef __ASSEMBLY__ +#include #include #include --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/dbell.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/dbell.h @@ -35,33 +35,53 @@ #ifdef CONFIG_PPC_BOOK3S #define PPC_DBELL_MSGTYPE PPC_DBELL_SERVER -#define SPRN_DOORBELL_CPUTAG SPRN_TIR -#define PPC_DBELL_TAG_MASK 0x7f static inline void _ppc_msgsnd(u32 msg) { - if (cpu_has_feature(CPU_FTR_HVMODE)) - __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); - else - __asm__ __volatile__ (PPC_MSGSNDP(%0) : : "r" (msg)); + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSND(%1), PPC_MSGSNDP(%1), %0) + : : "i" (CPU_FTR_HVMODE), "r" (msg)); +} + +/* sync before sending message */ +static inline void ppc_msgsnd_sync(void) +{ + __asm__ __volatile__ ("sync" : : : "memory"); +} + +/* sync after taking message interrupt */ +static inline void ppc_msgsync(void) +{ + /* sync is not required when taking messages from the same core */ + __asm__ __volatile__ (ASM_FTR_IFSET(PPC_MSGSYNC " ; lwsync", "", %0) + : : "i" (CPU_FTR_HVMODE|CPU_FTR_ARCH_300)); } #else /* CONFIG_PPC_BOOK3S */ #define PPC_DBELL_MSGTYPE PPC_DBELL -#define SPRN_DOORBELL_CPUTAG SPRN_PIR -#define PPC_DBELL_TAG_MASK 0x3fff static inline void _ppc_msgsnd(u32 msg) { __asm__ __volatile__ (PPC_MSGSND(%0) : : "r" (msg)); } +/* sync before sending message */ +static inline void ppc_msgsnd_sync(void) +{ + __asm__ __volatile__ ("sync" : : : "memory"); +} + +/* sync after taking message interrupt */ +static inline void ppc_msgsync(void) +{ +} + #endif /* CONFIG_PPC_BOOK3S */ -extern void doorbell_cause_ipi(int cpu, unsigned long data); +extern void doorbell_global_ipi(int cpu); +extern void doorbell_core_ipi(int cpu); +extern int doorbell_try_core_ipi(int cpu); extern void doorbell_exception(struct pt_regs *regs); -extern void doorbell_setup_this_cpu(void); static inline void ppc_msgsnd(enum ppc_dbell type, u32 flags, u32 tag) { --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/elf.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/elf.h @@ -23,12 +23,13 @@ #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE PAGE_SIZE -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE 0x20000000 +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (is_32bit_task() ? 0x000400000UL : \ + 0x100000000UL) #define ELF_CORE_EFLAGS (is_elf2_task() ? 2 : 0) --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/fadump.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/fadump.h @@ -43,6 +43,9 @@ #define MIN_BOOT_MEM (((RMA_END < (0x1UL << 28)) ? (0x1UL << 28) : RMA_END) \ + (0x1UL << 26)) +/* The upper limit percentage for user specified boot memory size (25%) */ +#define MAX_BOOT_MEM_RATIO 4 + #define memblock_num_regions(memblock_type) (memblock.memblock_type.cnt) /* Firmware provided dump sections */ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/iommu.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/iommu.h @@ -64,6 +64,11 @@ long index, unsigned long *hpa, enum dma_data_direction *direction); + /* Real mode */ + int (*exchange_rm)(struct iommu_table *tbl, + long index, + unsigned long *hpa, + enum dma_data_direction *direction); #endif void (*clear)(struct iommu_table *tbl, long index, long npages); @@ -114,6 +119,7 @@ struct list_head it_group_list;/* List of iommu_table_group_link */ unsigned long *it_userspace; /* userspace view of the table */ struct iommu_table_ops *it_ops; + struct kref it_kref; }; #define IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry) \ @@ -146,8 +152,8 @@ extern int dma_iommu_dma_supported(struct device *dev, u64 mask); -/* Frees table for an individual device node */ -extern void iommu_free_table(struct iommu_table *tbl, const char *node_name); +extern struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl); +extern int iommu_tce_table_put(struct iommu_table *tbl); /* Initializes an iommu_table based in values set in the passed-in * structure @@ -208,6 +214,8 @@ extern int __init tce_iommu_bus_notifier_init(void); extern long iommu_tce_xchg(struct iommu_table *tbl, unsigned long entry, unsigned long *hpa, enum dma_data_direction *direction); +extern long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction); #else static inline void iommu_register_group(struct iommu_table_group *table_group, int pci_domain_number, --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/kvm_host.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/kvm_host.h @@ -191,6 +191,13 @@ atomic_t refcnt; }; +struct kvmppc_spapr_tce_iommu_table { + struct rcu_head rcu; + struct list_head next; + struct iommu_table *tbl; + struct kref kref; +}; + struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; @@ -199,6 +206,7 @@ u32 page_shift; u64 offset; /* in pages */ u64 size; /* window size in pages */ + struct list_head iommu_tables; struct page *pages[0]; }; --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/kvm_ppc.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/kvm_ppc.h @@ -164,11 +164,15 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, struct kvm_memory_slot *memslot, unsigned long porder); extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp); +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, struct kvm_create_spapr_tce_64 *args); extern struct kvmppc_spapr_tce_table *kvmppc_find_table( - struct kvm_vcpu *vcpu, unsigned long liobn); + struct kvm *kvm, unsigned long liobn); extern long kvmppc_ioba_validate(struct kvmppc_spapr_tce_table *stt, unsigned long ioba, unsigned long npages); extern long kvmppc_tce_validate(struct kvmppc_spapr_tce_table *tt, --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/mmu_context.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/mmu_context.h @@ -29,10 +29,14 @@ extern void mm_iommu_cleanup(struct mm_struct *mm); extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup(struct mm_struct *mm, unsigned long ua, unsigned long size); +extern struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm( + struct mm_struct *mm, unsigned long ua, unsigned long size); extern struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries); extern long mm_iommu_ua_to_hpa(struct mm_iommu_table_group_mem_t *mem, unsigned long ua, unsigned long *hpa); +extern long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa); extern long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem); extern void mm_iommu_mapped_dec(struct mm_iommu_table_group_mem_t *mem); #endif @@ -70,8 +74,9 @@ * switch_mm is the entry point called from the architecture independent * code in kernel/sched/core.c */ -static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, - struct task_struct *tsk) +static inline void switch_mm_irqs_off(struct mm_struct *prev, + struct mm_struct *next, + struct task_struct *tsk) { /* Mark this context has been used on the new CPU */ if (!cpumask_test_cpu(smp_processor_id(), mm_cpumask(next))) @@ -110,6 +115,18 @@ switch_mmu_context(prev, next, tsk); } +static inline void switch_mm(struct mm_struct *prev, struct mm_struct *next, + struct task_struct *tsk) +{ + unsigned long flags; + + local_irq_save(flags); + switch_mm_irqs_off(prev, next, tsk); + local_irq_restore(flags); +} +#define switch_mm_irqs_off switch_mm_irqs_off + + #define deactivate_mm(tsk,mm) do { } while (0) /* --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/opal-api.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/opal-api.h @@ -40,6 +40,8 @@ #define OPAL_I2C_ARBT_LOST -22 #define OPAL_I2C_NACK_RCVD -23 #define OPAL_I2C_STOP_ERR -24 +#define OPAL_XIVE_PROVISIONING -31 +#define OPAL_XIVE_FREE_ACTIVE -32 /* API Tokens (in r0) */ #define OPAL_INVALID_CALL -1 @@ -168,7 +170,27 @@ #define OPAL_INT_SET_MFRR 125 #define OPAL_PCI_TCE_KILL 126 #define OPAL_NMMU_SET_PTCR 127 -#define OPAL_LAST 127 +#define OPAL_XIVE_RESET 128 +#define OPAL_XIVE_GET_IRQ_INFO 129 +#define OPAL_XIVE_GET_IRQ_CONFIG 130 +#define OPAL_XIVE_SET_IRQ_CONFIG 131 +#define OPAL_XIVE_GET_QUEUE_INFO 132 +#define OPAL_XIVE_SET_QUEUE_INFO 133 +#define OPAL_XIVE_DONATE_PAGE 134 +#define OPAL_XIVE_ALLOCATE_VP_BLOCK 135 +#define OPAL_XIVE_FREE_VP_BLOCK 136 +#define OPAL_XIVE_GET_VP_INFO 137 +#define OPAL_XIVE_SET_VP_INFO 138 +#define OPAL_XIVE_ALLOCATE_IRQ 139 +#define OPAL_XIVE_FREE_IRQ 140 +#define OPAL_XIVE_SYNC 141 +#define OPAL_XIVE_DUMP 142 +#define OPAL_XIVE_RESERVED3 143 +#define OPAL_XIVE_RESERVED4 144 +#define OPAL_NPU_INIT_CONTEXT 146 +#define OPAL_NPU_DESTROY_CONTEXT 147 +#define OPAL_NPU_MAP_LPAR 148 +#define OPAL_LAST 148 /* Device tree flags */ @@ -783,6 +805,15 @@ enum { OPAL_REINIT_CPUS_HILE_BE = (1 << 0), OPAL_REINIT_CPUS_HILE_LE = (1 << 1), + + /* These two define the base MMU mode of the host on P9 + * + * On P9 Nimbus DD2.0 and Cumlus (and later), KVM can still + * create hash guests in "radix" mode with care (full core + * switch only). + */ + OPAL_REINIT_CPUS_MMU_HASH = (1 << 2), + OPAL_REINIT_CPUS_MMU_RADIX = (1 << 3), }; typedef struct oppanel_line { @@ -928,6 +959,59 @@ OPAL_PCI_TCE_KILL_ALL, }; +/* The xive operation mode indicates the active "API" and + * corresponds to the "mode" parameter of the opal_xive_reset() + * call + */ +enum { + OPAL_XIVE_MODE_EMU = 0, + OPAL_XIVE_MODE_EXPL = 1, +}; + +/* Flags for OPAL_XIVE_GET_IRQ_INFO */ +enum { + OPAL_XIVE_IRQ_TRIGGER_PAGE = 0x00000001, + OPAL_XIVE_IRQ_STORE_EOI = 0x00000002, + OPAL_XIVE_IRQ_LSI = 0x00000004, + OPAL_XIVE_IRQ_SHIFT_BUG = 0x00000008, + OPAL_XIVE_IRQ_MASK_VIA_FW = 0x00000010, + OPAL_XIVE_IRQ_EOI_VIA_FW = 0x00000020, +}; + +/* Flags for OPAL_XIVE_GET/SET_QUEUE_INFO */ +enum { + OPAL_XIVE_EQ_ENABLED = 0x00000001, + OPAL_XIVE_EQ_ALWAYS_NOTIFY = 0x00000002, + OPAL_XIVE_EQ_ESCALATE = 0x00000004, +}; + +/* Flags for OPAL_XIVE_GET/SET_VP_INFO */ +enum { + OPAL_XIVE_VP_ENABLED = 0x00000001, +}; + +/* "Any chip" replacement for chip ID for allocation functions */ +enum { + OPAL_XIVE_ANY_CHIP = 0xffffffff, +}; + +/* Xive sync options */ +enum { + /* This bits are cumulative, arg is a girq */ + XIVE_SYNC_EAS = 0x00000001, /* Sync irq source */ + XIVE_SYNC_QUEUE = 0x00000002, /* Sync irq target */ +}; + +/* Dump options */ +enum { + XIVE_DUMP_TM_HYP = 0, + XIVE_DUMP_TM_POOL = 1, + XIVE_DUMP_TM_OS = 2, + XIVE_DUMP_TM_USER = 3, + XIVE_DUMP_VP = 4, + XIVE_DUMP_EMU_STATE = 5, +}; + #endif /* __ASSEMBLY__ */ #endif /* __OPAL_API_H */ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/opal.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/opal.h @@ -29,6 +29,11 @@ /* API functions */ int64_t opal_invalid_call(void); +int64_t opal_npu_destroy_context(uint64_t phb_id, uint64_t pid, uint64_t bdf); +int64_t opal_npu_init_context(uint64_t phb_id, int pasid, uint64_t msr, + uint64_t bdf); +int64_t opal_npu_map_lpar(uint64_t phb_id, uint64_t bdf, uint64_t lparid, + uint64_t lpcr); int64_t opal_console_write(int64_t term_number, __be64 *length, const uint8_t *buffer); int64_t opal_console_read(int64_t term_number, __be64 *length, @@ -226,6 +231,42 @@ uint32_t pe_num, uint32_t tce_size, uint64_t dma_addr, uint32_t npages); int64_t opal_nmmu_set_ptcr(uint64_t chip_id, uint64_t ptcr); +int64_t opal_xive_reset(uint64_t version); +int64_t opal_xive_get_irq_info(uint32_t girq, + __be64 *out_flags, + __be64 *out_eoi_page, + __be64 *out_trig_page, + __be32 *out_esb_shift, + __be32 *out_src_chip); +int64_t opal_xive_get_irq_config(uint32_t girq, __be64 *out_vp, + uint8_t *out_prio, __be32 *out_lirq); +int64_t opal_xive_set_irq_config(uint32_t girq, uint64_t vp, uint8_t prio, + uint32_t lirq); +int64_t opal_xive_get_queue_info(uint64_t vp, uint32_t prio, + __be64 *out_qpage, + __be64 *out_qsize, + __be64 *out_qeoi_page, + __be32 *out_escalate_irq, + __be64 *out_qflags); +int64_t opal_xive_set_queue_info(uint64_t vp, uint32_t prio, + uint64_t qpage, + uint64_t qsize, + uint64_t qflags); +int64_t opal_xive_donate_page(uint32_t chip_id, uint64_t addr); +int64_t opal_xive_alloc_vp_block(uint32_t alloc_order); +int64_t opal_xive_free_vp_block(uint64_t vp); +int64_t opal_xive_get_vp_info(uint64_t vp, + __be64 *out_flags, + __be64 *out_cam_value, + __be64 *out_report_cl_pair, + __be32 *out_chip_id); +int64_t opal_xive_set_vp_info(uint64_t vp, + uint64_t flags, + uint64_t report_cl_pair); +int64_t opal_xive_allocate_irq(uint32_t chip_id); +int64_t opal_xive_free_irq(uint32_t girq); +int64_t opal_xive_sync(uint32_t type, uint32_t id); +int64_t opal_xive_dump(uint32_t type, uint32_t id); /* Internal functions */ extern int early_init_dt_scan_opal(unsigned long node, const char *uname, --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/paca.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/paca.h @@ -44,6 +44,9 @@ #define get_lppaca() (get_paca()->lppaca_ptr) #define get_slb_shadow() (get_paca()->slb_shadow_ptr) +/* Maximum number of threads per core. */ +#define MAX_SMT 8 + struct task_struct; /* --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/powernv.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/powernv.h @@ -11,9 +11,31 @@ #define _ASM_POWERNV_H #ifdef CONFIG_PPC_POWERNV +#define NPU2_WRITE 1 extern void powernv_set_nmmu_ptcr(unsigned long ptcr); +extern struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv); +extern void pnv_npu2_destroy_context(struct npu_context *context, + struct pci_dev *gpdev); +extern int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, + int count); #else static inline void powernv_set_nmmu_ptcr(unsigned long ptcr) { } +static inline struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) { return ERR_PTR(-ENODEV); } +static inline void pnv_npu2_destroy_context(struct npu_context *context, + struct pci_dev *gpdev) { } + +static inline int pnv_npu2_handle_fault(struct npu_context *context, + uintptr_t *ea, unsigned long *flags, + unsigned long *status, int count) { + return -ENODEV; +} #endif #endif /* _ASM_POWERNV_H */ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/ppc-opcode.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/ppc-opcode.h @@ -161,6 +161,7 @@ #define PPC_INST_MFTMR 0x7c0002dc #define PPC_INST_MSGSND 0x7c00019c #define PPC_INST_MSGCLR 0x7c0001dc +#define PPC_INST_MSGSYNC 0x7c0006ec #define PPC_INST_MSGSNDP 0x7c00011c #define PPC_INST_MTTMR 0x7c0003dc #define PPC_INST_NOP 0x60000000 @@ -345,6 +346,7 @@ ___PPC_RB(b) | __PPC_EH(eh)) #define PPC_MSGSND(b) stringify_in_c(.long PPC_INST_MSGSND | \ ___PPC_RB(b)) +#define PPC_MSGSYNC stringify_in_c(.long PPC_INST_MSGSYNC) #define PPC_MSGCLR(b) stringify_in_c(.long PPC_INST_MSGCLR | \ ___PPC_RB(b)) #define PPC_MSGSNDP(b) stringify_in_c(.long PPC_INST_MSGSNDP | \ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/reg.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/reg.h @@ -310,6 +310,7 @@ #define SPRN_PMCR 0x374 /* Power Management Control Register */ /* HFSCR and FSCR bit numbers are the same */ +#define FSCR_SCV_LG 12 /* Enable System Call Vectored */ #define FSCR_MSGP_LG 10 /* Enable MSGP */ #define FSCR_TAR_LG 8 /* Enable Target Address Register */ #define FSCR_EBB_LG 7 /* Enable Event Based Branching */ @@ -320,6 +321,7 @@ #define FSCR_VECVSX_LG 1 /* Enable VMX/VSX */ #define FSCR_FP_LG 0 /* Enable Floating Point */ #define SPRN_FSCR 0x099 /* Facility Status & Control Register */ +#define FSCR_SCV __MASK(FSCR_SCV_LG) #define FSCR_TAR __MASK(FSCR_TAR_LG) #define FSCR_EBB __MASK(FSCR_EBB_LG) #define FSCR_DSCR __MASK(FSCR_DSCR_LG) --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/smp.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/smp.h @@ -30,6 +30,7 @@ #include extern int boot_cpuid; +extern int boot_hw_cpuid; extern int spinning_secondaries; extern void cpu_die(void); @@ -40,10 +41,11 @@ struct smp_ops_t { void (*message_pass)(int cpu, int msg); #ifdef CONFIG_PPC_SMP_MUXED_IPI - void (*cause_ipi)(int cpu, unsigned long data); + void (*cause_ipi)(int cpu); #endif void (*probe)(void); int (*kick_cpu)(int nr); + int (*prepare_cpu)(int nr); void (*setup_cpu)(int nr); void (*bringup_done)(void); void (*take_timebase)(void); @@ -61,7 +63,6 @@ DECLARE_PER_CPU(unsigned int, cpu_pvr); #ifdef CONFIG_HOTPLUG_CPU -extern void migrate_irqs(void); int generic_cpu_disable(void); void generic_cpu_die(unsigned int cpu); void generic_set_cpu_dead(unsigned int cpu); @@ -125,10 +126,10 @@ extern const char *smp_ipi_name[]; /* for irq controllers with only a single ipi */ -extern void smp_muxed_ipi_set_data(int cpu, unsigned long data); extern void smp_muxed_ipi_message_pass(int cpu, int msg); extern void smp_muxed_ipi_set_message(int cpu, int msg); extern irqreturn_t smp_ipi_demux(void); +extern irqreturn_t smp_ipi_demux_relaxed(void); void smp_init_pSeries(void); void smp_init_cell(void); --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/topology.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/topology.h @@ -44,8 +44,22 @@ extern int sysfs_add_device_to_node(struct device *dev, int nid); extern void sysfs_remove_device_from_node(struct device *dev, int nid); +static inline int early_cpu_to_node(int cpu) +{ + int nid; + + nid = numa_cpu_lookup_table[cpu]; + + /* + * Fall back to node 0 if nid is unset (it should be, except bugs). + * This allows callers to safely do NODE_DATA(early_cpu_to_node(cpu)). + */ + return (nid < 0) ? 0 : nid; +} #else +static inline int early_cpu_to_node(int cpu) { return 0; } + static inline void dump_numa_cpu_topology(void) {} static inline int sysfs_add_device_to_node(struct device *dev, int nid) --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/xics.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/xics.h @@ -57,7 +57,7 @@ void (*teardown_cpu)(void); void (*flush_ipi)(void); #ifdef CONFIG_SMP - void (*cause_ipi)(int cpu, unsigned long data); + void (*cause_ipi)(int cpu); irq_handler_t ipi_action; #endif }; --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/xive-regs.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/xive-regs.h @@ -0,0 +1,97 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _ASM_POWERPC_XIVE_REGS_H +#define _ASM_POWERPC_XIVE_REGS_H + +/* + * Thread Management (aka "TM") registers + */ + +/* TM register offsets */ +#define TM_QW0_USER 0x000 /* All rings */ +#define TM_QW1_OS 0x010 /* Ring 0..2 */ +#define TM_QW2_HV_POOL 0x020 /* Ring 0..1 */ +#define TM_QW3_HV_PHYS 0x030 /* Ring 0..1 */ + +/* Byte offsets inside a QW QW0 QW1 QW2 QW3 */ +#define TM_NSR 0x0 /* + + - + */ +#define TM_CPPR 0x1 /* - + - + */ +#define TM_IPB 0x2 /* - + + + */ +#define TM_LSMFB 0x3 /* - + + + */ +#define TM_ACK_CNT 0x4 /* - + - - */ +#define TM_INC 0x5 /* - + - + */ +#define TM_AGE 0x6 /* - + - + */ +#define TM_PIPR 0x7 /* - + - + */ + +#define TM_WORD0 0x0 +#define TM_WORD1 0x4 + +/* + * QW word 2 contains the valid bit at the top and other fields + * depending on the QW. + */ +#define TM_WORD2 0x8 +#define TM_QW0W2_VU PPC_BIT32(0) +#define TM_QW0W2_LOGIC_SERV PPC_BITMASK32(1,31) // XX 2,31 ? +#define TM_QW1W2_VO PPC_BIT32(0) +#define TM_QW1W2_OS_CAM PPC_BITMASK32(8,31) +#define TM_QW2W2_VP PPC_BIT32(0) +#define TM_QW2W2_POOL_CAM PPC_BITMASK32(8,31) +#define TM_QW3W2_VT PPC_BIT32(0) +#define TM_QW3W2_LP PPC_BIT32(6) +#define TM_QW3W2_LE PPC_BIT32(7) +#define TM_QW3W2_T PPC_BIT32(31) + +/* + * In addition to normal loads to "peek" and writes (only when invalid) + * using 4 and 8 bytes accesses, the above registers support these + * "special" byte operations: + * + * - Byte load from QW0[NSR] - User level NSR (EBB) + * - Byte store to QW0[NSR] - User level NSR (EBB) + * - Byte load/store to QW1[CPPR] and QW3[CPPR] - CPPR access + * - Byte load from QW3[TM_WORD2] - Read VT||00000||LP||LE on thrd 0 + * otherwise VT||0000000 + * - Byte store to QW3[TM_WORD2] - Set VT bit (and LP/LE if present) + * + * Then we have all these "special" CI ops at these offset that trigger + * all sorts of side effects: + */ +#define TM_SPC_ACK_EBB 0x800 /* Load8 ack EBB to reg*/ +#define TM_SPC_ACK_OS_REG 0x810 /* Load16 ack OS irq to reg */ +#define TM_SPC_PUSH_USR_CTX 0x808 /* Store32 Push/Validate user context */ +#define TM_SPC_PULL_USR_CTX 0x808 /* Load32 Pull/Invalidate user context */ +#define TM_SPC_SET_OS_PENDING 0x812 /* Store8 Set OS irq pending bit */ +#define TM_SPC_PULL_OS_CTX 0x818 /* Load32/Load64 Pull/Invalidate OS context to reg */ +#define TM_SPC_PULL_POOL_CTX 0x828 /* Load32/Load64 Pull/Invalidate Pool context to reg*/ +#define TM_SPC_ACK_HV_REG 0x830 /* Load16 ack HV irq to reg */ +#define TM_SPC_PULL_USR_CTX_OL 0xc08 /* Store8 Pull/Inval usr ctx to odd line */ +#define TM_SPC_ACK_OS_EL 0xc10 /* Store8 ack OS irq to even line */ +#define TM_SPC_ACK_HV_POOL_EL 0xc20 /* Store8 ack HV evt pool to even line */ +#define TM_SPC_ACK_HV_EL 0xc30 /* Store8 ack HV irq to even line */ +/* XXX more... */ + +/* NSR fields for the various QW ack types */ +#define TM_QW0_NSR_EB PPC_BIT8(0) +#define TM_QW1_NSR_EO PPC_BIT8(0) +#define TM_QW3_NSR_HE PPC_BITMASK8(0,1) +#define TM_QW3_NSR_HE_NONE 0 +#define TM_QW3_NSR_HE_POOL 1 +#define TM_QW3_NSR_HE_PHYS 2 +#define TM_QW3_NSR_HE_LSI 3 +#define TM_QW3_NSR_I PPC_BIT8(2) +#define TM_QW3_NSR_GRP_LVL PPC_BIT8(3,7) + +/* Utilities to manipulate these (originaly from OPAL) */ +#define MASK_TO_LSH(m) (__builtin_ffsl(m) - 1) +#define GETFIELD(m, v) (((v) & (m)) >> MASK_TO_LSH(m)) +#define SETFIELD(m, v, val) \ + (((v) & ~(m)) | ((((typeof(v))(val)) << MASK_TO_LSH(m)) & (m))) + +#endif /* _ASM_POWERPC_XIVE_REGS_H */ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/xive.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/xive.h @@ -0,0 +1,163 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef _ASM_POWERPC_XIVE_H +#define _ASM_POWERPC_XIVE_H + +#define XIVE_INVALID_VP 0xffffffff + +#ifdef CONFIG_PPC_XIVE + +/* + * Thread Interrupt Management Area (TIMA) + * + * This is a global MMIO region divided in 4 pages of varying access + * permissions, providing access to per-cpu interrupt management + * functions. It always identifies the CPU doing the access based + * on the PowerBus initiator ID, thus we always access via the + * same offset regardless of where the code is executing + */ +extern void __iomem *xive_tima; + +/* + * Offset in the TM area of our current execution level (provided by + * the backend) + */ +extern u32 xive_tima_offset; + +/* + * Per-irq data (irq_get_handler_data for normal IRQs), IPIs + * have it stored in the xive_cpu structure. We also cache + * for normal interrupts the current target CPU. + * + * This structure is setup by the backend for each interrupt. + */ +struct xive_irq_data { + u64 flags; + u64 eoi_page; + void __iomem *eoi_mmio; + u64 trig_page; + void __iomem *trig_mmio; + u32 esb_shift; + int src_chip; + + /* Setup/used by frontend */ + int target; + bool saved_p; +}; +#define XIVE_IRQ_FLAG_STORE_EOI 0x01 +#define XIVE_IRQ_FLAG_LSI 0x02 +#define XIVE_IRQ_FLAG_SHIFT_BUG 0x04 +#define XIVE_IRQ_FLAG_MASK_FW 0x08 +#define XIVE_IRQ_FLAG_EOI_FW 0x10 + +#define XIVE_INVALID_CHIP_ID -1 + +/* A queue tracking structure in a CPU */ +struct xive_q { + __be32 *qpage; + u32 msk; + u32 idx; + u32 toggle; + u64 eoi_phys; + u32 esc_irq; + atomic_t count; + atomic_t pending_count; +}; + +/* + * "magic" Event State Buffer (ESB) MMIO offsets. + * + * Each interrupt source has a 2-bit state machine called ESB + * which can be controlled by MMIO. It's made of 2 bits, P and + * Q. P indicates that an interrupt is pending (has been sent + * to a queue and is waiting for an EOI). Q indicates that the + * interrupt has been triggered while pending. + * + * This acts as a coalescing mechanism in order to guarantee + * that a given interrupt only occurs at most once in a queue. + * + * When doing an EOI, the Q bit will indicate if the interrupt + * needs to be re-triggered. + * + * The following offsets into the ESB MMIO allow to read or + * manipulate the PQ bits. They must be used with an 8-bytes + * load instruction. They all return the previous state of the + * interrupt (atomically). + * + * Additionally, some ESB pages support doing an EOI via a + * store at 0 and some ESBs support doing a trigger via a + * separate trigger page. + */ +#define XIVE_ESB_GET 0x800 +#define XIVE_ESB_SET_PQ_00 0xc00 +#define XIVE_ESB_SET_PQ_01 0xd00 +#define XIVE_ESB_SET_PQ_10 0xe00 +#define XIVE_ESB_SET_PQ_11 0xf00 +#define XIVE_ESB_MASK XIVE_ESB_SET_PQ_01 + +#define XIVE_ESB_VAL_P 0x2 +#define XIVE_ESB_VAL_Q 0x1 + +/* Global enable flags for the XIVE support */ +extern bool __xive_enabled; + +static inline bool xive_enabled(void) { return __xive_enabled; } + +extern bool xive_native_init(void); +extern void xive_smp_probe(void); +extern int xive_smp_prepare_cpu(unsigned int cpu); +extern void xive_smp_setup_cpu(void); +extern void xive_smp_disable_cpu(void); +extern void xive_kexec_teardown_cpu(int secondary); +extern void xive_shutdown(void); +extern void xive_flush_interrupt(void); + +/* xmon hook */ +extern void xmon_xive_do_dump(int cpu); + +/* APIs used by KVM */ +extern u32 xive_native_default_eq_shift(void); +extern u32 xive_native_alloc_vp_block(u32 max_vcpus); +extern void xive_native_free_vp_block(u32 vp_base); +extern int xive_native_populate_irq_data(u32 hw_irq, + struct xive_irq_data *data); +extern void xive_cleanup_irq_data(struct xive_irq_data *xd); +extern u32 xive_native_alloc_irq(void); +extern void xive_native_free_irq(u32 irq); +extern int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + +extern int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate); +extern void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio); + +extern bool __xive_irq_trigger(struct xive_irq_data *xd); +extern bool __xive_irq_retrigger(struct xive_irq_data *xd); +extern void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd); + +extern bool is_xive_irq(struct irq_chip *chip); + +#else + +static inline bool xive_enabled(void) { return false; } + +static inline bool xive_native_init(void) { return false; } +static inline void xive_smp_probe(void) { } +extern inline int xive_smp_prepare_cpu(unsigned int cpu) { return -EINVAL; } +static inline void xive_smp_setup_cpu(void) { } +static inline void xive_smp_disable_cpu(void) { } +static inline void xive_kexec_teardown_cpu(int secondary) { } +static inline void xive_shutdown(void) { } +static inline void xive_flush_interrupt(void) { } + +static inline u32 xive_native_alloc_vp_block(u32 max_vcpus) { return XIVE_INVALID_VP; } +static inline void xive_native_free_vp_block(u32 vp_base) { } + +#endif + +#endif /* _ASM_POWERPC_XIVE_H */ --- linux-azure-4.11.0.orig/arch/powerpc/include/asm/xmon.h +++ linux-azure-4.11.0/arch/powerpc/include/asm/xmon.h @@ -29,5 +29,7 @@ extern int cpus_are_in_xmon(void); #endif +extern void xmon_printf(const char *format, ...); + #endif /* __KERNEL __ */ #endif /* __ASM_POWERPC_XMON_H */ --- linux-azure-4.11.0.orig/arch/powerpc/kernel/dbell.c +++ linux-azure-4.11.0/arch/powerpc/kernel/dbell.c @@ -20,18 +20,60 @@ #include #ifdef CONFIG_SMP -void doorbell_setup_this_cpu(void) + +/* + * Doorbells must only be used if CPU_FTR_DBELL is available. + * msgsnd is used in HV, and msgsndp is used in !HV. + * + * These should be used by platform code that is aware of restrictions. + * Other arch code should use ->cause_ipi. + * + * doorbell_global_ipi() sends a dbell to any target CPU. + * Must be used only by architectures that address msgsnd target + * by PIR/get_hard_smp_processor_id. + */ +void doorbell_global_ipi(int cpu) { - unsigned long tag = mfspr(SPRN_DOORBELL_CPUTAG) & PPC_DBELL_TAG_MASK; + u32 tag = get_hard_smp_processor_id(cpu); - smp_muxed_ipi_set_data(smp_processor_id(), tag); + kvmppc_set_host_ipi(cpu, 1); + /* Order previous accesses vs. msgsnd, which is treated as a store */ + ppc_msgsnd_sync(); + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); } -void doorbell_cause_ipi(int cpu, unsigned long data) +/* + * doorbell_core_ipi() sends a dbell to a target CPU in the same core. + * Must be used only by architectures that address msgsnd target + * by TIR/cpu_thread_in_core. + */ +void doorbell_core_ipi(int cpu) { + u32 tag = cpu_thread_in_core(cpu); + + kvmppc_set_host_ipi(cpu, 1); /* Order previous accesses vs. msgsnd, which is treated as a store */ - mb(); - ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, data); + ppc_msgsnd_sync(); + ppc_msgsnd(PPC_DBELL_MSGTYPE, 0, tag); +} + +/* + * Attempt to cause a core doorbell if destination is on the same core. + * Returns 1 on success, 0 on failure. + */ +int doorbell_try_core_ipi(int cpu) +{ + int this_cpu = get_cpu(); + int ret = 0; + + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) { + doorbell_core_ipi(cpu); + ret = 1; + } + + put_cpu(); + + return ret; } void doorbell_exception(struct pt_regs *regs) @@ -40,12 +82,14 @@ irq_enter(); + ppc_msgsync(); + may_hard_irq_enable(); kvmppc_set_host_ipi(smp_processor_id(), 0); __this_cpu_inc(irq_stat.doorbell_irqs); - smp_ipi_demux(); + smp_ipi_demux_relaxed(); /* already performed the barrier */ irq_exit(); set_irq_regs(old_regs); --- linux-azure-4.11.0.orig/arch/powerpc/kernel/eeh_driver.c +++ linux-azure-4.11.0/arch/powerpc/kernel/eeh_driver.c @@ -724,7 +724,7 @@ */ #define MAX_WAIT_FOR_RECOVERY 300 -static void eeh_handle_normal_event(struct eeh_pe *pe) +static bool eeh_handle_normal_event(struct eeh_pe *pe) { struct pci_bus *frozen_bus; struct eeh_dev *edev, *tmp; @@ -736,7 +736,7 @@ if (!frozen_bus) { pr_err("%s: Cannot find PCI bus for PHB#%x-PE#%x\n", __func__, pe->phb->global_number, pe->addr); - return; + return false; } eeh_pe_update_time_stamp(pe); @@ -870,7 +870,7 @@ pr_info("EEH: Notify device driver to resume\n"); eeh_pe_dev_traverse(pe, eeh_report_resume, NULL); - return; + return false; excess_failures: /* @@ -915,8 +915,12 @@ pci_lock_rescan_remove(); pci_hp_remove_devices(frozen_bus); pci_unlock_rescan_remove(); + + /* The passed PE should no longer be used */ + return true; } } + return false; } static void eeh_handle_special_event(void) @@ -982,7 +986,14 @@ */ if (rc == EEH_NEXT_ERR_FROZEN_PE || rc == EEH_NEXT_ERR_FENCED_PHB) { - eeh_handle_normal_event(pe); + /* + * eeh_handle_normal_event() can make the PE stale if it + * determines that the PE cannot possibly be recovered. + * Don't modify the PE state if that's the case. + */ + if (eeh_handle_normal_event(pe)) + continue; + eeh_pe_state_clear(pe, EEH_PE_RECOVERING); } else { pci_lock_rescan_remove(); --- linux-azure-4.11.0.orig/arch/powerpc/kernel/exceptions-64e.S +++ linux-azure-4.11.0/arch/powerpc/kernel/exceptions-64e.S @@ -735,8 +735,14 @@ andis. r15,r14,(DBSR_IC|DBSR_BT)@h beq+ 1f +#ifdef CONFIG_RELOCATABLE + ld r15,PACATOC(r13) + ld r14,interrupt_base_book3e@got(r15) + ld r15,__end_interrupts@got(r15) +#else LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) LOAD_REG_IMMEDIATE(r15,__end_interrupts) +#endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 blt+ cr0,1f @@ -799,8 +805,14 @@ andis. r15,r14,(DBSR_IC|DBSR_BT)@h beq+ 1f +#ifdef CONFIG_RELOCATABLE + ld r15,PACATOC(r13) + ld r14,interrupt_base_book3e@got(r15) + ld r15,__end_interrupts@got(r15) +#else LOAD_REG_IMMEDIATE(r14,interrupt_base_book3e) LOAD_REG_IMMEDIATE(r15,__end_interrupts) +#endif cmpld cr0,r10,r14 cmpld cr1,r10,r15 blt+ cr0,1f --- linux-azure-4.11.0.orig/arch/powerpc/kernel/exceptions-64s.S +++ linux-azure-4.11.0/arch/powerpc/kernel/exceptions-64s.S @@ -99,7 +99,9 @@ #ifdef CONFIG_PPC_P7_NAP /* * If running native on arch 2.06 or later, check if we are waking up - * from nap/sleep/winkle, and branch to idle handler. + * from nap/sleep/winkle, and branch to idle handler. This tests + * SRR1 bits 46:47. A non-0 value indicates that we are coming from + * a power saving state. */ #define IDLETEST(n) \ BEGIN_FTR_SECTION ; \ @@ -1425,10 +1427,8 @@ .balign IFETCH_ALIGN_BYTES do_hash_page: #ifdef CONFIG_PPC_STD_MMU_64 - andis. r0,r4,0xa410 /* weird error? */ + andis. r0,r4,0xa450 /* weird error? */ bne- handle_page_fault /* if not, try to insert a HPTE */ - andis. r0,r4,DSISR_DABRMATCH@h - bne- handle_dabr_fault CURRENT_THREAD_INFO(r11, r1) lwz r0,TI_PREEMPT(r11) /* If we're in an "NMI" */ andis. r0,r0,NMI_MASK@h /* (i.e. an irq when soft-disabled) */ @@ -1452,11 +1452,16 @@ /* Error */ blt- 13f + + /* Reload DSISR into r4 for the DABR check below */ + ld r4,_DSISR(r1) #endif /* CONFIG_PPC_STD_MMU_64 */ /* Here we have a page fault that hash_page can't handle. */ handle_page_fault: -11: ld r4,_DAR(r1) +11: andis. r0,r4,DSISR_DABRMATCH@h + bne- handle_dabr_fault + ld r4,_DAR(r1) ld r5,_DSISR(r1) addi r3,r1,STACK_FRAME_OVERHEAD bl do_page_fault --- linux-azure-4.11.0.orig/arch/powerpc/kernel/fadump.c +++ linux-azure-4.11.0/arch/powerpc/kernel/fadump.c @@ -216,11 +216,25 @@ * Check if the size is specified through fadump_reserve_mem= cmdline * option. If yes, then use that. */ - if (fw_dump.reserve_bootvar) + if (fw_dump.reserve_bootvar) { + unsigned long max_size; + + /* + * Adjust if the boot memory size specified is above + * the upper limit. + */ + max_size = memblock_phys_mem_size() / MAX_BOOT_MEM_RATIO; + if (fw_dump.reserve_bootvar > max_size) { + fw_dump.reserve_bootvar = max_size; + pr_info("Adjusted boot memory size to %luMB\n", + (fw_dump.reserve_bootvar >> 20)); + } + return fw_dump.reserve_bootvar; + } /* divide by 20 to get 5% of value */ - size = memblock_end_of_DRAM() / 20; + size = memblock_phys_mem_size() / 20; /* round it down in multiples of 256 */ size = size & ~0x0FFFFFFFUL; --- linux-azure-4.11.0.orig/arch/powerpc/kernel/idle_book3s.S +++ linux-azure-4.11.0/arch/powerpc/kernel/idle_book3s.S @@ -387,6 +387,13 @@ BEGIN_FTR_SECTION ld r2,PACATOC(r13); /* + * Workaround for POWER9, if we lost resources, the ERAT + * might have been mixed up and needs flushing. + */ + blt cr3,1f + PPC_INVALIDATE_ERAT +1: + /* * POWER ISA 3. Use PSSCR to determine if we * are waking up from deep idle state */ --- linux-azure-4.11.0.orig/arch/powerpc/kernel/iommu.c +++ linux-azure-4.11.0/arch/powerpc/kernel/iommu.c @@ -711,13 +711,16 @@ return tbl; } -void iommu_free_table(struct iommu_table *tbl, const char *node_name) +static void iommu_table_free(struct kref *kref) { unsigned long bitmap_sz; unsigned int order; + struct iommu_table *tbl; - if (!tbl) - return; + tbl = container_of(kref, struct iommu_table, it_kref); + + if (tbl->it_ops->free) + tbl->it_ops->free(tbl); if (!tbl->it_map) { kfree(tbl); @@ -733,7 +736,7 @@ /* verify that table contains no entries */ if (!bitmap_empty(tbl->it_map, tbl->it_size)) - pr_warn("%s: Unexpected TCEs for %s\n", __func__, node_name); + pr_warn("%s: Unexpected TCEs\n", __func__); /* calculate bitmap size in bytes */ bitmap_sz = BITS_TO_LONGS(tbl->it_size) * sizeof(unsigned long); @@ -746,6 +749,24 @@ kfree(tbl); } +struct iommu_table *iommu_tce_table_get(struct iommu_table *tbl) +{ + if (kref_get_unless_zero(&tbl->it_kref)) + return tbl; + + return NULL; +} +EXPORT_SYMBOL_GPL(iommu_tce_table_get); + +int iommu_tce_table_put(struct iommu_table *tbl) +{ + if (WARN_ON(!tbl)) + return 0; + + return kref_put(&tbl->it_kref, iommu_table_free); +} +EXPORT_SYMBOL_GPL(iommu_tce_table_put); + /* Creates TCEs for a user provided buffer. The user buffer must be * contiguous real kernel storage (not vmalloc). The address passed here * comprises a page address and offset into that page. The dma_addr_t @@ -1004,6 +1025,31 @@ } EXPORT_SYMBOL_GPL(iommu_tce_xchg); +#ifdef CONFIG_PPC_BOOK3S_64 +long iommu_tce_xchg_rm(struct iommu_table *tbl, unsigned long entry, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret; + + ret = tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + + if (!ret && ((*direction == DMA_FROM_DEVICE) || + (*direction == DMA_BIDIRECTIONAL))) { + struct page *pg = realmode_pfn_to_page(*hpa >> PAGE_SHIFT); + + if (likely(pg)) { + SetPageDirty(pg); + } else { + tbl->it_ops->exchange_rm(tbl, entry, hpa, direction); + ret = -EFAULT; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(iommu_tce_xchg_rm); +#endif + int iommu_take_ownership(struct iommu_table *tbl) { unsigned long flags, i, sz = (tbl->it_size + 7) >> 3; --- linux-azure-4.11.0.orig/arch/powerpc/kernel/irq.c +++ linux-azure-4.11.0/arch/powerpc/kernel/irq.c @@ -442,46 +442,6 @@ return sum; } -#ifdef CONFIG_HOTPLUG_CPU -void migrate_irqs(void) -{ - struct irq_desc *desc; - unsigned int irq; - static int warned; - cpumask_var_t mask; - const struct cpumask *map = cpu_online_mask; - - alloc_cpumask_var(&mask, GFP_KERNEL); - - for_each_irq_desc(irq, desc) { - struct irq_data *data; - struct irq_chip *chip; - - data = irq_desc_get_irq_data(desc); - if (irqd_is_per_cpu(data)) - continue; - - chip = irq_data_get_irq_chip(data); - - cpumask_and(mask, irq_data_get_affinity_mask(data), map); - if (cpumask_any(mask) >= nr_cpu_ids) { - pr_warn("Breaking affinity for irq %i\n", irq); - cpumask_copy(mask, map); - } - if (chip->irq_set_affinity) - chip->irq_set_affinity(data, mask, true); - else if (desc->action && !(warned++)) - pr_err("Cannot set affinity for irq %i\n", irq); - } - - free_cpumask_var(mask); - - local_irq_enable(); - mdelay(1); - local_irq_disable(); -} -#endif - static inline void check_stack_overflow(void) { #ifdef CONFIG_DEBUG_STACKOVERFLOW --- linux-azure-4.11.0.orig/arch/powerpc/kernel/kprobes.c +++ linux-azure-4.11.0/arch/powerpc/kernel/kprobes.c @@ -495,6 +495,15 @@ regs->gpr[2] = (unsigned long)(((func_descr_t *)jp->entry)->toc); #endif + /* + * jprobes use jprobe_return() which skips the normal return + * path of the function, and this messes up the accounting of the + * function graph tracer. + * + * Pause function graph tracing while performing the jprobe function. + */ + pause_graph_tracing(); + return 1; } @@ -517,6 +526,8 @@ * saved regs... */ memcpy(regs, &kcb->jprobe_saved_regs, sizeof(struct pt_regs)); + /* It's OK to start function graph tracing again */ + unpause_graph_tracing(); preempt_enable_no_resched(); return 1; } --- linux-azure-4.11.0.orig/arch/powerpc/kernel/mce.c +++ linux-azure-4.11.0/arch/powerpc/kernel/mce.c @@ -221,6 +221,8 @@ { int index; + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + /* * For now just print it to console. * TODO: log this error event to FSP or nvram. --- linux-azure-4.11.0.orig/arch/powerpc/kernel/misc_64.S +++ linux-azure-4.11.0/arch/powerpc/kernel/misc_64.S @@ -614,6 +614,18 @@ li r0,0 std r0,16(r1) +BEGIN_FTR_SECTION + /* + * This is the best time to turn AMR/IAMR off. + * key 0 is used in radix for supervisor<->user + * protection, but on hash key 0 is reserved + * ideally we want to enter with a clean state. + * NOTE, we rely on r0 being 0 from above. + */ + mtspr SPRN_IAMR,r0 + mtspr SPRN_AMOR,r0 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) + /* save regs for local vars on new stack. * yes, we won't go back, but ... */ --- linux-azure-4.11.0.orig/arch/powerpc/kernel/nvram_64.c +++ linux-azure-4.11.0/arch/powerpc/kernel/nvram_64.c @@ -561,6 +561,7 @@ static struct pstore_info nvram_pstore_info = { .owner = THIS_MODULE, .name = "nvram", + .flags = PSTORE_FLAGS_DMESG, .open = nvram_pstore_open, .read = nvram_pstore_read, .write = nvram_pstore_write, --- linux-azure-4.11.0.orig/arch/powerpc/kernel/paca.c +++ linux-azure-4.11.0/arch/powerpc/kernel/paca.c @@ -198,6 +198,7 @@ { u64 limit; int cpu; + int nr_cpus; limit = ppc64_rma_size; @@ -210,20 +211,32 @@ limit = min(0x10000000ULL, limit); #endif - paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpu_ids); + /* + * Always align up the nr_cpu_ids to SMT threads and allocate + * the paca. This will help us to prepare for a situation where + * boot cpu id > nr_cpus_id. We will use the last nthreads + * slots (nthreads == threads per core) to accommodate a core + * that contains boot cpu thread. + * + * Do not change nr_cpu_ids value here. Let us do that in + * early_init_dt_scan_cpus() where we know exact value + * of threads per core. + */ + nr_cpus = _ALIGN_UP(nr_cpu_ids, MAX_SMT); + paca_size = PAGE_ALIGN(sizeof(struct paca_struct) * nr_cpus); paca = __va(memblock_alloc_base(paca_size, PAGE_SIZE, limit)); memset(paca, 0, paca_size); printk(KERN_DEBUG "Allocated %u bytes for %d pacas at %p\n", - paca_size, nr_cpu_ids, paca); + paca_size, nr_cpus, paca); - allocate_lppacas(nr_cpu_ids, limit); + allocate_lppacas(nr_cpus, limit); - allocate_slb_shadows(nr_cpu_ids, limit); + allocate_slb_shadows(nr_cpus, limit); /* Can't use for_each_*_cpu, as they aren't functional yet */ - for (cpu = 0; cpu < nr_cpu_ids; cpu++) + for (cpu = 0; cpu < nr_cpus; cpu++) initialise_paca(&paca[cpu], cpu); } --- linux-azure-4.11.0.orig/arch/powerpc/kernel/pci-common.c +++ linux-azure-4.11.0/arch/powerpc/kernel/pci-common.c @@ -323,6 +323,7 @@ } return NULL; } +EXPORT_SYMBOL(pci_find_hose_for_OF_device); /* * Reads the interrupt pin to determine if interrupt is use by card. @@ -1640,6 +1641,7 @@ { return pci_bus_find_capability(fake_pci_bus(hose, bus), devfn, cap); } +EXPORT_SYMBOL_GPL(early_find_capability); struct device_node *pcibios_get_phb_of_node(struct pci_bus *bus) { --- linux-azure-4.11.0.orig/arch/powerpc/kernel/process.c +++ linux-azure-4.11.0/arch/powerpc/kernel/process.c @@ -864,6 +864,25 @@ if (!MSR_TM_SUSPENDED(mfmsr())) return; + /* + * If we are in a transaction and FP is off then we can't have + * used FP inside that transaction. Hence the checkpointed + * state is the same as the live state. We need to copy the + * live state to the checkpointed state so that when the + * transaction is restored, the checkpointed state is correct + * and the aborted transaction sees the correct state. We use + * ckpt_regs.msr here as that's what tm_reclaim will use to + * determine if it's going to write the checkpointed state or + * not. So either this will write the checkpointed registers, + * or reclaim will. Similarly for VMX. + */ + if ((thr->ckpt_regs.msr & MSR_FP) == 0) + memcpy(&thr->ckfp_state, &thr->fp_state, + sizeof(struct thread_fp_state)); + if ((thr->ckpt_regs.msr & MSR_VEC) == 0) + memcpy(&thr->ckvr_state, &thr->vr_state, + sizeof(struct thread_vr_state)); + giveup_all(container_of(thr, struct task_struct, thread)); tm_reclaim(thr, thr->ckpt_regs.msr, cause); @@ -1647,6 +1666,7 @@ #ifdef CONFIG_VSX current->thread.used_vsr = 0; #endif + current->thread.load_fp = 0; memset(¤t->thread.fp_state, 0, sizeof(current->thread.fp_state)); current->thread.fp_save_area = NULL; #ifdef CONFIG_ALTIVEC @@ -1655,6 +1675,7 @@ current->thread.vr_save_area = NULL; current->thread.vrsave = 0; current->thread.used_vr = 0; + current->thread.load_vec = 0; #endif /* CONFIG_ALTIVEC */ #ifdef CONFIG_SPE memset(current->thread.evr, 0, sizeof(current->thread.evr)); @@ -1666,6 +1687,7 @@ current->thread.tm_tfhar = 0; current->thread.tm_texasr = 0; current->thread.tm_tfiar = 0; + current->thread.load_tm = 0; #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ } EXPORT_SYMBOL(start_thread); --- linux-azure-4.11.0.orig/arch/powerpc/kernel/prom.c +++ linux-azure-4.11.0/arch/powerpc/kernel/prom.c @@ -161,7 +161,9 @@ { .pabyte = 0, .pabit = 3, .cpu_features = CPU_FTR_CTRL }, { .pabyte = 0, .pabit = 6, .cpu_features = CPU_FTR_NOEXECUTE }, { .pabyte = 1, .pabit = 2, .mmu_features = MMU_FTR_CI_LARGE_PAGE }, +#ifdef CONFIG_PPC_RADIX_MMU { .pabyte = 40, .pabit = 0, .mmu_features = MMU_FTR_TYPE_RADIX }, +#endif { .pabyte = 1, .pabit = 1, .invert = 1, .cpu_features = CPU_FTR_NODSISRALIGN }, { .pabyte = 5, .pabit = 0, .cpu_features = CPU_FTR_REAL_LE, .cpu_user_ftrs = PPC_FEATURE_TRUE_LE }, @@ -300,6 +302,29 @@ } } +/* + * Adjust the logical id of a boot cpu to fall under nr_cpu_ids. Map it to + * last core slot in the allocated paca array. + * + * e.g. on SMT=8 system, kernel booted with nr_cpus=1 and boot cpu = 33, + * align nr_cpu_ids to MAX_SMT value 8. Allocate paca array to hold up-to + * MAX_SMT=8 cpus. Since boot cpu 33 is greater than nr_cpus (8), adjust + * its logical id so that new id becomes less than nr_cpu_ids. Make sure + * that boot cpu's new logical id is aligned to its thread id and falls + * under last nthreads slots available in paca array. In this case the + * boot cpu 33 is adjusted to new boot cpu id 1. + * + */ +static inline void adjust_boot_cpuid(int nthreads, int phys_id) +{ + boot_hw_cpuid = phys_id; + if (boot_cpuid >= nr_cpu_ids) { + boot_cpuid = (boot_cpuid % nthreads) + (nr_cpu_ids - nthreads); + pr_info("Adjusted logical boot cpu id: logical %d physical %d\n", + boot_cpuid, phys_id); + } +} + static int __init early_init_dt_scan_cpus(unsigned long node, const char *uname, int depth, void *data) @@ -323,6 +348,18 @@ nthreads = len / sizeof(int); +#ifdef CONFIG_SMP + /* + * Now that we know threads per core lets align nr_cpu_ids to + * correct SMT value. + */ + if (nr_cpu_ids % nthreads) { + nr_cpu_ids = _ALIGN_UP(nr_cpu_ids, nthreads); + pr_info("Aligned nr_cpus to SMT=%d, nr_cpu_ids = %d\n", + nthreads, nr_cpu_ids); + } +#endif + /* * Now see if any of these threads match our boot cpu. * NOTE: This must match the parsing done in smp_setup_cpu_maps. @@ -361,7 +398,9 @@ DBG("boot cpu: logical %d physical %d\n", found, be32_to_cpu(intserv[found_thread])); boot_cpuid = found; - set_hard_smp_processor_id(found, be32_to_cpu(intserv[found_thread])); + adjust_boot_cpuid(nthreads, be32_to_cpu(intserv[found_thread])); + set_hard_smp_processor_id(boot_cpuid, + be32_to_cpu(intserv[found_thread])); /* * PAPR defines "logical" PVR values for cpus that --- linux-azure-4.11.0.orig/arch/powerpc/kernel/prom_init.c +++ linux-azure-4.11.0/arch/powerpc/kernel/prom_init.c @@ -815,7 +815,7 @@ .virt_base = cpu_to_be32(0xffffffff), .virt_size = cpu_to_be32(0xffffffff), .load_base = cpu_to_be32(0xffffffff), - .min_rma = cpu_to_be32(256), /* 256MB min RMA */ + .min_rma = cpu_to_be32(512), /* 512MB min RMA */ .min_load = cpu_to_be32(0xffffffff), /* full client load */ .min_rma_percent = 0, /* min RMA percentage of total RAM */ .max_pft_size = 48, /* max log_2(hash table size) */ --- linux-azure-4.11.0.orig/arch/powerpc/kernel/setup-common.c +++ linux-azure-4.11.0/arch/powerpc/kernel/setup-common.c @@ -85,6 +85,7 @@ EXPORT_SYMBOL(machine_id); int boot_cpuid = -1; +int boot_hw_cpuid = -1; EXPORT_SYMBOL_GPL(boot_cpuid); /* @@ -330,6 +331,10 @@ maj = ((pvr >> 8) & 0xFF) - 1; min = pvr & 0xFF; break; + case 0x004e: /* POWER9 bits 12-15 give chip type */ + maj = (pvr >> 8) & 0x0F; + min = pvr & 0xFF; + break; default: maj = (pvr >> 8) & 0xFF; min = pvr & 0xFF; @@ -464,6 +469,7 @@ struct device_node *dn = NULL; int cpu = 0; int nthreads = 1; + bool boot_cpu_added = false; DBG("smp_setup_cpu_maps()\n"); @@ -490,6 +496,24 @@ } nthreads = len / sizeof(int); + /* + * If boot cpu hasn't been added to paca and there are only + * last nthreads slots available in paca array then wait + * for boot cpu to show up. + */ + if (!boot_cpu_added && (cpu + nthreads) >= nr_cpu_ids) { + int found = 0; + + DBG("Holding last nthreads paca slots for boot cpu\n"); + for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { + if (boot_hw_cpuid == be32_to_cpu(intserv[j])) { + found = 1; + break; + } + } + if (!found) + continue; + } for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) { bool avail; @@ -505,6 +529,11 @@ set_cpu_present(cpu, avail); set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j])); set_cpu_possible(cpu, true); + if (boot_hw_cpuid == be32_to_cpu(intserv[j])) { + DBG("Boot cpu %d (hard id %d) added to paca\n", + cpu, be32_to_cpu(intserv[j])); + boot_cpu_added = true; + } cpu++; } } --- linux-azure-4.11.0.orig/arch/powerpc/kernel/setup_64.c +++ linux-azure-4.11.0/arch/powerpc/kernel/setup_64.c @@ -650,7 +650,7 @@ static void * __init pcpu_fc_alloc(unsigned int cpu, size_t size, size_t align) { - return __alloc_bootmem_node(NODE_DATA(cpu_to_node(cpu)), size, align, + return __alloc_bootmem_node(NODE_DATA(early_cpu_to_node(cpu)), size, align, __pa(MAX_DMA_ADDRESS)); } @@ -661,7 +661,7 @@ static int pcpu_cpu_distance(unsigned int from, unsigned int to) { - if (cpu_to_node(from) == cpu_to_node(to)) + if (early_cpu_to_node(from) == early_cpu_to_node(to)) return LOCAL_DISTANCE; else return REMOTE_DISTANCE; --- linux-azure-4.11.0.orig/arch/powerpc/kernel/smp.c +++ linux-azure-4.11.0/arch/powerpc/kernel/smp.c @@ -39,6 +39,7 @@ #include #include #include +#include #include #include #include @@ -211,17 +212,9 @@ #ifdef CONFIG_PPC_SMP_MUXED_IPI struct cpu_messages { long messages; /* current messages */ - unsigned long data; /* data for cause ipi */ }; static DEFINE_PER_CPU_SHARED_ALIGNED(struct cpu_messages, ipi_message); -void smp_muxed_ipi_set_data(int cpu, unsigned long data) -{ - struct cpu_messages *info = &per_cpu(ipi_message, cpu); - - info->data = data; -} - void smp_muxed_ipi_set_message(int cpu, int msg) { struct cpu_messages *info = &per_cpu(ipi_message, cpu); @@ -236,14 +229,13 @@ void smp_muxed_ipi_message_pass(int cpu, int msg) { - struct cpu_messages *info = &per_cpu(ipi_message, cpu); - smp_muxed_ipi_set_message(cpu, msg); + /* * cause_ipi functions are required to include a full barrier * before doing whatever causes the IPI. */ - smp_ops->cause_ipi(cpu, info->data); + smp_ops->cause_ipi(cpu); } #ifdef __BIG_ENDIAN__ @@ -254,11 +246,18 @@ irqreturn_t smp_ipi_demux(void) { - struct cpu_messages *info = this_cpu_ptr(&ipi_message); - unsigned long all; - mb(); /* order any irq clear */ + return smp_ipi_demux_relaxed(); +} + +/* sync-free variant. Callers should ensure synchronization */ +irqreturn_t smp_ipi_demux_relaxed(void) +{ + struct cpu_messages *info; + unsigned long all; + + info = this_cpu_ptr(&ipi_message); do { all = xchg(&info->messages, 0); #if defined(CONFIG_KVM_XICS) && defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) @@ -439,7 +438,14 @@ #ifdef CONFIG_PPC64 vdso_data->processorCount--; #endif - migrate_irqs(); + /* Update affinity of all IRQs previously aimed at this CPU */ + irq_migrate_all_off_this_cpu(); + + /* Give the CPU time to drain in-flight ones */ + local_irq_enable(); + mdelay(1); + local_irq_disable(); + return 0; } @@ -521,6 +527,16 @@ cpu_idle_thread_init(cpu, tidle); + /* + * The platform might need to allocate resources prior to bringing + * up the CPU + */ + if (smp_ops->prepare_cpu) { + rc = smp_ops->prepare_cpu(cpu); + if (rc) + return rc; + } + /* Make sure callin-map entry is 0 (can be leftover a CPU * hotplug */ --- linux-azure-4.11.0.orig/arch/powerpc/kernel/sysfs.c +++ linux-azure-4.11.0/arch/powerpc/kernel/sysfs.c @@ -710,6 +710,10 @@ struct device_attribute *attrs, *pmc_attrs; int i, nattrs; + /* For cpus present at boot a reference was already grabbed in register_cpu() */ + if (!s->of_node) + s->of_node = of_get_cpu_node(cpu, NULL); + #ifdef CONFIG_PPC64 if (cpu_has_feature(CPU_FTR_SMT)) device_create_file(s, &dev_attr_smt_snooze_delay); @@ -864,6 +868,8 @@ } #endif cacheinfo_cpu_offline(cpu); + of_node_put(s->of_node); + s->of_node = NULL; #endif /* CONFIG_HOTPLUG_CPU */ return 0; } --- linux-azure-4.11.0.orig/arch/powerpc/kernel/traps.c +++ linux-azure-4.11.0/arch/powerpc/kernel/traps.c @@ -306,8 +306,6 @@ __this_cpu_inc(irq_stat.mce_exceptions); - add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); - if (cur_cpu_spec && cur_cpu_spec->machine_check_early) handled = cur_cpu_spec->machine_check_early(regs); return handled; @@ -741,6 +739,8 @@ __this_cpu_inc(irq_stat.mce_exceptions); + add_taint(TAINT_MACHINE_CHECK, LOCKDEP_NOW_UNRELIABLE); + /* See if any machine dependent calls. In theory, we would want * to call the CPU first, and call the ppc_md. one if the CPU * one returns a positive number. However there is existing code @@ -1440,6 +1440,8 @@ [FSCR_TM_LG] = "TM", [FSCR_EBB_LG] = "EBB", [FSCR_TAR_LG] = "TAR", + [FSCR_MSGP_LG] = "MSGP", + [FSCR_SCV_LG] = "SCV", }; char *facility = "unknown"; u64 value; --- linux-azure-4.11.0.orig/arch/powerpc/kvm/book3s_64_vio.c +++ linux-azure-4.11.0/arch/powerpc/kvm/book3s_64_vio.c @@ -28,6 +28,8 @@ #include #include #include +#include +#include #include #include @@ -40,6 +42,7 @@ #include #include #include +#include static unsigned long kvmppc_tce_pages(unsigned long iommu_pages) { @@ -91,6 +94,137 @@ return ret; } +static void kvm_spapr_tce_iommu_table_free(struct rcu_head *head) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(head, + struct kvmppc_spapr_tce_iommu_table, rcu); + + iommu_tce_table_put(stit->tbl); + + kfree(stit); +} + +static void kvm_spapr_tce_liobn_put(struct kref *kref) +{ + struct kvmppc_spapr_tce_iommu_table *stit = container_of(kref, + struct kvmppc_spapr_tce_iommu_table, kref); + + list_del_rcu(&stit->next); + + call_rcu(&stit->rcu, kvm_spapr_tce_iommu_table_free); +} + +extern void kvm_spapr_tce_release_iommu_group(struct kvm *kvm, + struct iommu_group *grp) +{ + int i; + struct kvmppc_spapr_tce_table *stt; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; + struct iommu_table_group *table_group = NULL; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + continue; + + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + if (table_group->tables[i] != stit->tbl) + continue; + + kref_put(&stit->kref, kvm_spapr_tce_liobn_put); + return; + } + } + } +} + +extern long kvm_spapr_tce_attach_iommu_group(struct kvm *kvm, int tablefd, + struct iommu_group *grp) +{ + struct kvmppc_spapr_tce_table *stt = NULL; + bool found = false; + struct iommu_table *tbl = NULL; + struct iommu_table_group *table_group; + long i; + struct kvmppc_spapr_tce_iommu_table *stit; + struct fd f; + + f = fdget(tablefd); + if (!f.file) + return -EBADF; + + list_for_each_entry_rcu(stt, &kvm->arch.spapr_tce_tables, list) { + if (stt == f.file->private_data) { + found = true; + break; + } + } + + fdput(f); + + if (!found) + return -EINVAL; + + table_group = iommu_group_get_iommudata(grp); + if (WARN_ON(!table_group)) + return -EFAULT; + + for (i = 0; i < IOMMU_TABLE_GROUP_MAX_TABLES; ++i) { + struct iommu_table *tbltmp = table_group->tables[i]; + + if (!tbltmp) + continue; + /* + * Make sure hardware table parameters are exactly the same; + * this is used in the TCE handlers where boundary checks + * use only the first attached table. + */ + if ((tbltmp->it_page_shift == stt->page_shift) && + (tbltmp->it_offset == stt->offset) && + (tbltmp->it_size == stt->size)) { + /* + * Reference the table to avoid races with + * add/remove DMA windows. + */ + tbl = iommu_tce_table_get(tbltmp); + break; + } + } + if (!tbl) + return -EINVAL; + + list_for_each_entry_rcu(stit, &stt->iommu_tables, next) { + if (tbl != stit->tbl) + continue; + + if (!kref_get_unless_zero(&stit->kref)) { + /* stit is being destroyed */ + iommu_tce_table_put(tbl); + return -ENOTTY; + } + /* + * The table is already known to this KVM, we just increased + * its KVM reference counter and can return. + */ + return 0; + } + + stit = kzalloc(sizeof(*stit), GFP_KERNEL); + if (!stit) { + iommu_tce_table_put(tbl); + return -ENOMEM; + } + + stit->tbl = tbl; + kref_init(&stit->kref); + + list_add_rcu(&stit->next, &stt->iommu_tables); + + return 0; +} + static void release_spapr_tce_table(struct rcu_head *head) { struct kvmppc_spapr_tce_table *stt = container_of(head, @@ -130,9 +264,18 @@ static int kvm_spapr_tce_release(struct inode *inode, struct file *filp) { struct kvmppc_spapr_tce_table *stt = filp->private_data; + struct kvmppc_spapr_tce_iommu_table *stit, *tmp; list_del_rcu(&stt->list); + list_for_each_entry_safe(stit, tmp, &stt->iommu_tables, next) { + WARN_ON(!kref_read(&stit->kref)); + while (1) { + if (kref_put(&stit->kref, kvm_spapr_tce_liobn_put)) + break; + } + } + kvm_put_kvm(stt->kvm); kvmppc_account_memlimit( @@ -183,6 +326,7 @@ stt->offset = args->offset; stt->size = size; stt->kvm = kvm; + INIT_LIST_HEAD_RCU(&stt->iommu_tables); for (i = 0; i < npages; i++) { stt->pages[i] = alloc_page(GFP_KERNEL | __GFP_ZERO); @@ -211,15 +355,106 @@ return ret; } +static void kvmppc_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg(tbl, entry, &hpa, &dir); +} + +static long kvmppc_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (WARN_ON_ONCE(iommu_tce_xchg(tbl, entry, &hpa, &dir))) + return H_HARDWARE; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret != H_SUCCESS) + iommu_tce_xchg(tbl, entry, &hpa, &dir); + + return ret; +} + +long kvmppc_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa, *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + /* This only handles v2 IOMMU type, v1 is handled via ioctl() */ + return H_TOO_HARD; + + if (WARN_ON_ONCE(mm_iommu_ua_to_hpa(mem, ua, &hpa))) + return H_HARDWARE; + + if (mm_iommu_mapped_inc(mem)) + return H_CLOSED; + + ret = iommu_tce_xchg(tbl, entry, &hpa, &dir); + if (WARN_ON_ONCE(ret)) { + mm_iommu_mapped_dec(mem); + return H_HARDWARE; + } + + if (dir != DMA_NONE) + kvmppc_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); - long ret; + struct kvmppc_spapr_tce_table *stt; + long ret, idx; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -231,7 +466,35 @@ if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + } else { + idx = srcu_read_lock(&vcpu->kvm->srcu); + ret = kvmppc_tce_iommu_map(vcpu->kvm, stit->tbl, + entry, ua, dir); + srcu_read_unlock(&vcpu->kvm->srcu, idx); + } + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -246,8 +509,9 @@ unsigned long entry, ua = 0; u64 __user *tces; u64 tce; + struct kvmppc_spapr_tce_iommu_table *stit; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -284,6 +548,26 @@ if (ret != H_SUCCESS) goto unlock_exit; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } @@ -300,8 +584,9 @@ { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -313,6 +598,24 @@ if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE(1); + kvmppc_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); --- linux-azure-4.11.0.orig/arch/powerpc/kvm/book3s_64_vio_hv.c +++ linux-azure-4.11.0/arch/powerpc/kvm/book3s_64_vio_hv.c @@ -40,6 +40,31 @@ #include #include +#ifdef CONFIG_BUG + +#define WARN_ON_ONCE_RM(condition) ({ \ + static bool __section(.data.unlikely) __warned; \ + int __ret_warn_once = !!(condition); \ + \ + if (unlikely(__ret_warn_once && !__warned)) { \ + __warned = true; \ + pr_err("WARN_ON_ONCE_RM: (%s) at %s:%u\n", \ + __stringify(condition), \ + __func__, __LINE__); \ + dump_stack(); \ + } \ + unlikely(__ret_warn_once); \ +}) + +#else + +#define WARN_ON_ONCE_RM(condition) ({ \ + int __ret_warn_on = !!(condition); \ + unlikely(__ret_warn_on); \ +}) + +#endif + #define TCES_PER_PAGE (PAGE_SIZE / sizeof(u64)) /* @@ -48,10 +73,9 @@ * WARNING: This will be called in real or virtual mode on HV KVM and virtual * mode on PR KVM */ -struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm_vcpu *vcpu, +struct kvmppc_spapr_tce_table *kvmppc_find_table(struct kvm *kvm, unsigned long liobn) { - struct kvm *kvm = vcpu->kvm; struct kvmppc_spapr_tce_table *stt; list_for_each_entry_lockless(stt, &kvm->arch.spapr_tce_tables, list) @@ -179,15 +203,126 @@ EXPORT_SYMBOL_GPL(kvmppc_gpa_to_ua); #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE +static void kvmppc_rm_clear_tce(struct iommu_table *tbl, unsigned long entry) +{ + unsigned long hpa = 0; + enum dma_data_direction dir = DMA_NONE; + + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); +} + +static long kvmppc_rm_tce_iommu_mapped_dec(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + struct mm_iommu_table_group_mem_t *mem = NULL; + const unsigned long pgsize = 1ULL << tbl->it_page_shift; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + mem = mm_iommu_lookup_rm(kvm->mm, *pua, pgsize); + if (!mem) + return H_TOO_HARD; + + mm_iommu_mapped_dec(mem); + + *pua = 0; + + return H_SUCCESS; +} + +static long kvmppc_rm_tce_iommu_unmap(struct kvm *kvm, + struct iommu_table *tbl, unsigned long entry) +{ + enum dma_data_direction dir = DMA_NONE; + unsigned long hpa = 0; + long ret; + + if (iommu_tce_xchg_rm(tbl, entry, &hpa, &dir)) + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + + if (dir == DMA_NONE) + return H_SUCCESS; + + ret = kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + if (ret) + iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + + return ret; +} + +static long kvmppc_rm_tce_iommu_map(struct kvm *kvm, struct iommu_table *tbl, + unsigned long entry, unsigned long ua, + enum dma_data_direction dir) +{ + long ret; + unsigned long hpa = 0; + unsigned long *pua = IOMMU_TABLE_USERSPACE_ENTRY(tbl, entry); + struct mm_iommu_table_group_mem_t *mem; + + if (!pua) + /* it_userspace allocation might be delayed */ + return H_TOO_HARD; + + mem = mm_iommu_lookup_rm(kvm->mm, ua, 1ULL << tbl->it_page_shift); + if (!mem) + return H_TOO_HARD; + + if (WARN_ON_ONCE_RM(mm_iommu_ua_to_hpa_rm(mem, ua, &hpa))) + return H_HARDWARE; + + pua = (void *) vmalloc_to_phys(pua); + if (WARN_ON_ONCE_RM(!pua)) + return H_HARDWARE; + + if (WARN_ON_ONCE_RM(mm_iommu_mapped_inc(mem))) + return H_CLOSED; + + ret = iommu_tce_xchg_rm(tbl, entry, &hpa, &dir); + if (ret) { + mm_iommu_mapped_dec(mem); + /* + * real mode xchg can fail if struct page crosses + * a page boundary + */ + return H_TOO_HARD; + } + + if (dir != DMA_NONE) + kvmppc_rm_tce_iommu_mapped_dec(kvm, tbl, entry); + + *pua = ua; + + return 0; +} + long kvmppc_rm_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; + struct kvmppc_spapr_tce_iommu_table *stit; + unsigned long entry, ua = 0; + enum dma_data_direction dir; /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */ /* liobn, ioba, tce); */ + /* For radix, we might be in virtual mode, so punt */ + if (kvm_is_radix(vcpu->kvm)) + return H_TOO_HARD; + + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -199,7 +334,32 @@ if (ret != H_SUCCESS) return ret; - kvmppc_tce_put(stt, ioba >> stt->page_shift, tce); + dir = iommu_tce_direction(tce); + if ((dir != DMA_NONE) && kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), &ua, NULL)) + return H_PARAMETER; + + entry = ioba >> stt->page_shift; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + if (dir == DMA_NONE) + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry); + else + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry, ua, dir); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + + kvmppc_tce_put(stt, entry, tce); return H_SUCCESS; } @@ -239,8 +399,14 @@ long i, ret = H_SUCCESS; unsigned long tces, entry, ua = 0; unsigned long *rmap = NULL; + bool prereg = false; + struct kvmppc_spapr_tce_iommu_table *stit; + + /* For radix, we might be in virtual mode, so punt */ + if (kvm_is_radix(vcpu->kvm)) + return H_TOO_HARD; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -259,23 +425,49 @@ if (ret != H_SUCCESS) return ret; - if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) - return H_TOO_HARD; - - rmap = (void *) vmalloc_to_phys(rmap); + if (mm_iommu_preregistered(vcpu->kvm->mm)) { + /* + * We get here if guest memory was pre-registered which + * is normally VFIO case and gpa->hpa translation does not + * depend on hpt. + */ + struct mm_iommu_table_group_mem_t *mem; + + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, NULL)) + return H_TOO_HARD; + + mem = mm_iommu_lookup_rm(vcpu->kvm->mm, ua, IOMMU_PAGE_SIZE_4K); + if (mem) + prereg = mm_iommu_ua_to_hpa_rm(mem, ua, &tces) == 0; + } - /* - * Synchronize with the MMU notifier callbacks in - * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). - * While we have the rmap lock, code running on other CPUs - * cannot finish unmapping the host real page that backs - * this guest real page, so we are OK to access the host - * real page. - */ - lock_rmap(rmap); - if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { - ret = H_TOO_HARD; - goto unlock_exit; + if (!prereg) { + /* + * This is usually a case of a guest with emulated devices only + * when TCE list is not in preregistered memory. + * We do not require memory to be preregistered in this case + * so lock rmap and do __find_linux_pte_or_hugepte(). + */ + if (kvmppc_gpa_to_ua(vcpu->kvm, tce_list, &ua, &rmap)) + return H_TOO_HARD; + + rmap = (void *) vmalloc_to_phys(rmap); + if (WARN_ON_ONCE_RM(!rmap)) + return H_HARDWARE; + + /* + * Synchronize with the MMU notifier callbacks in + * book3s_64_mmu_hv.c (kvm_unmap_hva_hv etc.). + * While we have the rmap lock, code running on other CPUs + * cannot finish unmapping the host real page that backs + * this guest real page, so we are OK to access the host + * real page. + */ + lock_rmap(rmap); + if (kvmppc_rm_ua_to_hpa(vcpu, ua, &tces)) { + ret = H_TOO_HARD; + goto unlock_exit; + } } for (i = 0; i < npages; ++i) { @@ -285,11 +477,33 @@ if (ret != H_SUCCESS) goto unlock_exit; + ua = 0; + if (kvmppc_gpa_to_ua(vcpu->kvm, + tce & ~(TCE_PCI_READ | TCE_PCI_WRITE), + &ua, NULL)) + return H_PARAMETER; + + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + ret = kvmppc_rm_tce_iommu_map(vcpu->kvm, + stit->tbl, entry + i, ua, + iommu_tce_direction(tce)); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + goto unlock_exit; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + kvmppc_tce_put(stt, entry + i, tce); } unlock_exit: - unlock_rmap(rmap); + if (rmap) + unlock_rmap(rmap); return ret; } @@ -300,8 +514,13 @@ { struct kvmppc_spapr_tce_table *stt; long i, ret; + struct kvmppc_spapr_tce_iommu_table *stit; + + /* For radix, we might be in virtual mode, so punt */ + if (kvm_is_radix(vcpu->kvm)) + return H_TOO_HARD; - stt = kvmppc_find_table(vcpu, liobn); + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; @@ -313,21 +532,41 @@ if (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)) return H_PARAMETER; + list_for_each_entry_lockless(stit, &stt->iommu_tables, next) { + unsigned long entry = ioba >> stit->tbl->it_page_shift; + + for (i = 0; i < npages; ++i) { + ret = kvmppc_rm_tce_iommu_unmap(vcpu->kvm, + stit->tbl, entry + i); + + if (ret == H_SUCCESS) + continue; + + if (ret == H_TOO_HARD) + return ret; + + WARN_ON_ONCE_RM(1); + kvmppc_rm_clear_tce(stit->tbl, entry); + } + } + for (i = 0; i < npages; ++i, ioba += (1ULL << stt->page_shift)) kvmppc_tce_put(stt, ioba >> stt->page_shift, tce_value); return H_SUCCESS; } +/* This can be called in either virtual mode or real mode */ long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba) { - struct kvmppc_spapr_tce_table *stt = kvmppc_find_table(vcpu, liobn); + struct kvmppc_spapr_tce_table *stt; long ret; unsigned long idx; struct page *page; u64 *tbl; + stt = kvmppc_find_table(vcpu->kvm, liobn); if (!stt) return H_TOO_HARD; --- linux-azure-4.11.0.orig/arch/powerpc/kvm/book3s_hv.c +++ linux-azure-4.11.0/arch/powerpc/kvm/book3s_hv.c @@ -1481,6 +1481,14 @@ r = set_vpa(vcpu, &vcpu->arch.dtl, addr, len); break; case KVM_REG_PPC_TB_OFFSET: + /* + * POWER9 DD1 has an erratum where writing TBU40 causes + * the timebase to lose ticks. So we don't let the + * timebase offset be changed on P9 DD1. (It is + * initialized to zero.) + */ + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + break; /* round up to multiple of 2^24 */ vcpu->arch.vcore->tb_offset = ALIGN(set_reg_val(id, *val), 1UL << 24); @@ -2902,12 +2910,36 @@ { int r; int srcu_idx; + unsigned long ebb_regs[3] = {}; /* shut up GCC */ + unsigned long user_tar = 0; + unsigned int user_vrsave; if (!vcpu->arch.sane) { run->exit_reason = KVM_EXIT_INTERNAL_ERROR; return -EINVAL; } + /* + * Don't allow entry with a suspended transaction, because + * the guest entry/exit code will lose it. + * If the guest has TM enabled, save away their TM-related SPRs + * (they will get restored by the TM unavailable interrupt). + */ +#ifdef CONFIG_PPC_TRANSACTIONAL_MEM + if (cpu_has_feature(CPU_FTR_TM) && current->thread.regs && + (current->thread.regs->msr & MSR_TM)) { + if (MSR_TM_ACTIVE(current->thread.regs->msr)) { + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + run->fail_entry.hardware_entry_failure_reason = 0; + return -EINVAL; + } + current->thread.tm_tfhar = mfspr(SPRN_TFHAR); + current->thread.tm_tfiar = mfspr(SPRN_TFIAR); + current->thread.tm_texasr = mfspr(SPRN_TEXASR); + current->thread.regs->msr &= ~MSR_TM; + } +#endif + kvmppc_core_prepare_to_enter(vcpu); /* No need to go into the guest when all we'll do is come back out */ @@ -2929,6 +2961,15 @@ flush_all_to_thread(current); + /* Save userspace EBB and other register values */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + ebb_regs[0] = mfspr(SPRN_EBBHR); + ebb_regs[1] = mfspr(SPRN_EBBRR); + ebb_regs[2] = mfspr(SPRN_BESCR); + user_tar = mfspr(SPRN_TAR); + } + user_vrsave = mfspr(SPRN_VRSAVE); + vcpu->arch.wqp = &vcpu->arch.vcore->wq; vcpu->arch.pgdir = current->mm->pgd; vcpu->arch.state = KVMPPC_VCPU_BUSY_IN_HOST; @@ -2951,6 +2992,16 @@ r = kvmppc_xics_rm_complete(vcpu, 0); } while (is_kvmppc_resume_guest(r)); + /* Restore userspace EBB and other register values */ + if (cpu_has_feature(CPU_FTR_ARCH_207S)) { + mtspr(SPRN_EBBHR, ebb_regs[0]); + mtspr(SPRN_EBBRR, ebb_regs[1]); + mtspr(SPRN_BESCR, ebb_regs[2]); + mtspr(SPRN_TAR, user_tar); + mtspr(SPRN_FSCR, current->thread.fscr); + } + mtspr(SPRN_VRSAVE, user_vrsave); + out: vcpu->arch.state = KVMPPC_VCPU_NOTREADY; atomic_dec(&vcpu->kvm->arch.vcpus_running); --- linux-azure-4.11.0.orig/arch/powerpc/kvm/book3s_hv_builtin.c +++ linux-azure-4.11.0/arch/powerpc/kvm/book3s_hv_builtin.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include #include @@ -187,7 +188,14 @@ long kvmppc_h_random(struct kvm_vcpu *vcpu) { - if (powernv_get_random_real_mode(&vcpu->arch.gpr[4])) + int r; + + /* Only need to do the expensive mfmsr() on radix */ + if (kvm_is_radix(vcpu->kvm) && (mfmsr() & MSR_IR)) + r = powernv_get_random_long(&vcpu->arch.gpr[4]); + else + r = powernv_get_random_real_mode(&vcpu->arch.gpr[4]); + if (r) return H_SUCCESS; return H_HARDWARE; @@ -224,6 +232,10 @@ return; } + /* We should never reach this */ + if (WARN_ON_ONCE(xive_enabled())) + return; + /* Else poke the target with an IPI */ xics_phys = paca[cpu].kvm_hstate.xics_phys; if (xics_phys) @@ -386,6 +398,9 @@ long rc; bool again; + if (xive_enabled()) + return 1; + do { again = false; rc = kvmppc_read_one_intr(&again); --- linux-azure-4.11.0.orig/arch/powerpc/kvm/book3s_hv_interrupts.S +++ linux-azure-4.11.0/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -121,10 +121,20 @@ * Put whatever is in the decrementer into the * hypervisor decrementer. */ +BEGIN_FTR_SECTION + ld r5, HSTATE_KVM_VCORE(r13) + ld r6, VCORE_KVM(r5) + ld r9, KVM_HOST_LPCR(r6) + andis. r9, r9, LPCR_LD@h +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) mfspr r8,SPRN_DEC mftb r7 - mtspr SPRN_HDEC,r8 +BEGIN_FTR_SECTION + /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */ + bne 32f +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r8,r8 +32: mtspr SPRN_HDEC,r8 add r8,r8,r7 std r8,HSTATE_DECEXP(r13) --- linux-azure-4.11.0.orig/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ linux-azure-4.11.0/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -31,12 +31,29 @@ #include #include +/* Sign-extend HDEC if not on POWER9 */ +#define EXTEND_HDEC(reg) \ +BEGIN_FTR_SECTION; \ + extsw reg, reg; \ +END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_300) + #define VCPU_GPRS_TM(reg) (((reg) * ULONG_SIZE) + VCPU_GPR_TM) /* Values in HSTATE_NAPPING(r13) */ #define NAPPING_CEDE 1 #define NAPPING_NOVCPU 2 +/* Stack frame offsets for kvmppc_hv_entry */ +#define SFS 144 +#define STACK_SLOT_TRAP (SFS-4) +#define STACK_SLOT_TID (SFS-16) +#define STACK_SLOT_PSSCR (SFS-24) +#define STACK_SLOT_PID (SFS-32) +#define STACK_SLOT_IAMR (SFS-40) +#define STACK_SLOT_CIABR (SFS-48) +#define STACK_SLOT_DAWR (SFS-56) +#define STACK_SLOT_DAWRX (SFS-64) + /* * Call kvmppc_hv_entry in real mode. * Must be called with interrupts hard-disabled. @@ -213,6 +230,8 @@ kvmppc_primary_no_guest: /* We handle this much like a ceded vcpu */ /* put the HDEC into the DEC, since HDEC interrupts don't wake us */ + /* HDEC may be larger than DEC for arch >= v3.00, but since the */ + /* HDEC value came from DEC in the first place, it will fit */ mfspr r3, SPRN_HDEC mtspr SPRN_DEC, r3 /* @@ -294,8 +313,9 @@ /* See if our timeslice has expired (HDEC is negative) */ mfspr r0, SPRN_HDEC + EXTEND_HDEC(r0) li r12, BOOK3S_INTERRUPT_HV_DECREMENTER - cmpwi r0, 0 + cmpdi r0, 0 blt kvm_novcpu_exit /* Got an IPI but other vcpus aren't yet exiting, must be a latecomer */ @@ -318,10 +338,10 @@ bl kvmhv_accumulate_time #endif 13: mr r3, r12 - stw r12, 112-4(r1) + stw r12, STACK_SLOT_TRAP(r1) bl kvmhv_commence_exit nop - lwz r12, 112-4(r1) + lwz r12, STACK_SLOT_TRAP(r1) b kvmhv_switch_to_host /* @@ -389,8 +409,8 @@ lbz r4, HSTATE_PTID(r13) cmpwi r4, 0 bne 63f - lis r6, 0x7fff - ori r6, r6, 0xffff + LOAD_REG_ADDR(r6, decrementer_max) + ld r6, 0(r6) mtspr SPRN_HDEC, r6 /* and set per-LPAR registers, if doing dynamic micro-threading */ ld r6, HSTATE_SPLIT_MODE(r13) @@ -544,11 +564,6 @@ * * *****************************************************************************/ -/* Stack frame offsets */ -#define STACK_SLOT_TID (112-16) -#define STACK_SLOT_PSSCR (112-24) -#define STACK_SLOT_PID (112-32) - .global kvmppc_hv_entry kvmppc_hv_entry: @@ -564,7 +579,7 @@ */ mflr r0 std r0, PPC_LR_STKOFF(r1) - stdu r1, -112(r1) + stdu r1, -SFS(r1) /* Save R1 in the PACA */ std r1, HSTATE_HOST_R1(r13) @@ -748,10 +763,20 @@ mfspr r5, SPRN_TIDR mfspr r6, SPRN_PSSCR mfspr r7, SPRN_PID + mfspr r8, SPRN_IAMR std r5, STACK_SLOT_TID(r1) std r6, STACK_SLOT_PSSCR(r1) std r7, STACK_SLOT_PID(r1) + std r8, STACK_SLOT_IAMR(r1) END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) +BEGIN_FTR_SECTION + mfspr r5, SPRN_CIABR + mfspr r6, SPRN_DAWR + mfspr r7, SPRN_DAWRX + std r5, STACK_SLOT_CIABR(r1) + std r6, STACK_SLOT_DAWR(r1) + std r7, STACK_SLOT_DAWRX(r1) +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) BEGIN_FTR_SECTION /* Set partition DABR */ @@ -967,7 +992,8 @@ /* Check if HDEC expires soon */ mfspr r3, SPRN_HDEC - cmpwi r3, 512 /* 1 microsecond */ + EXTEND_HDEC(r3) + cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon deliver_guest_interrupt: @@ -1451,11 +1477,10 @@ * set by the guest could disrupt the host. */ li r0, 0 - mtspr SPRN_IAMR, r0 - mtspr SPRN_CIABR, r0 - mtspr SPRN_DAWRX, r0 + mtspr SPRN_PSPB, r0 mtspr SPRN_WORT, r0 BEGIN_FTR_SECTION + mtspr SPRN_IAMR, r0 mtspr SPRN_TCSCR, r0 /* Set MMCRS to 1<<31 to freeze and disable the SPMC counters */ li r0, 1 @@ -1471,6 +1496,7 @@ std r6,VCPU_UAMOR(r9) li r6,0 mtspr SPRN_AMR,r6 + mtspr SPRN_UAMOR, r6 /* Switch DSCR back to host value */ mfspr r8, SPRN_DSCR @@ -1616,12 +1642,22 @@ /* Restore host values of some registers */ BEGIN_FTR_SECTION + ld r5, STACK_SLOT_CIABR(r1) + ld r6, STACK_SLOT_DAWR(r1) + ld r7, STACK_SLOT_DAWRX(r1) + mtspr SPRN_CIABR, r5 + mtspr SPRN_DAWR, r6 + mtspr SPRN_DAWRX, r7 +END_FTR_SECTION_IFSET(CPU_FTR_ARCH_207S) +BEGIN_FTR_SECTION ld r5, STACK_SLOT_TID(r1) ld r6, STACK_SLOT_PSSCR(r1) ld r7, STACK_SLOT_PID(r1) + ld r8, STACK_SLOT_IAMR(r1) mtspr SPRN_TIDR, r5 mtspr SPRN_PSSCR, r6 mtspr SPRN_PID, r7 + mtspr SPRN_IAMR, r8 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) BEGIN_FTR_SECTION PPC_INVALIDATE_ERAT @@ -1765,8 +1801,8 @@ li r0, KVM_GUEST_MODE_NONE stb r0, HSTATE_IN_GUEST(r13) - ld r0, 112+PPC_LR_STKOFF(r1) - addi r1, r1, 112 + ld r0, SFS+PPC_LR_STKOFF(r1) + addi r1, r1, SFS mtlr r0 blr @@ -2308,12 +2344,13 @@ mfspr r3, SPRN_DEC mfspr r4, SPRN_HDEC mftb r5 - cmpw r3, r4 + extsw r3, r3 + EXTEND_HDEC(r4) + cmpd r3, r4 ble 67f mtspr SPRN_DEC, r4 67: /* save expiry time of guest decrementer */ - extsw r3, r3 add r3, r3, r5 ld r4, HSTATE_KVM_VCPU(r13) ld r5, HSTATE_KVM_VCORE(r13) --- linux-azure-4.11.0.orig/arch/powerpc/kvm/powerpc.c +++ linux-azure-4.11.0/arch/powerpc/kvm/powerpc.c @@ -538,6 +538,8 @@ #ifdef CONFIG_PPC_BOOK3S_64 case KVM_CAP_SPAPR_TCE: case KVM_CAP_SPAPR_TCE_64: + /* fallthrough */ + case KVM_CAP_SPAPR_TCE_VFIO: case KVM_CAP_PPC_RTAS: case KVM_CAP_PPC_FIXUP_HCALL: case KVM_CAP_PPC_ENABLE_HCALL: --- linux-azure-4.11.0.orig/arch/powerpc/lib/sstep.c +++ linux-azure-4.11.0/arch/powerpc/lib/sstep.c @@ -12,6 +12,7 @@ #include #include #include +#include #include #include #include --- linux-azure-4.11.0.orig/arch/powerpc/mm/dump_linuxpagetables.c +++ linux-azure-4.11.0/arch/powerpc/mm/dump_linuxpagetables.c @@ -16,6 +16,7 @@ */ #include #include +#include #include #include #include @@ -56,6 +57,8 @@ struct seq_file *seq; const struct addr_marker *marker; unsigned long start_address; + unsigned long start_pa; + unsigned long last_pa; unsigned int level; u64 current_flags; }; @@ -252,7 +255,9 @@ const char *unit = units; unsigned long delta; - seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); + seq_printf(st->seq, "0x%016lx-0x%016lx ", st->start_address, addr-1); + seq_printf(st->seq, "0x%016lx ", st->start_pa); + delta = (addr - st->start_address) >> 10; /* Work out what appropriate unit to use */ while (!(delta & 1023) && unit[1]) { @@ -267,11 +272,15 @@ unsigned int level, u64 val) { u64 flag = val & pg_level[level].mask; + u64 pa = val & PTE_RPN_MASK; + /* At first no level is set */ if (!st->level) { st->level = level; st->current_flags = flag; st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); /* * Dump the section of virtual memory when: @@ -279,9 +288,11 @@ * - we change levels in the tree. * - the address is in a different section of memory and is thus * used for a different purpose, regardless of the flags. + * - the pa of this page is not adjacent to the last inspected page */ } else if (flag != st->current_flags || level != st->level || - addr >= st->marker[1].start_address) { + addr >= st->marker[1].start_address || + pa != st->last_pa + PAGE_SIZE) { /* Check the PTE flags */ if (st->current_flags) { @@ -305,8 +316,12 @@ seq_printf(st->seq, "---[ %s ]---\n", st->marker->name); } st->start_address = addr; + st->start_pa = pa; + st->last_pa = pa; st->current_flags = flag; st->level = level; + } else { + st->last_pa = pa; } } @@ -331,7 +346,7 @@ for (i = 0; i < PTRS_PER_PMD; i++, pmd++) { addr = start + i * PMD_SIZE; - if (!pmd_none(*pmd)) + if (!pmd_none(*pmd) && !pmd_huge(*pmd)) /* pmd exists */ walk_pte(st, pmd, addr); else @@ -347,7 +362,7 @@ for (i = 0; i < PTRS_PER_PUD; i++, pud++) { addr = start + i * PUD_SIZE; - if (!pud_none(*pud)) + if (!pud_none(*pud) && !pud_huge(*pud)) /* pud exists */ walk_pmd(st, pud, addr); else @@ -367,7 +382,7 @@ */ for (i = 0; i < PTRS_PER_PGD; i++, pgd++) { addr = KERN_VIRT_START + i * PGDIR_SIZE; - if (!pgd_none(*pgd)) + if (!pgd_none(*pgd) && !pgd_huge(*pgd)) /* pgd exists */ walk_pud(st, pgd, addr); else --- linux-azure-4.11.0.orig/arch/powerpc/mm/hugetlbpage-radix.c +++ linux-azure-4.11.0/arch/powerpc/mm/hugetlbpage-radix.c @@ -65,7 +65,7 @@ addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } /* --- linux-azure-4.11.0.orig/arch/powerpc/mm/mmap.c +++ linux-azure-4.11.0/arch/powerpc/mm/mmap.c @@ -107,7 +107,7 @@ addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -143,7 +143,7 @@ addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/powerpc/mm/mmu_context_book3s64.c +++ linux-azure-4.11.0/arch/powerpc/mm/mmu_context_book3s64.c @@ -67,6 +67,9 @@ */ rts_field = radix__get_tree_size(); process_tb[index].prtb0 = cpu_to_be64(rts_field | __pa(mm->pgd) | RADIX_PGD_INDEX_SIZE); + + mm->context.npu_context = NULL; + return 0; } @@ -179,10 +182,15 @@ #ifdef CONFIG_PPC_RADIX_MMU void radix__switch_mmu_context(struct mm_struct *prev, struct mm_struct *next) { - asm volatile("isync": : :"memory"); - mtspr(SPRN_PID, next->context.id); - asm volatile("isync \n" - PPC_SLBIA(0x7) - : : :"memory"); + + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) { + isync(); + mtspr(SPRN_PID, next->context.id); + isync(); + asm volatile(PPC_INVALIDATE_ERAT : : :"memory"); + } else { + mtspr(SPRN_PID, next->context.id); + isync(); + } } #endif --- linux-azure-4.11.0.orig/arch/powerpc/mm/mmu_context_iommu.c +++ linux-azure-4.11.0/arch/powerpc/mm/mmu_context_iommu.c @@ -81,7 +81,7 @@ gfp_t gfp_mask = GFP_USER; struct page *new_page; - if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) + if (PageCompound(page)) return NULL; if (PageHighMem(page)) @@ -100,7 +100,7 @@ LIST_HEAD(cma_migrate_pages); /* Ignore huge pages for now */ - if (PageHuge(page) || PageTransHuge(page) || PageCompound(page)) + if (PageCompound(page)) return -EBUSY; lru_add_drain(); @@ -314,6 +314,25 @@ } EXPORT_SYMBOL_GPL(mm_iommu_lookup); +struct mm_iommu_table_group_mem_t *mm_iommu_lookup_rm(struct mm_struct *mm, + unsigned long ua, unsigned long size) +{ + struct mm_iommu_table_group_mem_t *mem, *ret = NULL; + + list_for_each_entry_lockless(mem, &mm->context.iommu_group_mem_list, + next) { + if ((mem->ua <= ua) && + (ua + size <= mem->ua + + (mem->entries << PAGE_SHIFT))) { + ret = mem; + break; + } + } + + return ret; +} +EXPORT_SYMBOL_GPL(mm_iommu_lookup_rm); + struct mm_iommu_table_group_mem_t *mm_iommu_find(struct mm_struct *mm, unsigned long ua, unsigned long entries) { @@ -345,6 +364,26 @@ } EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa); +long mm_iommu_ua_to_hpa_rm(struct mm_iommu_table_group_mem_t *mem, + unsigned long ua, unsigned long *hpa) +{ + const long entry = (ua - mem->ua) >> PAGE_SHIFT; + void *va = &mem->hpas[entry]; + unsigned long *pa; + + if (entry >= mem->entries) + return -EFAULT; + + pa = (void *) vmalloc_to_phys(va); + if (!pa) + return -EFAULT; + + *hpa = *pa | (ua & ~PAGE_MASK); + + return 0; +} +EXPORT_SYMBOL_GPL(mm_iommu_ua_to_hpa_rm); + long mm_iommu_mapped_inc(struct mm_iommu_table_group_mem_t *mem) { if (atomic64_inc_not_zero(&mem->mapped)) --- linux-azure-4.11.0.orig/arch/powerpc/mm/slice.c +++ linux-azure-4.11.0/arch/powerpc/mm/slice.c @@ -105,7 +105,7 @@ if ((mm->task_size - len) < addr) return 0; vma = find_vma(mm, addr); - return (!vma || (addr + len) <= vma->vm_start); + return (!vma || (addr + len) <= vm_start_gap(vma)); } static int slice_low_has_vma(struct mm_struct *mm, unsigned long slice) --- linux-azure-4.11.0.orig/arch/powerpc/mm/tlb-radix.c +++ linux-azure-4.11.0/arch/powerpc/mm/tlb-radix.c @@ -34,10 +34,8 @@ prs = 1; /* process scoped */ r = 1; /* raidx format */ - asm volatile("ptesync": : :"memory"); asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1) : : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory"); - asm volatile("ptesync": : :"memory"); } /* @@ -47,9 +45,11 @@ { int set; + asm volatile("ptesync": : :"memory"); for (set = 0; set < POWER9_TLB_SETS_RADIX ; set++) { __tlbiel_pid(pid, set, ric); } + asm volatile("ptesync": : :"memory"); asm volatile(PPC_INVALIDATE_ERAT "; isync" : : :"memory"); } @@ -129,6 +129,12 @@ { unsigned long pid; struct mm_struct *mm = tlb->mm; + /* + * If we are doing a full mm flush, we will do a tlb flush + * with RIC_FLUSH_ALL later. + */ + if (tlb->fullmm) + return; preempt_disable(); @@ -195,6 +201,12 @@ unsigned long pid; struct mm_struct *mm = tlb->mm; + /* + * If we are doing a full mm flush, we will do a tlb flush + * with RIC_FLUSH_ALL later. + */ + if (tlb->fullmm) + return; preempt_disable(); pid = mm->context.id; --- linux-azure-4.11.0.orig/arch/powerpc/perf/perf_regs.c +++ linux-azure-4.11.0/arch/powerpc/perf/perf_regs.c @@ -101,5 +101,6 @@ struct pt_regs *regs_user_copy) { regs_user->regs = task_pt_regs(current); - regs_user->abi = perf_reg_abi(current); + regs_user->abi = (regs_user->regs) ? perf_reg_abi(current) : + PERF_SAMPLE_REGS_ABI_NONE; } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/85xx/smp.c +++ linux-azure-4.11.0/arch/powerpc/platforms/85xx/smp.c @@ -461,16 +461,9 @@ } #endif /* CONFIG_KEXEC_CORE */ -static void smp_85xx_basic_setup(int cpu_nr) -{ - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); -} - static void smp_85xx_setup_cpu(int cpu_nr) { mpic_setup_this_cpu(); - smp_85xx_basic_setup(cpu_nr); } void __init mpc85xx_smp_init(void) @@ -484,7 +477,7 @@ smp_85xx_ops.setup_cpu = smp_85xx_setup_cpu; smp_85xx_ops.message_pass = smp_mpic_message_pass; } else - smp_85xx_ops.setup_cpu = smp_85xx_basic_setup; + smp_85xx_ops.setup_cpu = NULL; if (cpu_has_feature(CPU_FTR_DBELL)) { /* @@ -492,7 +485,7 @@ * smp_muxed_ipi_message_pass */ smp_85xx_ops.message_pass = NULL; - smp_85xx_ops.cause_ipi = doorbell_cause_ipi; + smp_85xx_ops.cause_ipi = doorbell_global_ipi; smp_85xx_ops.probe = NULL; } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/Kconfig.cputype +++ linux-azure-4.11.0/arch/powerpc/platforms/Kconfig.cputype @@ -373,6 +373,7 @@ config SMP depends on PPC_BOOK3S || PPC_BOOK3E || FSL_BOOKE || PPC_47x + select GENERIC_IRQ_MIGRATION bool "Symmetric multi-processing support" ---help--- This enables support for systems with more than one CPU. If you have --- linux-azure-4.11.0.orig/arch/powerpc/platforms/cell/spu_base.c +++ linux-azure-4.11.0/arch/powerpc/platforms/cell/spu_base.c @@ -197,7 +197,9 @@ (REGION_ID(ea) != USER_REGION_ID)) { spin_unlock(&spu->register_lock); - ret = hash_page(ea, _PAGE_PRESENT | _PAGE_READ, 0x300, dsisr); + ret = hash_page(ea, + _PAGE_PRESENT | _PAGE_READ | _PAGE_PRIVILEGED, + 0x300, dsisr); spin_lock(&spu->register_lock); if (!ret) { --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powermac/smp.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powermac/smp.c @@ -172,7 +172,7 @@ return IRQ_HANDLED; } -static void smp_psurge_cause_ipi(int cpu, unsigned long data) +static void smp_psurge_cause_ipi(int cpu) { psurge_set_ipi(cpu); } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/Kconfig +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/Kconfig @@ -4,6 +4,7 @@ select PPC_NATIVE select PPC_XICS select PPC_ICP_NATIVE + select PPC_XIVE_NATIVE select PPC_P7_NAP select PCI select PCI_MSI @@ -19,6 +20,7 @@ select CPU_FREQ_GOV_ONDEMAND select CPU_FREQ_GOV_CONSERVATIVE select PPC_DOORBELL + select MMU_NOTIFIER default y config OPAL_PRD --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/idle.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/idle.c @@ -147,7 +147,6 @@ } EXPORT_SYMBOL_GPL(pnv_get_supported_cpuidle_states); - static void pnv_fastsleep_workaround_apply(void *info) { @@ -241,8 +240,9 @@ * The default stop state that will be used by ppc_md.power_save * function on platforms that support stop instruction. */ -u64 pnv_default_stop_val; -u64 pnv_default_stop_mask; +static u64 pnv_default_stop_val; +static u64 pnv_default_stop_mask; +static bool default_stop_found; /* * Used for ppc_md.power_save which needs a function with no parameters @@ -262,8 +262,42 @@ * psscr value and mask of the deepest stop idle state. * Used when a cpu is offlined. */ -u64 pnv_deepest_stop_psscr_val; -u64 pnv_deepest_stop_psscr_mask; +static u64 pnv_deepest_stop_psscr_val; +static u64 pnv_deepest_stop_psscr_mask; +static bool deepest_stop_found; + +/* + * pnv_cpu_offline: A function that puts the CPU into the deepest + * available platform idle state on a CPU-Offline. + */ +unsigned long pnv_cpu_offline(unsigned int cpu) +{ + unsigned long srr1; + + u32 idle_states = pnv_get_supported_cpuidle_states(); + + if (cpu_has_feature(CPU_FTR_ARCH_300) && deepest_stop_found) { + srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, + pnv_deepest_stop_psscr_mask); + } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { + srr1 = power7_winkle(); + } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || + (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { + srr1 = power7_sleep(); + } else if (idle_states & OPAL_PM_NAP_ENABLED) { + srr1 = power7_nap(1); + } else { + /* This is the fallback method. We emulate snooze */ + while (!generic_check_cpu_restart(cpu)) { + HMT_low(); + HMT_very_low(); + } + srr1 = 0; + HMT_medium(); + } + + return srr1; +} /* * Power ISA 3.0 idle initialization. @@ -352,7 +386,6 @@ u32 *residency_ns = NULL; u64 max_residency_ns = 0; int rc = 0, i; - bool default_stop_found = false, deepest_stop_found = false; psscr_val = kcalloc(dt_idle_states, sizeof(*psscr_val), GFP_KERNEL); psscr_mask = kcalloc(dt_idle_states, sizeof(*psscr_mask), GFP_KERNEL); @@ -432,21 +465,24 @@ } } - if (!default_stop_found) { - pnv_default_stop_val = PSSCR_HV_DEFAULT_VAL; - pnv_default_stop_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!default_stop_found)) { + pr_warn("cpuidle-powernv: No suitable default stop state found. Disabling platform idle.\n"); + } else { + ppc_md.power_save = power9_idle; + pr_info("cpuidle-powernv: Default stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_default_stop_val, pnv_default_stop_mask); } - if (!deepest_stop_found) { - pnv_deepest_stop_psscr_val = PSSCR_HV_DEFAULT_VAL; - pnv_deepest_stop_psscr_mask = PSSCR_HV_DEFAULT_MASK; - pr_warn("Setting default stop psscr val=0x%016llx,mask=0x%016llx\n", + if (unlikely(!deepest_stop_found)) { + pr_warn("cpuidle-powernv: No suitable stop state for CPU-Hotplug. Offlined CPUs will busy wait"); + } else { + pr_info("cpuidle-powernv: Deepest stop: psscr = 0x%016llx,mask=0x%016llx\n", pnv_deepest_stop_psscr_val, pnv_deepest_stop_psscr_mask); } + pr_info("cpuidle-powernv: Requested Level (RL) value of first deep stop = 0x%llx\n", + pnv_first_deep_stop_state); out: kfree(psscr_val); kfree(psscr_mask); @@ -526,8 +562,6 @@ if (supported_cpuidle_states & OPAL_PM_NAP_ENABLED) ppc_md.power_save = power7_idle; - else if (supported_cpuidle_states & OPAL_PM_STOP_INST_FAST) - ppc_md.power_save = power9_idle; out: return 0; --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/npu-dma.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/npu-dma.c @@ -9,11 +9,20 @@ * License as published by the Free Software Foundation. */ +#include +#include +#include +#include #include #include #include #include +#include +#include +#include +#include +#include #include #include #include @@ -22,6 +31,8 @@ #include "powernv.h" #include "pci.h" +#define npu_to_phb(x) container_of(x, struct pnv_phb, npu) + /* * Other types of TCE cache invalidation are not functional in the * hardware. @@ -37,6 +48,12 @@ struct device_node *dn; struct pci_dev *gpdev; + if (WARN_ON(!npdev)) + return NULL; + + if (WARN_ON(!npdev->dev.of_node)) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(npdev->dev.of_node, "ibm,gpu", 0); if (!dn) @@ -55,6 +72,13 @@ struct device_node *dn; struct pci_dev *npdev; + if (WARN_ON(!gpdev)) + return NULL; + + /* Not all PCI devices have device-tree nodes */ + if (!gpdev->dev.of_node) + return NULL; + /* Get assoicated PCI device */ dn = of_parse_phandle(gpdev->dev.of_node, "ibm,npu", index); if (!dn) @@ -180,7 +204,7 @@ pe_err(npe, "Failed to configure TCE table, err %lld\n", rc); return rc; } - pnv_pci_phb3_tce_invalidate_entire(phb, false); + pnv_pci_ioda2_tce_invalidate_entire(phb, false); /* Add the table to the list so its TCE cache will get invalidated */ pnv_pci_link_table_and_group(phb->hose->node, num, @@ -204,7 +228,7 @@ pe_err(npe, "Unmapping failed, ret = %lld\n", rc); return rc; } - pnv_pci_phb3_tce_invalidate_entire(phb, false); + pnv_pci_ioda2_tce_invalidate_entire(phb, false); pnv_pci_unlink_table_and_group(npe->table_group.tables[num], &npe->table_group); @@ -270,7 +294,7 @@ 0 /* bypass base */, top); if (rc == OPAL_SUCCESS) - pnv_pci_phb3_tce_invalidate_entire(phb, false); + pnv_pci_ioda2_tce_invalidate_entire(phb, false); return rc; } @@ -334,7 +358,7 @@ pe_err(npe, "Failed to disable bypass, err %lld\n", rc); return; } - pnv_pci_phb3_tce_invalidate_entire(npe->phb, false); + pnv_pci_ioda2_tce_invalidate_entire(npe->phb, false); } struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe) @@ -359,3 +383,477 @@ return gpe; } + +/* Maximum number of nvlinks per npu */ +#define NV_MAX_LINKS 6 + +/* Maximum index of npu2 hosts in the system. Always < NV_MAX_NPUS */ +static int max_npu2_index; + +struct npu_context { + struct mm_struct *mm; + struct pci_dev *npdev[NV_MAX_NPUS][NV_MAX_LINKS]; + struct mmu_notifier mn; + struct kref kref; + + /* Callback to stop translation requests on a given GPU */ + struct npu_context *(*release_cb)(struct npu_context *, void *); + + /* + * Private pointer passed to the above callback for usage by + * device drivers. + */ + void *priv; +}; + +/* + * Find a free MMIO ATSD register and mark it in use. Return -ENOSPC + * if none are available. + */ +static int get_mmio_atsd_reg(struct npu *npu) +{ + int i; + + for (i = 0; i < npu->mmio_atsd_count; i++) { + if (!test_and_set_bit(i, &npu->mmio_atsd_usage)) + return i; + } + + return -ENOSPC; +} + +static void put_mmio_atsd_reg(struct npu *npu, int reg) +{ + clear_bit(reg, &npu->mmio_atsd_usage); +} + +/* MMIO ATSD register offsets */ +#define XTS_ATSD_AVA 1 +#define XTS_ATSD_STAT 2 + +static int mmio_launch_invalidate(struct npu *npu, unsigned long launch, + unsigned long va) +{ + int mmio_atsd_reg; + + do { + mmio_atsd_reg = get_mmio_atsd_reg(npu); + cpu_relax(); + } while (mmio_atsd_reg < 0); + + __raw_writeq(cpu_to_be64(va), + npu->mmio_atsd_regs[mmio_atsd_reg] + XTS_ATSD_AVA); + eieio(); + __raw_writeq(cpu_to_be64(launch), npu->mmio_atsd_regs[mmio_atsd_reg]); + + return mmio_atsd_reg; +} + +static int mmio_invalidate_pid(struct npu *npu, unsigned long pid, bool flush) +{ + unsigned long launch; + + /* IS set to invalidate matching PID */ + launch = PPC_BIT(12); + + /* PRS set to process-scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + + /* Invalidating the entire process doesn't use a va */ + return mmio_launch_invalidate(npu, launch, 0); +} + +static int mmio_invalidate_va(struct npu *npu, unsigned long va, + unsigned long pid, bool flush) +{ + unsigned long launch; + + /* IS set to invalidate target VA */ + launch = 0; + + /* PRS set to process scoped */ + launch |= PPC_BIT(13); + + /* AP */ + launch |= (u64) mmu_get_ap(mmu_virtual_psize) << PPC_BITLSHIFT(17); + + /* PID */ + launch |= pid << PPC_BITLSHIFT(38); + + /* No flush */ + launch |= !flush << PPC_BITLSHIFT(39); + + return mmio_launch_invalidate(npu, launch, va); +} + +#define mn_to_npu_context(x) container_of(x, struct npu_context, mn) + +struct mmio_atsd_reg { + struct npu *npu; + int reg; +}; + +static void mmio_invalidate_wait( + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS], bool flush) +{ + struct npu *npu; + int i, reg; + + /* Wait for all invalidations to complete */ + for (i = 0; i <= max_npu2_index; i++) { + if (mmio_atsd_reg[i].reg < 0) + continue; + + /* Wait for completion */ + npu = mmio_atsd_reg[i].npu; + reg = mmio_atsd_reg[i].reg; + while (__raw_readq(npu->mmio_atsd_regs[reg] + XTS_ATSD_STAT)) + cpu_relax(); + + put_mmio_atsd_reg(npu, reg); + + /* + * The GPU requires two flush ATSDs to ensure all entries have + * been flushed. We use PID 0 as it will never be used for a + * process on the GPU. + */ + if (flush) + mmio_invalidate_pid(npu, 0, true); + } +} + +/* + * Invalidate either a single address or an entire PID depending on + * the value of va. + */ +static void mmio_invalidate(struct npu_context *npu_context, int va, + unsigned long address, bool flush) +{ + int i, j; + struct npu *npu; + struct pnv_phb *nphb; + struct pci_dev *npdev; + struct mmio_atsd_reg mmio_atsd_reg[NV_MAX_NPUS]; + unsigned long pid = npu_context->mm->context.id; + + /* + * Loop over all the NPUs this process is active on and launch + * an invalidate. + */ + for (i = 0; i <= max_npu2_index; i++) { + mmio_atsd_reg[i].reg = -1; + for (j = 0; j < NV_MAX_LINKS; j++) { + npdev = npu_context->npdev[i][j]; + if (!npdev) + continue; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + mmio_atsd_reg[i].npu = npu; + + if (va) + mmio_atsd_reg[i].reg = + mmio_invalidate_va(npu, address, pid, + flush); + else + mmio_atsd_reg[i].reg = + mmio_invalidate_pid(npu, pid, flush); + + /* + * The NPU hardware forwards the shootdown to all GPUs + * so we only have to launch one shootdown per NPU. + */ + break; + } + } + + /* + * Unfortunately the nest mmu does not support flushing specific + * addresses so we have to flush the whole mm. + */ + flush_tlb_mm(npu_context->mm); + + mmio_invalidate_wait(mmio_atsd_reg, flush); + if (flush) + /* Wait for the flush to complete */ + mmio_invalidate_wait(mmio_atsd_reg, false); +} + +static void pnv_npu2_mn_release(struct mmu_notifier *mn, + struct mm_struct *mm) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + /* Call into device driver to stop requests to the NMMU */ + if (npu_context->release_cb) + npu_context->release_cb(npu_context, npu_context->priv); + + /* + * There should be no more translation requests for this PID, but we + * need to ensure any entries for it are removed from the TLB. + */ + mmio_invalidate(npu_context, 0, 0, true); +} + +static void pnv_npu2_mn_change_pte(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address, + pte_t pte) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address, true); +} + +static void pnv_npu2_mn_invalidate_page(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long address) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + + mmio_invalidate(npu_context, 1, address, true); +} + +static void pnv_npu2_mn_invalidate_range(struct mmu_notifier *mn, + struct mm_struct *mm, + unsigned long start, unsigned long end) +{ + struct npu_context *npu_context = mn_to_npu_context(mn); + unsigned long address; + + for (address = start; address < end; address += PAGE_SIZE) + mmio_invalidate(npu_context, 1, address, false); + + /* Do the flush only on the final addess == end */ + mmio_invalidate(npu_context, 1, address, true); +} + +static const struct mmu_notifier_ops nv_nmmu_notifier_ops = { + .release = pnv_npu2_mn_release, + .change_pte = pnv_npu2_mn_change_pte, + .invalidate_page = pnv_npu2_mn_invalidate_page, + .invalidate_range = pnv_npu2_mn_invalidate_range, +}; + +/* + * Call into OPAL to setup the nmmu context for the current task in + * the NPU. This must be called to setup the context tables before the + * GPU issues ATRs. pdev should be a pointed to PCIe GPU device. + * + * A release callback should be registered to allow a device driver to + * be notified that it should not launch any new translation requests + * as the final TLB invalidate is about to occur. + * + * Returns an error if there no contexts are currently available or a + * npu_context which should be passed to pnv_npu2_handle_fault(). + * + * mmap_sem must be held in write mode. + */ +struct npu_context *pnv_npu2_init_context(struct pci_dev *gpdev, + unsigned long flags, + struct npu_context *(*cb)(struct npu_context *, void *), + void *priv) +{ + int rc; + u32 nvlink_index; + struct device_node *nvlink_dn; + struct mm_struct *mm = current->mm; + struct pnv_phb *nphb; + struct npu *npu; + struct npu_context *npu_context; + + /* + * At present we don't support GPUs connected to multiple NPUs and I'm + * not sure the hardware does either. + */ + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return ERR_PTR(-ENODEV); + + if (!npdev) + /* No nvlink associated with this GPU device */ + return ERR_PTR(-ENODEV); + + if (!mm || mm->context.id == 0) { + /* + * Kernel thread contexts are not supported and context id 0 is + * reserved on the GPU. + */ + return ERR_PTR(-EINVAL); + } + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + + /* + * Setup the NPU context table for a particular GPU. These need to be + * per-GPU as we need the tables to filter ATSDs when there are no + * active contexts on a particular GPU. + */ + rc = opal_npu_init_context(nphb->opal_id, mm->context.id, flags, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + if (rc < 0) + return ERR_PTR(-ENOSPC); + + /* + * We store the npu pci device so we can more easily get at the + * associated npus. + */ + npu_context = mm->context.npu_context; + if (!npu_context) { + npu_context = kzalloc(sizeof(struct npu_context), GFP_KERNEL); + if (!npu_context) + return ERR_PTR(-ENOMEM); + + mm->context.npu_context = npu_context; + npu_context->mm = mm; + npu_context->mn.ops = &nv_nmmu_notifier_ops; + __mmu_notifier_register(&npu_context->mn, mm); + kref_init(&npu_context->kref); + } else { + kref_get(&npu_context->kref); + } + + npu_context->release_cb = cb; + npu_context->priv = priv; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return ERR_PTR(-ENODEV); + npu_context->npdev[npu->index][nvlink_index] = npdev; + + return npu_context; +} +EXPORT_SYMBOL(pnv_npu2_init_context); + +static void pnv_npu2_release_context(struct kref *kref) +{ + struct npu_context *npu_context = + container_of(kref, struct npu_context, kref); + + npu_context->mm->context.npu_context = NULL; + mmu_notifier_unregister(&npu_context->mn, + npu_context->mm); + + kfree(npu_context); +} + +void pnv_npu2_destroy_context(struct npu_context *npu_context, + struct pci_dev *gpdev) +{ + struct pnv_phb *nphb; + struct npu *npu; + struct pci_dev *npdev = pnv_pci_get_npu_dev(gpdev, 0); + struct device_node *nvlink_dn; + u32 nvlink_index; + + if (WARN_ON(!npdev)) + return; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return; + + nphb = pci_bus_to_host(npdev->bus)->private_data; + npu = &nphb->npu; + nvlink_dn = of_parse_phandle(npdev->dev.of_node, "ibm,nvlink", 0); + if (WARN_ON(of_property_read_u32(nvlink_dn, "ibm,npu-link-index", + &nvlink_index))) + return; + npu_context->npdev[npu->index][nvlink_index] = NULL; + opal_npu_destroy_context(nphb->opal_id, npu_context->mm->context.id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn)); + kref_put(&npu_context->kref, pnv_npu2_release_context); +} +EXPORT_SYMBOL(pnv_npu2_destroy_context); + +/* + * Assumes mmap_sem is held for the contexts associated mm. + */ +int pnv_npu2_handle_fault(struct npu_context *context, uintptr_t *ea, + unsigned long *flags, unsigned long *status, int count) +{ + u64 rc = 0, result = 0; + int i, is_write; + struct page *page[1]; + + /* mmap_sem should be held so the struct_mm must be present */ + struct mm_struct *mm = context->mm; + + if (!firmware_has_feature(FW_FEATURE_OPAL)) + return -ENODEV; + + WARN_ON(!rwsem_is_locked(&mm->mmap_sem)); + + for (i = 0; i < count; i++) { + is_write = flags[i] & NPU2_WRITE; + rc = get_user_pages_remote(NULL, mm, ea[i], 1, + is_write ? FOLL_WRITE : 0, + page, NULL, NULL); + + /* + * To support virtualised environments we will have to do an + * access to the page to ensure it gets faulted into the + * hypervisor. For the moment virtualisation is not supported in + * other areas so leave the access out. + */ + if (rc != 1) { + status[i] = rc; + result = -EFAULT; + continue; + } + + status[i] = 0; + put_page(page[0]); + } + + return result; +} +EXPORT_SYMBOL(pnv_npu2_handle_fault); + +int pnv_npu2_init(struct pnv_phb *phb) +{ + unsigned int i; + u64 mmio_atsd; + struct device_node *dn; + struct pci_dev *gpdev; + static int npu_index; + uint64_t rc = 0; + + for_each_child_of_node(phb->hose->dn, dn) { + gpdev = pnv_pci_get_gpu_dev(get_pci_dev(dn)); + if (gpdev) { + rc = opal_npu_map_lpar(phb->opal_id, + PCI_DEVID(gpdev->bus->number, gpdev->devfn), + 0, 0); + if (rc) + dev_err(&gpdev->dev, + "Error %lld mapping device to LPAR\n", + rc); + } + } + + for (i = 0; !of_property_read_u64_index(phb->hose->dn, "ibm,mmio-atsd", + i, &mmio_atsd); i++) + phb->npu.mmio_atsd_regs[i] = ioremap(mmio_atsd, 32); + + pr_info("NPU%lld: Found %d MMIO ATSD registers", phb->opal_id, i); + phb->npu.mmio_atsd_count = i; + phb->npu.mmio_atsd_usage = 0; + npu_index++; + if (WARN_ON(npu_index >= NV_MAX_NPUS)) + return -ENOSPC; + max_npu2_index = npu_index; + phb->npu.index = npu_index; + + return 0; +} --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/opal-wrappers.S +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/opal-wrappers.S @@ -301,3 +301,21 @@ OPAL_CALL(opal_int_set_mfrr, OPAL_INT_SET_MFRR); OPAL_CALL(opal_pci_tce_kill, OPAL_PCI_TCE_KILL); OPAL_CALL(opal_nmmu_set_ptcr, OPAL_NMMU_SET_PTCR); +OPAL_CALL(opal_xive_reset, OPAL_XIVE_RESET); +OPAL_CALL(opal_xive_get_irq_info, OPAL_XIVE_GET_IRQ_INFO); +OPAL_CALL(opal_xive_get_irq_config, OPAL_XIVE_GET_IRQ_CONFIG); +OPAL_CALL(opal_xive_set_irq_config, OPAL_XIVE_SET_IRQ_CONFIG); +OPAL_CALL(opal_xive_get_queue_info, OPAL_XIVE_GET_QUEUE_INFO); +OPAL_CALL(opal_xive_set_queue_info, OPAL_XIVE_SET_QUEUE_INFO); +OPAL_CALL(opal_xive_donate_page, OPAL_XIVE_DONATE_PAGE); +OPAL_CALL(opal_xive_alloc_vp_block, OPAL_XIVE_ALLOCATE_VP_BLOCK); +OPAL_CALL(opal_xive_free_vp_block, OPAL_XIVE_FREE_VP_BLOCK); +OPAL_CALL(opal_xive_allocate_irq, OPAL_XIVE_ALLOCATE_IRQ); +OPAL_CALL(opal_xive_free_irq, OPAL_XIVE_FREE_IRQ); +OPAL_CALL(opal_xive_get_vp_info, OPAL_XIVE_GET_VP_INFO); +OPAL_CALL(opal_xive_set_vp_info, OPAL_XIVE_SET_VP_INFO); +OPAL_CALL(opal_xive_sync, OPAL_XIVE_SYNC); +OPAL_CALL(opal_xive_dump, OPAL_XIVE_DUMP); +OPAL_CALL(opal_npu_init_context, OPAL_NPU_INIT_CONTEXT); +OPAL_CALL(opal_npu_destroy_context, OPAL_NPU_DESTROY_CONTEXT); +OPAL_CALL(opal_npu_map_lpar, OPAL_NPU_MAP_LPAR); --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/opal.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/opal.c @@ -59,6 +59,8 @@ void opal_configure_cores(void) { + uint64_t reinit_flags = 0; + /* Do the actual re-init, This will clobber all FPRs, VRs, etc... * * It will preserve non volatile GPRs and HSPRG0/1. It will @@ -66,11 +68,24 @@ * but it might clobber a bunch. */ #ifdef __BIG_ENDIAN__ - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); + reinit_flags |= OPAL_REINIT_CPUS_HILE_BE; #else - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_LE); + reinit_flags |= OPAL_REINIT_CPUS_HILE_LE; #endif + /* + * POWER9 always support running hash: + * ie. Host hash supports hash guests + * Host radix supports hash/radix guests + */ + if (early_cpu_has_feature(CPU_FTR_ARCH_300)) { + reinit_flags |= OPAL_REINIT_CPUS_MMU_HASH; + if (early_radix_enabled()) + reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX; + } + + opal_reinit_cpus(reinit_flags); + /* Restore some bits */ if (cur_cpu_spec->cpu_restore) cur_cpu_spec->cpu_restore(); --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/pci-ioda.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/pci-ioda.c @@ -1262,6 +1262,8 @@ /* PE#0 is needed for error reporting */ pnv_ioda_reserve_pe(phb, 0); pnv_ioda_setup_npu_PEs(hose->bus); + if (phb->model == PNV_PHB_MODEL_NPU2) + pnv_npu2_init(phb); } } } @@ -1424,8 +1426,7 @@ iommu_group_put(pe->table_group.group); BUG_ON(pe->table_group.group); } - pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, of_node_full_name(dev->dev.of_node)); + iommu_tce_table_put(tbl); } static void pnv_ioda_release_vf_PE(struct pci_dev *pdev) @@ -1860,6 +1861,17 @@ return ret; } + +static int pnv_ioda1_tce_xchg_rm(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret) + pnv_pci_p7ioc_tce_invalidate(tbl, index, 1, true); + + return ret; +} #endif static void pnv_ioda1_tce_free(struct iommu_table *tbl, long index, @@ -1874,6 +1886,7 @@ .set = pnv_ioda1_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda1_tce_xchg, + .exchange_rm = pnv_ioda1_tce_xchg_rm, #endif .clear = pnv_ioda1_tce_free, .get = pnv_tce_get, @@ -1883,7 +1896,7 @@ #define PHB3_TCE_KILL_INVAL_PE PPC_BIT(1) #define PHB3_TCE_KILL_INVAL_ONE PPC_BIT(2) -void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +static void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm) { __be64 __iomem *invalidate = pnv_ioda_get_inval_reg(phb, rm); const unsigned long val = PHB3_TCE_KILL_INVAL_ALL; @@ -1948,7 +1961,7 @@ { struct iommu_table_group_link *tgl; - list_for_each_entry_rcu(tgl, &tbl->it_group_list, next) { + list_for_each_entry_lockless(tgl, &tbl->it_group_list, next) { struct pnv_ioda_pe *pe = container_of(tgl->table_group, struct pnv_ioda_pe, table_group); struct pnv_phb *phb = pe->phb; @@ -1979,6 +1992,14 @@ } } +void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm) +{ + if (phb->model == PNV_PHB_MODEL_NPU || phb->model == PNV_PHB_MODEL_PHB3) + pnv_pci_phb3_tce_invalidate_entire(phb, rm); + else + opal_pci_tce_kill(phb->opal_id, OPAL_PCI_TCE_KILL, 0, 0, 0, 0); +} + static int pnv_ioda2_tce_build(struct iommu_table *tbl, long index, long npages, unsigned long uaddr, enum dma_data_direction direction, @@ -2004,6 +2025,17 @@ return ret; } + +static int pnv_ioda2_tce_xchg_rm(struct iommu_table *tbl, long index, + unsigned long *hpa, enum dma_data_direction *direction) +{ + long ret = pnv_tce_xchg(tbl, index, hpa, direction); + + if (!ret) + pnv_pci_ioda2_tce_invalidate(tbl, index, 1, true); + + return ret; +} #endif static void pnv_ioda2_tce_free(struct iommu_table *tbl, long index, @@ -2017,13 +2049,13 @@ static void pnv_ioda2_table_free(struct iommu_table *tbl) { pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, "pnv"); } static struct iommu_table_ops pnv_ioda2_iommu_ops = { .set = pnv_ioda2_tce_build, #ifdef CONFIG_IOMMU_API .exchange = pnv_ioda2_tce_xchg, + .exchange_rm = pnv_ioda2_tce_xchg_rm, #endif .clear = pnv_ioda2_tce_free, .get = pnv_tce_get, @@ -2203,7 +2235,7 @@ __free_pages(tce_mem, get_order(tce32_segsz * segs)); if (tbl) { pnv_pci_unlink_table_and_group(tbl, &pe->table_group); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } } @@ -2293,16 +2325,16 @@ if (!tbl) return -ENOMEM; + tbl->it_ops = &pnv_ioda2_iommu_ops; + ret = pnv_pci_ioda2_table_alloc_pages(nid, bus_offset, page_shift, window_size, levels, tbl); if (ret) { - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); return ret; } - tbl->it_ops = &pnv_ioda2_iommu_ops; - *ptbl = tbl; return 0; @@ -2343,7 +2375,7 @@ if (rc) { pe_err(pe, "Failed to configure 32-bit TCE table, err %ld\n", rc); - pnv_ioda2_table_free(tbl); + iommu_tce_table_put(tbl); return rc; } @@ -2431,7 +2463,7 @@ pnv_pci_ioda2_unset_window(&pe->table_group, 0); if (pe->pbus) pnv_ioda_setup_bus_dma(pe, pe->pbus, false); - pnv_ioda2_table_free(tbl); + iommu_tce_table_put(tbl); } static void pnv_ioda2_release_ownership(struct iommu_table_group *table_group) @@ -3406,7 +3438,7 @@ } free_pages(tbl->it_base, get_order(tbl->it_size << 3)); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_pci_ioda2_release_pe_dma(struct pnv_ioda_pe *pe) @@ -3433,7 +3465,7 @@ } pnv_pci_ioda2_table_free_pages(tbl); - iommu_free_table(tbl, "pnv"); + iommu_tce_table_put(tbl); } static void pnv_ioda_free_pe_seg(struct pnv_ioda_pe *pe, --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/pci.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/pci.c @@ -767,6 +767,7 @@ tbl = kzalloc_node(sizeof(struct iommu_table), GFP_KERNEL, nid); INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); return tbl; } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/pci.h +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/pci.h @@ -7,6 +7,9 @@ struct pci_dn; +/* Maximum possible number of ATSD MMIO registers per NPU */ +#define NV_NMMU_ATSD_REGS 8 + enum pnv_phb_type { PNV_PHB_IODA1 = 0, PNV_PHB_IODA2 = 1, @@ -174,6 +177,16 @@ struct OpalIoP7IOCErrorData hub_diag; } diag; + /* Nvlink2 data */ + struct npu { + int index; + __be64 *mmio_atsd_regs[NV_NMMU_ATSD_REGS]; + unsigned int mmio_atsd_count; + + /* Bitmask for MMIO register usage */ + unsigned long mmio_atsd_usage; + } npu; + #ifdef CONFIG_CXL_BASE struct cxl_afu *cxl_afu; #endif @@ -229,14 +242,14 @@ /* Nvlink functions */ extern void pnv_npu_try_dma_set_bypass(struct pci_dev *gpdev, bool bypass); -extern void pnv_pci_phb3_tce_invalidate_entire(struct pnv_phb *phb, bool rm); +extern void pnv_pci_ioda2_tce_invalidate_entire(struct pnv_phb *phb, bool rm); extern struct pnv_ioda_pe *pnv_pci_npu_setup_iommu(struct pnv_ioda_pe *npe); extern long pnv_npu_set_window(struct pnv_ioda_pe *npe, int num, struct iommu_table *tbl); extern long pnv_npu_unset_window(struct pnv_ioda_pe *npe, int num); extern void pnv_npu_take_ownership(struct pnv_ioda_pe *npe); extern void pnv_npu_release_ownership(struct pnv_ioda_pe *npe); - +extern int pnv_npu2_init(struct pnv_phb *phb); /* cxl functions */ extern bool pnv_cxl_enable_device_hook(struct pci_dev *dev); --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/powernv.h +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/powernv.h @@ -18,8 +18,6 @@ #endif extern u32 pnv_get_supported_cpuidle_states(void); -extern u64 pnv_deepest_stop_psscr_val; -extern u64 pnv_deepest_stop_psscr_mask; extern void pnv_lpc_init(void); --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/setup.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/setup.c @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -76,7 +77,9 @@ static void __init pnv_init_IRQ(void) { - xics_init(); + /* Try using a XIVE if available, otherwise use a XICS */ + if (!xive_native_init()) + xics_init(); WARN_ON(!ppc_md.get_irq); } @@ -218,10 +221,14 @@ static void pnv_kexec_cpu_down(int crash_shutdown, int secondary) { - xics_kexec_teardown_cpu(secondary); + uint64_t reinit_flags; - /* On OPAL, we return all CPUs to firmware */ + if (xive_enabled()) + xive_kexec_teardown_cpu(secondary); + else + xics_kexec_teardown_cpu(secondary); + /* On OPAL, we return all CPUs to firmware */ if (!firmware_has_feature(FW_FEATURE_OPAL)) return; @@ -237,12 +244,23 @@ /* Primary waits for the secondaries to have reached OPAL */ pnv_kexec_wait_secondaries_down(); + /* Switch XIVE back to emulation mode */ + if (xive_enabled()) + xive_shutdown(); + /* * We might be running as little-endian - now that interrupts * are disabled, reset the HILE bit to big-endian so we don't * take interrupts in the wrong endian later + * + * We reinit to enable both radix and hash on P9 to ensure + * the mode used by the next kernel is always supported. */ - opal_reinit_cpus(OPAL_REINIT_CPUS_HILE_BE); + reinit_flags = OPAL_REINIT_CPUS_HILE_BE; + if (cpu_has_feature(CPU_FTR_ARCH_300)) + reinit_flags |= OPAL_REINIT_CPUS_MMU_RADIX | + OPAL_REINIT_CPUS_MMU_HASH; + opal_reinit_cpus(reinit_flags); } } #endif /* CONFIG_KEXEC_CORE */ --- linux-azure-4.11.0.orig/arch/powerpc/platforms/powernv/smp.c +++ linux-azure-4.11.0/arch/powerpc/platforms/powernv/smp.c @@ -29,12 +29,14 @@ #include #include #include +#include #include #include #include #include #include #include +#include #include "powernv.h" @@ -47,13 +49,10 @@ static void pnv_smp_setup_cpu(int cpu) { - if (cpu != boot_cpuid) + if (xive_enabled()) + xive_smp_setup_cpu(); + else if (cpu != boot_cpuid) xics_setup_cpu(); - -#ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); -#endif } static int pnv_smp_kick_cpu(int nr) @@ -132,7 +131,10 @@ vdso_data->processorCount--; if (cpu == boot_cpuid) boot_cpuid = cpumask_any(cpu_online_mask); - xics_migrate_irqs_away(); + if (xive_enabled()) + xive_smp_disable_cpu(); + else + xics_migrate_irqs_away(); return 0; } @@ -140,7 +142,6 @@ { unsigned int cpu; unsigned long srr1, wmask; - u32 idle_states; /* Standard hot unplug procedure */ local_irq_disable(); @@ -155,8 +156,6 @@ if (cpu_has_feature(CPU_FTR_ARCH_207S)) wmask = SRR1_WAKEMASK_P8; - idle_states = pnv_get_supported_cpuidle_states(); - /* We don't want to take decrementer interrupts while we are offline, * so clear LPCR:PECE1. We keep PECE2 (and LPCR_PECE_HVEE on P9) * enabled as to let IPIs in. @@ -184,19 +183,7 @@ kvmppc_set_host_ipi(cpu, 0); ppc64_runlatch_off(); - - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - srr1 = power9_idle_stop(pnv_deepest_stop_psscr_val, - pnv_deepest_stop_psscr_mask); - } else if (idle_states & OPAL_PM_WINKLE_ENABLED) { - srr1 = power7_winkle(); - } else if ((idle_states & OPAL_PM_SLEEP_ENABLED) || - (idle_states & OPAL_PM_SLEEP_ENABLED_ER1)) { - srr1 = power7_sleep(); - } else { - srr1 = power7_nap(1); - } - + srr1 = pnv_cpu_offline(cpu); ppc64_runlatch_on(); /* @@ -213,9 +200,12 @@ if (((srr1 & wmask) == SRR1_WAKEEE) || ((srr1 & wmask) == SRR1_WAKEHVI) || (local_paca->irq_happened & PACA_IRQ_EE)) { - if (cpu_has_feature(CPU_FTR_ARCH_300)) - icp_opal_flush_interrupt(); - else + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (xive_enabled()) + xive_flush_interrupt(); + else + icp_opal_flush_interrupt(); + } else icp_native_flush_interrupt(); } else if ((srr1 & wmask) == SRR1_WAKEHDBELL) { unsigned long msg = PPC_DBELL_TYPE(PPC_DBELL_SERVER); @@ -252,10 +242,68 @@ return smp_generic_cpu_bootable(nr); } +static int pnv_smp_prepare_cpu(int cpu) +{ + if (xive_enabled()) + return xive_smp_prepare_cpu(cpu); + return 0; +} + +/* Cause IPI as setup by the interrupt controller (xics or xive) */ +static void (*ic_cause_ipi)(int cpu); + +static void pnv_cause_ipi(int cpu) +{ + if (doorbell_try_core_ipi(cpu)) + return; + + ic_cause_ipi(cpu); +} + +static void pnv_p9_dd1_cause_ipi(int cpu) +{ + int this_cpu = get_cpu(); + + /* + * POWER9 DD1 has a global addressed msgsnd, but for now we restrict + * IPIs to same core, because it requires additional synchronization + * for inter-core doorbells which we do not implement. + */ + if (cpumask_test_cpu(cpu, cpu_sibling_mask(this_cpu))) + doorbell_global_ipi(cpu); + else + ic_cause_ipi(cpu); + + put_cpu(); +} + +static void __init pnv_smp_probe(void) +{ + if (xive_enabled()) + xive_smp_probe(); + else + xics_smp_probe(); + + if (cpu_has_feature(CPU_FTR_DBELL)) { + ic_cause_ipi = smp_ops->cause_ipi; + WARN_ON(!ic_cause_ipi); + + if (cpu_has_feature(CPU_FTR_ARCH_300)) { + if (cpu_has_feature(CPU_FTR_POWER9_DD1)) + smp_ops->cause_ipi = pnv_p9_dd1_cause_ipi; + else + smp_ops->cause_ipi = doorbell_global_ipi; + } else { + smp_ops->cause_ipi = pnv_cause_ipi; + } + } +} + static struct smp_ops_t pnv_smp_ops = { - .message_pass = smp_muxed_ipi_message_pass, - .cause_ipi = NULL, /* Filled at runtime by xics_smp_probe() */ - .probe = xics_smp_probe, + .message_pass = NULL, /* Use smp_muxed_ipi_message_pass */ + .cause_ipi = NULL, /* Filled at runtime by pnv_smp_probe() */ + .probe = pnv_smp_probe, + .prepare_cpu = pnv_smp_prepare_cpu, .kick_cpu = pnv_smp_kick_cpu, .setup_cpu = pnv_smp_setup_cpu, .cpu_bootable = pnv_cpu_bootable, --- linux-azure-4.11.0.orig/arch/powerpc/platforms/pseries/dlpar.c +++ linux-azure-4.11.0/arch/powerpc/platforms/pseries/dlpar.c @@ -288,7 +288,6 @@ if (rc) return rc; - of_node_put(dn); /* Must decrement the refcount */ return 0; } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/pseries/hotplug-memory.c +++ linux-azure-4.11.0/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -124,6 +124,7 @@ for (i = 0; i < num_lmbs; i++) { lmbs[i].base_addr = be64_to_cpu(lmbs[i].base_addr); lmbs[i].drc_index = be32_to_cpu(lmbs[i].drc_index); + lmbs[i].aa_index = be32_to_cpu(lmbs[i].aa_index); lmbs[i].flags = be32_to_cpu(lmbs[i].flags); } @@ -147,6 +148,7 @@ for (i = 0; i < num_lmbs; i++) { lmbs[i].base_addr = cpu_to_be64(lmbs[i].base_addr); lmbs[i].drc_index = cpu_to_be32(lmbs[i].drc_index); + lmbs[i].aa_index = cpu_to_be32(lmbs[i].aa_index); lmbs[i].flags = cpu_to_be32(lmbs[i].flags); } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/pseries/iommu.c +++ linux-azure-4.11.0/arch/powerpc/platforms/pseries/iommu.c @@ -74,6 +74,7 @@ goto fail_exit; INIT_LIST_HEAD_RCU(&tbl->it_group_list); + kref_init(&tbl->it_kref); tgl->table_group = table_group; list_add_rcu(&tgl->next, &tbl->it_group_list); @@ -115,7 +116,7 @@ BUG_ON(table_group->group); } #endif - iommu_free_table(tbl, node_name); + iommu_tce_table_put(tbl); kfree(table_group); } --- linux-azure-4.11.0.orig/arch/powerpc/platforms/pseries/smp.c +++ linux-azure-4.11.0/arch/powerpc/platforms/pseries/smp.c @@ -55,11 +55,6 @@ */ static cpumask_var_t of_spin_mask; -/* - * If we multiplex IPI mechanisms, store the appropriate XICS IPI mechanism here - */ -static void (*xics_cause_ipi)(int cpu, unsigned long data); - /* Query where a cpu is now. Return codes #defined in plpar_wrappers.h */ int smp_query_cpu_stopped(unsigned int pcpu) { @@ -143,8 +138,6 @@ { if (cpu != boot_cpuid) xics_setup_cpu(); - if (cpu_has_feature(CPU_FTR_DBELL)) - doorbell_setup_this_cpu(); if (firmware_has_feature(FW_FEATURE_SPLPAR)) vpa_init(cpu); @@ -187,23 +180,23 @@ return 0; } -/* Only used on systems that support multiple IPI mechanisms */ -static void pSeries_cause_ipi_mux(int cpu, unsigned long data) +static void smp_pseries_cause_ipi(int cpu) { - if (cpumask_test_cpu(cpu, cpu_sibling_mask(smp_processor_id()))) - doorbell_cause_ipi(cpu, data); - else - xics_cause_ipi(cpu, data); + /* POWER9 should not use this handler */ + if (doorbell_try_core_ipi(cpu)) + return; + + icp_ops->cause_ipi(cpu); } static __init void pSeries_smp_probe(void) { xics_smp_probe(); - if (cpu_has_feature(CPU_FTR_DBELL)) { - xics_cause_ipi = smp_ops->cause_ipi; - smp_ops->cause_ipi = pSeries_cause_ipi_mux; - } + if (cpu_has_feature(CPU_FTR_DBELL)) + smp_ops->cause_ipi = smp_pseries_cause_ipi; + else + smp_ops->cause_ipi = icp_ops->cause_ipi; } static struct smp_ops_t pseries_smp_ops = { --- linux-azure-4.11.0.orig/arch/powerpc/platforms/pseries/vio.c +++ linux-azure-4.11.0/arch/powerpc/platforms/pseries/vio.c @@ -1318,7 +1318,7 @@ struct iommu_table *tbl = get_iommu_table_base(dev); if (tbl) - iommu_free_table(tbl, of_node_full_name(dev->of_node)); + iommu_tce_table_put(tbl); of_node_put(dev->of_node); kfree(to_vio_dev(dev)); } --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/Kconfig +++ linux-azure-4.11.0/arch/powerpc/sysdev/Kconfig @@ -28,6 +28,7 @@ default y if PPC_POWERNV source "arch/powerpc/sysdev/xics/Kconfig" +source "arch/powerpc/sysdev/xive/Kconfig" config PPC_SCOM bool --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/Makefile +++ linux-azure-4.11.0/arch/powerpc/sysdev/Makefile @@ -71,5 +71,6 @@ subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror obj-$(CONFIG_PPC_XICS) += xics/ +obj-$(CONFIG_PPC_XIVE) += xive/ obj-$(CONFIG_GE_FPGA) += ge/ --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/simple_gpio.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/simple_gpio.c @@ -75,7 +75,8 @@ static void u8_gpio_save_regs(struct of_mm_gpio_chip *mm_gc) { - struct u8_gpio_chip *u8_gc = gpiochip_get_data(&mm_gc->gc); + struct u8_gpio_chip *u8_gc = + container_of(mm_gc, struct u8_gpio_chip, mm_gc); u8_gc->data = in_8(mm_gc->regs); } --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xics/icp-hv.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/xics/icp-hv.c @@ -138,7 +138,7 @@ #ifdef CONFIG_SMP -static void icp_hv_cause_ipi(int cpu, unsigned long data) +static void icp_hv_cause_ipi(int cpu) { icp_hv_set_qirr(cpu, IPI_PRIORITY); } --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xics/icp-native.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/xics/icp-native.c @@ -143,19 +143,9 @@ #ifdef CONFIG_SMP -static void icp_native_cause_ipi(int cpu, unsigned long data) +static void icp_native_cause_ipi(int cpu) { kvmppc_set_host_ipi(cpu, 1); -#ifdef CONFIG_PPC_DOORBELL - if (cpu_has_feature(CPU_FTR_DBELL)) { - if (cpumask_test_cpu(cpu, cpu_sibling_mask(get_cpu()))) { - doorbell_cause_ipi(cpu, data); - put_cpu(); - return; - } - put_cpu(); - } -#endif icp_native_set_qirr(cpu, IPI_PRIORITY); } --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xics/icp-opal.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/xics/icp-opal.c @@ -126,7 +126,7 @@ #ifdef CONFIG_SMP -static void icp_opal_cause_ipi(int cpu, unsigned long data) +static void icp_opal_cause_ipi(int cpu) { int hw_cpu = get_hard_smp_processor_id(cpu); --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xics/xics-common.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/xics/xics-common.c @@ -143,11 +143,11 @@ void __init xics_smp_probe(void) { - /* Setup cause_ipi callback based on which ICP is used */ - smp_ops->cause_ipi = icp_ops->cause_ipi; - /* Register all the IPIs */ xics_request_ipi(); + + /* Setup cause_ipi callback based on which ICP is used */ + smp_ops->cause_ipi = icp_ops->cause_ipi; } #endif /* CONFIG_SMP */ --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xive/Kconfig +++ linux-azure-4.11.0/arch/powerpc/sysdev/xive/Kconfig @@ -0,0 +1,11 @@ +config PPC_XIVE + bool + default n + select PPC_SMP_MUXED_IPI + select HARDIRQS_SW_RESEND + +config PPC_XIVE_NATIVE + bool + default n + select PPC_XIVE + depends on PPC_POWERNV --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xive/Makefile +++ linux-azure-4.11.0/arch/powerpc/sysdev/xive/Makefile @@ -0,0 +1,4 @@ +subdir-ccflags-$(CONFIG_PPC_WERROR) := -Werror + +obj-y += common.o +obj-$(CONFIG_PPC_XIVE_NATIVE) += native.o --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xive/common.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/xive/common.c @@ -0,0 +1,1302 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xive-internal.h" + +#undef DEBUG_FLUSH +#undef DEBUG_ALL + +#ifdef DEBUG_ALL +#define DBG_VERBOSE(fmt...) pr_devel(fmt) +#else +#define DBG_VERBOSE(fmt...) do { } while(0) +#endif + +bool __xive_enabled; +bool xive_cmdline_disabled; + +/* We use only one priority for now */ +static u8 xive_irq_priority; + +/* TIMA */ +void __iomem *xive_tima; +u32 xive_tima_offset; + +/* Backend ops */ +static const struct xive_ops *xive_ops; + +/* Our global interrupt domain */ +static struct irq_domain *xive_irq_domain; + +#ifdef CONFIG_SMP +/* The IPIs all use the same logical irq number */ +static u32 xive_ipi_irq; +#endif + +/* Xive state for each CPU */ +static DEFINE_PER_CPU(struct xive_cpu *, xive_cpu); + +/* + * A "disabled" interrupt should never fire, to catch problems + * we set its logical number to this + */ +#define XIVE_BAD_IRQ 0x7fffffff +#define XIVE_MAX_IRQ (XIVE_BAD_IRQ - 1) + +/* An invalid CPU target */ +#define XIVE_INVALID_TARGET (-1) + +/* + * Read the next entry in a queue, return its content if it's valid + * or 0 if there is no new entry. + * + * The queue pointer is moved forward unless "just_peek" is set + */ +static u32 xive_read_eq(struct xive_q *q, bool just_peek) +{ + u32 cur; + + if (!q->qpage) + return 0; + cur = be32_to_cpup(q->qpage + q->idx); + + /* Check valid bit (31) vs current toggle polarity */ + if ((cur >> 31) == q->toggle) + return 0; + + /* If consuming from the queue ... */ + if (!just_peek) { + /* Next entry */ + q->idx = (q->idx + 1) & q->msk; + + /* Wrap around: flip valid toggle */ + if (q->idx == 0) + q->toggle ^= 1; + } + /* Mask out the valid bit (31) */ + return cur & 0x7fffffff; +} + +/* + * Scans all the queue that may have interrupts in them + * (based on "pending_prio") in priority order until an + * interrupt is found or all the queues are empty. + * + * Then updates the CPPR (Current Processor Priority + * Register) based on the most favored interrupt found + * (0xff if none) and return what was found (0 if none). + * + * If just_peek is set, return the most favored pending + * interrupt if any but don't update the queue pointers. + * + * Note: This function can operate generically on any number + * of queues (up to 8). The current implementation of the XIVE + * driver only uses a single queue however. + * + * Note2: This will also "flush" "the pending_count" of a queue + * into the "count" when that queue is observed to be empty. + * This is used to keep track of the amount of interrupts + * targetting a queue. When an interrupt is moved away from + * a queue, we only decrement that queue count once the queue + * has been observed empty to avoid races. + */ +static u32 xive_scan_interrupts(struct xive_cpu *xc, bool just_peek) +{ + u32 irq = 0; + u8 prio; + + /* Find highest pending priority */ + while (xc->pending_prio != 0) { + struct xive_q *q; + + prio = ffs(xc->pending_prio) - 1; + DBG_VERBOSE("scan_irq: trying prio %d\n", prio); + + /* Try to fetch */ + irq = xive_read_eq(&xc->queue[prio], just_peek); + + /* Found something ? That's it */ + if (irq) + break; + + /* Clear pending bits */ + xc->pending_prio &= ~(1 << prio); + + /* + * Check if the queue count needs adjusting due to + * interrupts being moved away. See description of + * xive_dec_target_count() + */ + q = &xc->queue[prio]; + if (atomic_read(&q->pending_count)) { + int p = atomic_xchg(&q->pending_count, 0); + if (p) { + WARN_ON(p > atomic_read(&q->count)); + atomic_sub(p, &q->count); + } + } + } + + /* If nothing was found, set CPPR to 0xff */ + if (irq == 0) + prio = 0xff; + + /* Update HW CPPR to match if necessary */ + if (prio != xc->cppr) { + DBG_VERBOSE("scan_irq: adjusting CPPR to %d\n", prio); + xc->cppr = prio; + out_8(xive_tima + xive_tima_offset + TM_CPPR, prio); + } + + return irq; +} + +/* + * This is used to perform the magic loads from an ESB + * described in xive.h + */ +static u8 xive_poke_esb(struct xive_irq_data *xd, u32 offset) +{ + u64 val; + + /* Handle HW errata */ + if (xd->flags & XIVE_IRQ_FLAG_SHIFT_BUG) + offset |= offset << 4; + + val = in_be64(xd->eoi_mmio + offset); + + return (u8)val; +} + +#ifdef CONFIG_XMON +static void xive_dump_eq(const char *name, struct xive_q *q) +{ + u32 i0, i1, idx; + + if (!q->qpage) + return; + idx = q->idx; + i0 = be32_to_cpup(q->qpage + idx); + idx = (idx + 1) & q->msk; + i1 = be32_to_cpup(q->qpage + idx); + xmon_printf(" %s Q T=%d %08x %08x ...\n", name, + q->toggle, i0, i1); +} + +void xmon_xive_do_dump(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + + xmon_printf("XIVE state for CPU %d:\n", cpu); + xmon_printf(" pp=%02x cppr=%02x\n", xc->pending_prio, xc->cppr); + xive_dump_eq("IRQ", &xc->queue[xive_irq_priority]); +#ifdef CONFIG_SMP + { + u64 val = xive_poke_esb(&xc->ipi_data, XIVE_ESB_GET); + xmon_printf(" IPI state: %x:%c%c\n", xc->hw_ipi, + val & XIVE_ESB_VAL_P ? 'P' : 'p', + val & XIVE_ESB_VAL_P ? 'Q' : 'q'); + } +#endif +} +#endif /* CONFIG_XMON */ + +static unsigned int xive_get_irq(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + u32 irq; + + /* + * This can be called either as a result of a HW interrupt or + * as a "replay" because EOI decided there was still something + * in one of the queues. + * + * First we perform an ACK cycle in order to update our mask + * of pending priorities. This will also have the effect of + * updating the CPPR to the most favored pending interrupts. + * + * In the future, if we have a way to differenciate a first + * entry (on HW interrupt) from a replay triggered by EOI, + * we could skip this on replays unless we soft-mask tells us + * that a new HW interrupt occurred. + */ + xive_ops->update_pending(xc); + + DBG_VERBOSE("get_irq: pending=%02x\n", xc->pending_prio); + + /* Scan our queue(s) for interrupts */ + irq = xive_scan_interrupts(xc, false); + + DBG_VERBOSE("get_irq: got irq 0x%x, new pending=0x%02x\n", + irq, xc->pending_prio); + + /* Return pending interrupt if any */ + if (irq == XIVE_BAD_IRQ) + return 0; + return irq; +} + +/* + * After EOI'ing an interrupt, we need to re-check the queue + * to see if another interrupt is pending since multiple + * interrupts can coalesce into a single notification to the + * CPU. + * + * If we find that there is indeed more in there, we call + * force_external_irq_replay() to make Linux synthetize an + * external interrupt on the next call to local_irq_restore(). + */ +static void xive_do_queue_eoi(struct xive_cpu *xc) +{ + if (xive_scan_interrupts(xc, true) != 0) { + DBG_VERBOSE("eoi: pending=0x%02x\n", xc->pending_prio); + force_external_irq_replay(); + } +} + +/* + * EOI an interrupt at the source. There are several methods + * to do this depending on the HW version and source type + */ +void xive_do_source_eoi(u32 hw_irq, struct xive_irq_data *xd) +{ + /* If the XIVE supports the new "store EOI facility, use it */ + if (xd->flags & XIVE_IRQ_FLAG_STORE_EOI) + out_be64(xd->eoi_mmio, 0); + else if (hw_irq && xd->flags & XIVE_IRQ_FLAG_EOI_FW) { + /* + * The FW told us to call it. This happens for some + * interrupt sources that need additional HW whacking + * beyond the ESB manipulation. For example LPC interrupts + * on P9 DD1.0 need a latch to be clared in the LPC bridge + * itself. The Firmware will take care of it. + */ + if (WARN_ON_ONCE(!xive_ops->eoi)) + return; + xive_ops->eoi(hw_irq); + } else { + u8 eoi_val; + + /* + * Otherwise for EOI, we use the special MMIO that does + * a clear of both P and Q and returns the old Q, + * except for LSIs where we use the "EOI cycle" special + * load. + * + * This allows us to then do a re-trigger if Q was set + * rather than synthesizing an interrupt in software + * + * For LSIs, using the HW EOI cycle works around a problem + * on P9 DD1 PHBs where the other ESB accesses don't work + * properly. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + in_be64(xd->eoi_mmio); + else { + eoi_val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); + DBG_VERBOSE("eoi_val=%x\n", offset, eoi_val); + + /* Re-trigger if needed */ + if ((eoi_val & XIVE_ESB_VAL_Q) && xd->trig_mmio) + out_be64(xd->trig_mmio, 0); + } + } +} + +/* irq_chip eoi callback */ +static void xive_irq_eoi(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + DBG_VERBOSE("eoi_irq: irq=%d [0x%lx] pending=%02x\n", + d->irq, irqd_to_hwirq(d), xc->pending_prio); + + /* EOI the source if it hasn't been disabled */ + if (!irqd_irq_disabled(d)) + xive_do_source_eoi(irqd_to_hwirq(d), xd); + + /* + * Clear saved_p to indicate that it's no longer occupying + * a queue slot on the target queue + */ + xd->saved_p = false; + + /* Check for more work in the queue */ + xive_do_queue_eoi(xc); +} + +/* + * Helper used to mask and unmask an interrupt source. This + * is only called for normal interrupts that do not require + * masking/unmasking via firmware. + */ +static void xive_do_source_set_mask(struct xive_irq_data *xd, + bool mask) +{ + u64 val; + + /* + * If the interrupt had P set, it may be in a queue. + * + * We need to make sure we don't re-enable it until it + * has been fetched from that queue and EOId. We keep + * a copy of that P state and use it to restore the + * ESB accordingly on unmask. + */ + if (mask) { + val = xive_poke_esb(xd, XIVE_ESB_SET_PQ_01); + xd->saved_p = !!(val & XIVE_ESB_VAL_P); + } else if (xd->saved_p) + xive_poke_esb(xd, XIVE_ESB_SET_PQ_10); + else + xive_poke_esb(xd, XIVE_ESB_SET_PQ_00); +} + +/* + * Try to chose "cpu" as a new interrupt target. Increments + * the queue accounting for that target if it's not already + * full. + */ +static bool xive_try_pick_target(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + int max; + + /* + * Calculate max number of interrupts in that queue. + * + * We leave a gap of 1 just in case... + */ + max = (q->msk + 1) - 1; + return !!atomic_add_unless(&q->count, 1, max); +} + +/* + * Un-account an interrupt for a target CPU. We don't directly + * decrement q->count since the interrupt might still be present + * in the queue. + * + * Instead increment a separate counter "pending_count" which + * will be substracted from "count" later when that CPU observes + * the queue to be empty. + */ +static void xive_dec_target_count(int cpu) +{ + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + struct xive_q *q = &xc->queue[xive_irq_priority]; + + if (unlikely(WARN_ON(cpu < 0 || !xc))) { + pr_err("%s: cpu=%d xc=%p\n", __func__, cpu, xc); + return; + } + + /* + * We increment the "pending count" which will be used + * to decrement the target queue count whenever it's next + * processed and found empty. This ensure that we don't + * decrement while we still have the interrupt there + * occupying a slot. + */ + atomic_inc(&q->pending_count); +} + +/* Find a tentative CPU target in a CPU mask */ +static int xive_find_target_in_mask(const struct cpumask *mask, + unsigned int fuzz) +{ + int cpu, first, num, i; + + /* Pick up a starting point CPU in the mask based on fuzz */ + num = cpumask_weight(mask); + first = fuzz % num; + + /* Locate it */ + cpu = cpumask_first(mask); + for (i = 0; i < first && cpu < nr_cpu_ids; i++) + cpu = cpumask_next(cpu, mask); + + /* Sanity check */ + if (WARN_ON(cpu >= nr_cpu_ids)) + cpu = cpumask_first(cpu_online_mask); + + /* Remember first one to handle wrap-around */ + first = cpu; + + /* + * Now go through the entire mask until we find a valid + * target. + */ + for (;;) { + /* + * We re-check online as the fallback case passes us + * an untested affinity mask + */ + if (cpu_online(cpu) && xive_try_pick_target(cpu)) + return cpu; + cpu = cpumask_next(cpu, mask); + if (cpu == first) + break; + /* Wrap around */ + if (cpu >= nr_cpu_ids) + cpu = cpumask_first(mask); + } + return -1; +} + +/* + * Pick a target CPU for an interrupt. This is done at + * startup or if the affinity is changed in a way that + * invalidates the current target. + */ +static int xive_pick_irq_target(struct irq_data *d, + const struct cpumask *affinity) +{ + static unsigned int fuzz; + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + cpumask_var_t mask; + int cpu = -1; + + /* + * If we have chip IDs, first we try to build a mask of + * CPUs matching the CPU and find a target in there + */ + if (xd->src_chip != XIVE_INVALID_CHIP_ID && + zalloc_cpumask_var(&mask, GFP_ATOMIC)) { + /* Build a mask of matching chip IDs */ + for_each_cpu_and(cpu, affinity, cpu_online_mask) { + struct xive_cpu *xc = per_cpu(xive_cpu, cpu); + if (xc->chip_id == xd->src_chip) + cpumask_set_cpu(cpu, mask); + } + /* Try to find a target */ + if (cpumask_empty(mask)) + cpu = -1; + else + cpu = xive_find_target_in_mask(mask, fuzz++); + free_cpumask_var(mask); + if (cpu >= 0) + return cpu; + fuzz--; + } + + /* No chip IDs, fallback to using the affinity mask */ + return xive_find_target_in_mask(affinity, fuzz++); +} + +static unsigned int xive_irq_startup(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + int target, rc; + + pr_devel("xive_irq_startup: irq %d [0x%x] data @%p\n", + d->irq, hw_irq, d); + +#ifdef CONFIG_PCI_MSI + /* + * The generic MSI code returns with the interrupt disabled on the + * card, using the MSI mask bits. Firmware doesn't appear to unmask + * at that level, so we do it here by hand. + */ + if (irq_data_get_msi_desc(d)) + pci_msi_unmask_irq(d); +#endif + + /* Pick a target */ + target = xive_pick_irq_target(d, irq_data_get_affinity_mask(d)); + if (target == XIVE_INVALID_TARGET) { + /* Try again breaking affinity */ + target = xive_pick_irq_target(d, cpu_online_mask); + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + pr_warn("irq %d started with broken affinity\n", d->irq); + } + + /* Sanity check */ + if (WARN_ON(target == XIVE_INVALID_TARGET || + target >= nr_cpu_ids)) + target = smp_processor_id(); + + xd->target = target; + + /* + * Configure the logical number to be the Linux IRQ number + * and set the target queue + */ + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc) + return rc; + + /* Unmask the ESB */ + xive_do_source_set_mask(xd, false); + + return 0; +} + +static void xive_irq_shutdown(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + pr_devel("xive_irq_shutdown: irq %d [0x%x] data @%p\n", + d->irq, hw_irq, d); + + if (WARN_ON(xd->target == XIVE_INVALID_TARGET)) + return; + + /* Mask the interrupt at the source */ + xive_do_source_set_mask(xd, true); + + /* + * The above may have set saved_p. We clear it otherwise it + * will prevent re-enabling later on. It is ok to forget the + * fact that the interrupt might be in a queue because we are + * accounting that already in xive_dec_target_count() and will + * be re-routing it to a new queue with proper accounting when + * it's started up again + */ + xd->saved_p = false; + + /* + * Mask the interrupt in HW in the IVT/EAS and set the number + * to be the "bad" IRQ number + */ + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, XIVE_BAD_IRQ); + + xive_dec_target_count(xd->target); + xd->target = XIVE_INVALID_TARGET; +} + +static void xive_irq_unmask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_devel("xive_irq_unmask: irq %d data @%p\n", d->irq, xd); + + /* + * This is a workaround for PCI LSI problems on P9, for + * these, we call FW to set the mask. The problems might + * be fixed by P9 DD2.0, if that is the case, firmware + * will no longer set that flag. + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + xive_irq_priority, d->irq); + return; + } + + xive_do_source_set_mask(xd, false); +} + +static void xive_irq_mask(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + pr_devel("xive_irq_mask: irq %d data @%p\n", d->irq, xd); + + /* + * This is a workaround for PCI LSI problems on P9, for + * these, we call OPAL to set the mask. The problems might + * be fixed by P9 DD2.0, if that is the case, firmware + * will no longer set that flag. + */ + if (xd->flags & XIVE_IRQ_FLAG_MASK_FW) { + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(xd->target), + 0xff, d->irq); + return; + } + + xive_do_source_set_mask(xd, true); +} + +static int xive_irq_set_affinity(struct irq_data *d, + const struct cpumask *cpumask, + bool force) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + u32 target, old_target; + int rc = 0; + + pr_devel("xive_irq_set_affinity: irq %d\n", d->irq); + + /* Is this valid ? */ + if (cpumask_any_and(cpumask, cpu_online_mask) >= nr_cpu_ids) + return -EINVAL; + + /* + * If existing target is already in the new mask, and is + * online then do nothing. + */ + if (xd->target != XIVE_INVALID_TARGET && + cpu_online(xd->target) && + cpumask_test_cpu(xd->target, cpumask)) + return IRQ_SET_MASK_OK; + + /* Pick a new target */ + target = xive_pick_irq_target(d, cpumask); + + /* No target found */ + if (target == XIVE_INVALID_TARGET) + return -ENXIO; + + /* Sanity check */ + if (WARN_ON(target >= nr_cpu_ids)) + target = smp_processor_id(); + + old_target = xd->target; + + rc = xive_ops->configure_irq(hw_irq, + get_hard_smp_processor_id(target), + xive_irq_priority, d->irq); + if (rc < 0) { + pr_err("Error %d reconfiguring irq %d\n", rc, d->irq); + return rc; + } + + pr_devel(" target: 0x%x\n", target); + xd->target = target; + + /* Give up previous target */ + if (old_target != XIVE_INVALID_TARGET) + xive_dec_target_count(old_target); + + return IRQ_SET_MASK_OK; +} + +static int xive_irq_set_type(struct irq_data *d, unsigned int flow_type) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* + * We only support these. This has really no effect other than setting + * the corresponding descriptor bits mind you but those will in turn + * affect the resend function when re-enabling an edge interrupt. + * + * Set set the default to edge as explained in map(). + */ + if (flow_type == IRQ_TYPE_DEFAULT || flow_type == IRQ_TYPE_NONE) + flow_type = IRQ_TYPE_EDGE_RISING; + + if (flow_type != IRQ_TYPE_EDGE_RISING && + flow_type != IRQ_TYPE_LEVEL_LOW) + return -EINVAL; + + irqd_set_trigger_type(d, flow_type); + + /* + * Double check it matches what the FW thinks + * + * NOTE: We don't know yet if the PAPR interface will provide + * the LSI vs MSI information apart from the device-tree so + * this check might have to move into an optional backend call + * that is specific to the native backend + */ + if ((flow_type == IRQ_TYPE_LEVEL_LOW) != + !!(xd->flags & XIVE_IRQ_FLAG_LSI)) { + pr_warn("Interrupt %d (HW 0x%x) type mismatch, Linux says %s, FW says %s\n", + d->irq, (u32)irqd_to_hwirq(d), + (flow_type == IRQ_TYPE_LEVEL_LOW) ? "Level" : "Edge", + (xd->flags & XIVE_IRQ_FLAG_LSI) ? "Level" : "Edge"); + } + + return IRQ_SET_MASK_OK_NOCOPY; +} + +static int xive_irq_retrigger(struct irq_data *d) +{ + struct xive_irq_data *xd = irq_data_get_irq_handler_data(d); + + /* This should be only for MSIs */ + if (WARN_ON(xd->flags & XIVE_IRQ_FLAG_LSI)) + return 0; + + /* + * To perform a retrigger, we first set the PQ bits to + * 11, then perform an EOI. + */ + xive_poke_esb(xd, XIVE_ESB_SET_PQ_11); + + /* + * Note: We pass "0" to the hw_irq argument in order to + * avoid calling into the backend EOI code which we don't + * want to do in the case of a re-trigger. Backends typically + * only do EOI for LSIs anyway. + */ + xive_do_source_eoi(0, xd); + + return 1; +} + +static struct irq_chip xive_irq_chip = { + .name = "XIVE-IRQ", + .irq_startup = xive_irq_startup, + .irq_shutdown = xive_irq_shutdown, + .irq_eoi = xive_irq_eoi, + .irq_mask = xive_irq_mask, + .irq_unmask = xive_irq_unmask, + .irq_set_affinity = xive_irq_set_affinity, + .irq_set_type = xive_irq_set_type, + .irq_retrigger = xive_irq_retrigger, +}; + +bool is_xive_irq(struct irq_chip *chip) +{ + return chip == &xive_irq_chip; +} + +void xive_cleanup_irq_data(struct xive_irq_data *xd) +{ + if (xd->eoi_mmio) { + iounmap(xd->eoi_mmio); + if (xd->eoi_mmio == xd->trig_mmio) + xd->trig_mmio = NULL; + xd->eoi_mmio = NULL; + } + if (xd->trig_mmio) { + iounmap(xd->trig_mmio); + xd->trig_mmio = NULL; + } +} + +static int xive_irq_alloc_data(unsigned int virq, irq_hw_number_t hw) +{ + struct xive_irq_data *xd; + int rc; + + xd = kzalloc(sizeof(struct xive_irq_data), GFP_KERNEL); + if (!xd) + return -ENOMEM; + rc = xive_ops->populate_irq_data(hw, xd); + if (rc) { + kfree(xd); + return rc; + } + xd->target = XIVE_INVALID_TARGET; + irq_set_handler_data(virq, xd); + + return 0; +} + +static void xive_irq_free_data(unsigned int virq) +{ + struct xive_irq_data *xd = irq_get_handler_data(virq); + + if (!xd) + return; + irq_set_handler_data(virq, NULL); + xive_cleanup_irq_data(xd); + kfree(xd); +} + +#ifdef CONFIG_SMP + +static void xive_cause_ipi(int cpu) +{ + struct xive_cpu *xc; + struct xive_irq_data *xd; + + xc = per_cpu(xive_cpu, cpu); + + DBG_VERBOSE("IPI CPU %d -> %d (HW IRQ 0x%x)\n", + smp_processor_id(), cpu, xc->hw_ipi); + + xd = &xc->ipi_data; + if (WARN_ON(!xd->trig_mmio)) + return; + out_be64(xd->trig_mmio, 0); +} + +static irqreturn_t xive_muxed_ipi_action(int irq, void *dev_id) +{ + return smp_ipi_demux(); +} + +static void xive_ipi_eoi(struct irq_data *d) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Handle possible race with unplug and drop stale IPIs */ + if (!xc) + return; + xive_do_source_eoi(xc->hw_ipi, &xc->ipi_data); + xive_do_queue_eoi(xc); +} + +static void xive_ipi_do_nothing(struct irq_data *d) +{ + /* + * Nothing to do, we never mask/unmask IPIs, but the callback + * has to exist for the struct irq_chip. + */ +} + +static struct irq_chip xive_ipi_chip = { + .name = "XIVE-IPI", + .irq_eoi = xive_ipi_eoi, + .irq_mask = xive_ipi_do_nothing, + .irq_unmask = xive_ipi_do_nothing, +}; + +static void __init xive_request_ipi(void) +{ + unsigned int virq; + + /* + * Initialization failed, move on, we might manage to + * reach the point where we display our errors before + * the system falls appart + */ + if (!xive_irq_domain) + return; + + /* Initialize it */ + virq = irq_create_mapping(xive_irq_domain, 0); + xive_ipi_irq = virq; + + WARN_ON(request_irq(virq, xive_muxed_ipi_action, + IRQF_PERCPU | IRQF_NO_THREAD, "IPI", NULL)); +} + +static int xive_setup_cpu_ipi(unsigned int cpu) +{ + struct xive_cpu *xc; + int rc; + + pr_debug("Setting up IPI for CPU %d\n", cpu); + + xc = per_cpu(xive_cpu, cpu); + + /* Check if we are already setup */ + if (xc->hw_ipi != 0) + return 0; + + /* Grab an IPI from the backend, this will populate xc->hw_ipi */ + if (xive_ops->get_ipi(cpu, xc)) + return -EIO; + + /* + * Populate the IRQ data in the xive_cpu structure and + * configure the HW / enable the IPIs. + */ + rc = xive_ops->populate_irq_data(xc->hw_ipi, &xc->ipi_data); + if (rc) { + pr_err("Failed to populate IPI data on CPU %d\n", cpu); + return -EIO; + } + rc = xive_ops->configure_irq(xc->hw_ipi, + get_hard_smp_processor_id(cpu), + xive_irq_priority, xive_ipi_irq); + if (rc) { + pr_err("Failed to map IPI CPU %d\n", cpu); + return -EIO; + } + pr_devel("CPU %d HW IPI %x, virq %d, trig_mmio=%p\n", cpu, + xc->hw_ipi, xive_ipi_irq, xc->ipi_data.trig_mmio); + + /* Unmask it */ + xive_do_source_set_mask(&xc->ipi_data, false); + + return 0; +} + +static void xive_cleanup_cpu_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + /* Disable the IPI and free the IRQ data */ + + /* Already cleaned up ? */ + if (xc->hw_ipi == 0) + return; + + /* Mask the IPI */ + xive_do_source_set_mask(&xc->ipi_data, true); + + /* + * Note: We don't call xive_cleanup_irq_data() to free + * the mappings as this is called from an IPI on kexec + * which is not a safe environment to call iounmap() + */ + + /* Deconfigure/mask in the backend */ + xive_ops->configure_irq(xc->hw_ipi, hard_smp_processor_id(), + 0xff, xive_ipi_irq); + + /* Free the IPIs in the backend */ + xive_ops->put_ipi(cpu, xc); +} + +void __init xive_smp_probe(void) +{ + smp_ops->cause_ipi = xive_cause_ipi; + + /* Register the IPI */ + xive_request_ipi(); + + /* Allocate and setup IPI for the boot CPU */ + xive_setup_cpu_ipi(smp_processor_id()); +} + +#endif /* CONFIG_SMP */ + +static int xive_irq_domain_map(struct irq_domain *h, unsigned int virq, + irq_hw_number_t hw) +{ + int rc; + + /* + * Mark interrupts as edge sensitive by default so that resend + * actually works. Will fix that up below if needed. + */ + irq_clear_status_flags(virq, IRQ_LEVEL); + +#ifdef CONFIG_SMP + /* IPIs are special and come up with HW number 0 */ + if (hw == 0) { + /* + * IPIs are marked per-cpu. We use separate HW interrupts under + * the hood but associated with the same "linux" interrupt + */ + irq_set_chip_and_handler(virq, &xive_ipi_chip, + handle_percpu_irq); + return 0; + } +#endif + + rc = xive_irq_alloc_data(virq, hw); + if (rc) + return rc; + + irq_set_chip_and_handler(virq, &xive_irq_chip, handle_fasteoi_irq); + + return 0; +} + +static void xive_irq_domain_unmap(struct irq_domain *d, unsigned int virq) +{ + struct irq_data *data = irq_get_irq_data(virq); + unsigned int hw_irq; + + /* XXX Assign BAD number */ + if (!data) + return; + hw_irq = (unsigned int)irqd_to_hwirq(data); + if (hw_irq) + xive_irq_free_data(virq); +} + +static int xive_irq_domain_xlate(struct irq_domain *h, struct device_node *ct, + const u32 *intspec, unsigned int intsize, + irq_hw_number_t *out_hwirq, unsigned int *out_flags) + +{ + *out_hwirq = intspec[0]; + + /* + * If intsize is at least 2, we look for the type in the second cell, + * we assume the LSB indicates a level interrupt. + */ + if (intsize > 1) { + if (intspec[1] & 1) + *out_flags = IRQ_TYPE_LEVEL_LOW; + else + *out_flags = IRQ_TYPE_EDGE_RISING; + } else + *out_flags = IRQ_TYPE_LEVEL_LOW; + + return 0; +} + +static int xive_irq_domain_match(struct irq_domain *h, struct device_node *node, + enum irq_domain_bus_token bus_token) +{ + return xive_ops->match(node); +} + +static const struct irq_domain_ops xive_irq_domain_ops = { + .match = xive_irq_domain_match, + .map = xive_irq_domain_map, + .unmap = xive_irq_domain_unmap, + .xlate = xive_irq_domain_xlate, +}; + +static void __init xive_init_host(void) +{ + xive_irq_domain = irq_domain_add_nomap(NULL, XIVE_MAX_IRQ, + &xive_irq_domain_ops, NULL); + if (WARN_ON(xive_irq_domain == NULL)) + return; + irq_set_default_host(xive_irq_domain); +} + +static void xive_cleanup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + if (xc->queue[xive_irq_priority].qpage) + xive_ops->cleanup_queue(cpu, xc, xive_irq_priority); +} + +static int xive_setup_cpu_queues(unsigned int cpu, struct xive_cpu *xc) +{ + int rc = 0; + + /* We setup 1 queues for now with a 64k page */ + if (!xc->queue[xive_irq_priority].qpage) + rc = xive_ops->setup_queue(cpu, xc, xive_irq_priority); + + return rc; +} + +static int xive_prepare_cpu(unsigned int cpu) +{ + struct xive_cpu *xc; + + xc = per_cpu(xive_cpu, cpu); + if (!xc) { + struct device_node *np; + + xc = kzalloc_node(sizeof(struct xive_cpu), + GFP_KERNEL, cpu_to_node(cpu)); + if (!xc) + return -ENOMEM; + np = of_get_cpu_node(cpu, NULL); + if (np) + xc->chip_id = of_get_ibm_chip_id(np); + of_node_put(np); + + per_cpu(xive_cpu, cpu) = xc; + } + + /* Setup EQs if not already */ + return xive_setup_cpu_queues(cpu, xc); +} + +static void xive_setup_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + + /* Debug: Dump the TM state */ + pr_devel("CPU %d [HW 0x%02x] VT=%02x\n", + smp_processor_id(), hard_smp_processor_id(), + in_8(xive_tima + xive_tima_offset + TM_WORD2)); + + /* The backend might have additional things to do */ + if (xive_ops->setup_cpu) + xive_ops->setup_cpu(smp_processor_id(), xc); + + /* Set CPPR to 0xff to enable flow of interrupts */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +#ifdef CONFIG_SMP +void xive_smp_setup_cpu(void) +{ + pr_devel("SMP setup CPU %d\n", smp_processor_id()); + + /* This will have already been done on the boot CPU */ + if (smp_processor_id() != boot_cpuid) + xive_setup_cpu(); + +} + +int xive_smp_prepare_cpu(unsigned int cpu) +{ + int rc; + + /* Allocate per-CPU data and queues */ + rc = xive_prepare_cpu(cpu); + if (rc) + return rc; + + /* Allocate and setup IPI for the new CPU */ + return xive_setup_cpu_ipi(cpu); +} + +#ifdef CONFIG_HOTPLUG_CPU +static void xive_flush_cpu_queue(unsigned int cpu, struct xive_cpu *xc) +{ + u32 irq; + + /* We assume local irqs are disabled */ + WARN_ON(!irqs_disabled()); + + /* Check what's already in the CPU queue */ + while ((irq = xive_scan_interrupts(xc, false)) != 0) { + /* + * We need to re-route that interrupt to its new destination. + * First get and lock the descriptor + */ + struct irq_desc *desc = irq_to_desc(irq); + struct irq_data *d = irq_desc_get_irq_data(desc); + struct xive_irq_data *xd; + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + + /* + * Ignore anything that isn't a XIVE irq and ignore + * IPIs, so can just be dropped. + */ + if (d->domain != xive_irq_domain || hw_irq == 0) + continue; + + /* + * The IRQ should have already been re-routed, it's just a + * stale in the old queue, so re-trigger it in order to make + * it reach is new destination. + */ +#ifdef DEBUG_FLUSH + pr_info("CPU %d: Got irq %d while offline, re-sending...\n", + cpu, irq); +#endif + raw_spin_lock(&desc->lock); + xd = irq_desc_get_handler_data(desc); + + /* + * For LSIs, we EOI, this will cause a resend if it's + * still asserted. Otherwise do an MSI retrigger. + */ + if (xd->flags & XIVE_IRQ_FLAG_LSI) + xive_do_source_eoi(irqd_to_hwirq(d), xd); + else + xive_irq_retrigger(d); + + raw_spin_unlock(&desc->lock); + } +} + +void xive_smp_disable_cpu(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Migrate interrupts away from the CPU */ + irq_migrate_all_off_this_cpu(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Flush everything still in the queue */ + xive_flush_cpu_queue(cpu, xc); + + /* Re-enable CPPR */ + xc->cppr = 0xff; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0xff); +} + +void xive_flush_interrupt(void) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Called if an interrupt occurs while the CPU is hot unplugged */ + xive_flush_cpu_queue(cpu, xc); +} + +#endif /* CONFIG_HOTPLUG_CPU */ + +#endif /* CONFIG_SMP */ + +void xive_kexec_teardown_cpu(int secondary) +{ + struct xive_cpu *xc = __this_cpu_read(xive_cpu); + unsigned int cpu = smp_processor_id(); + + /* Set CPPR to 0 to disable flow of interrupts */ + xc->cppr = 0; + out_8(xive_tima + xive_tima_offset + TM_CPPR, 0); + + /* Backend cleanup if any */ + if (xive_ops->teardown_cpu) + xive_ops->teardown_cpu(cpu, xc); + +#ifdef CONFIG_SMP + /* Get rid of IPI */ + xive_cleanup_cpu_ipi(cpu, xc); +#endif + + /* Disable and free the queues */ + xive_cleanup_cpu_queues(cpu, xc); +} + +void xive_shutdown(void) +{ + xive_ops->shutdown(); +} + +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio) +{ + xive_tima = area; + xive_tima_offset = offset; + xive_ops = ops; + xive_irq_priority = max_prio; + + ppc_md.get_irq = xive_get_irq; + __xive_enabled = true; + + pr_devel("Initializing host..\n"); + xive_init_host(); + + pr_devel("Initializing boot CPU..\n"); + + /* Allocate per-CPU data and queues */ + xive_prepare_cpu(smp_processor_id()); + + /* Get ready for interrupts */ + xive_setup_cpu(); + + pr_info("Interrupt handling intialized with %s backend\n", + xive_ops->name); + pr_info("Using priority %d for all interrupts\n", max_prio); + + return true; +} + +static int __init xive_off(char *arg) +{ + xive_cmdline_disabled = true; + return 0; +} +__setup("xive=off", xive_off); --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xive/native.c +++ linux-azure-4.11.0/arch/powerpc/sysdev/xive/native.c @@ -0,0 +1,639 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ + +#define pr_fmt(fmt) "xive: " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "xive-internal.h" + + +static u32 xive_provision_size; +static u32 *xive_provision_chips; +static u32 xive_provision_chip_count; +static u32 xive_queue_shift; +static u32 xive_pool_vps = XIVE_INVALID_VP; +static struct kmem_cache *xive_provision_cache; + +int xive_native_populate_irq_data(u32 hw_irq, struct xive_irq_data *data) +{ + __be64 flags, eoi_page, trig_page; + __be32 esb_shift, src_chip; + u64 opal_flags; + s64 rc; + + memset(data, 0, sizeof(*data)); + + rc = opal_xive_get_irq_info(hw_irq, &flags, &eoi_page, &trig_page, + &esb_shift, &src_chip); + if (rc) { + pr_err("opal_xive_get_irq_info(0x%x) returned %lld\n", + hw_irq, rc); + return -EINVAL; + } + + opal_flags = be64_to_cpu(flags); + if (opal_flags & OPAL_XIVE_IRQ_STORE_EOI) + data->flags |= XIVE_IRQ_FLAG_STORE_EOI; + if (opal_flags & OPAL_XIVE_IRQ_LSI) + data->flags |= XIVE_IRQ_FLAG_LSI; + if (opal_flags & OPAL_XIVE_IRQ_SHIFT_BUG) + data->flags |= XIVE_IRQ_FLAG_SHIFT_BUG; + if (opal_flags & OPAL_XIVE_IRQ_MASK_VIA_FW) + data->flags |= XIVE_IRQ_FLAG_MASK_FW; + if (opal_flags & OPAL_XIVE_IRQ_EOI_VIA_FW) + data->flags |= XIVE_IRQ_FLAG_EOI_FW; + data->eoi_page = be64_to_cpu(eoi_page); + data->trig_page = be64_to_cpu(trig_page); + data->esb_shift = be32_to_cpu(esb_shift); + data->src_chip = be32_to_cpu(src_chip); + + data->eoi_mmio = ioremap(data->eoi_page, 1u << data->esb_shift); + if (!data->eoi_mmio) { + pr_err("Failed to map EOI page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + + if (!data->trig_page) + return 0; + if (data->trig_page == data->eoi_page) { + data->trig_mmio = data->eoi_mmio; + return 0; + } + + data->trig_mmio = ioremap(data->trig_page, 1u << data->esb_shift); + if (!data->trig_mmio) { + pr_err("Failed to map trigger page for irq 0x%x\n", hw_irq); + return -ENOMEM; + } + return 0; +} + +int xive_native_configure_irq(u32 hw_irq, u32 target, u8 prio, u32 sw_irq) +{ + s64 rc; + + for (;;) { + rc = opal_xive_set_irq_config(hw_irq, target, prio, sw_irq); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + return rc == 0 ? 0 : -ENXIO; +} + +/* This can be called multiple time to change a queue configuration */ +int xive_native_configure_queue(u32 vp_id, struct xive_q *q, u8 prio, + __be32 *qpage, u32 order, bool can_escalate) +{ + s64 rc = 0; + __be64 qeoi_page_be; + __be32 esc_irq_be; + u64 flags, qpage_phys; + + /* If there's an actual queue page, clean it */ + if (order) { + if (WARN_ON(!qpage)) + return -EINVAL; + qpage_phys = __pa(qpage); + } else + qpage_phys = 0; + + /* Initialize the rest of the fields */ + q->msk = order ? ((1u << (order - 2)) - 1) : 0; + q->idx = 0; + q->toggle = 0; + + rc = opal_xive_get_queue_info(vp_id, prio, NULL, NULL, + &qeoi_page_be, + &esc_irq_be, + NULL); + if (rc) { + pr_err("Error %lld getting queue info prio %d\n", rc, prio); + rc = -EIO; + goto fail; + } + q->eoi_phys = be64_to_cpu(qeoi_page_be); + + /* Default flags */ + flags = OPAL_XIVE_EQ_ALWAYS_NOTIFY | OPAL_XIVE_EQ_ENABLED; + + /* Escalation needed ? */ + if (can_escalate) { + q->esc_irq = be32_to_cpu(esc_irq_be); + flags |= OPAL_XIVE_EQ_ESCALATE; + } + + /* Configure and enable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, qpage_phys, order, flags); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) { + pr_err("Error %lld setting queue for prio %d\n", rc, prio); + rc = -EIO; + } else { + /* + * KVM code requires all of the above to be visible before + * q->qpage is set due to how it manages IPI EOIs + */ + wmb(); + q->qpage = qpage; + } +fail: + return rc; +} + +static void __xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + s64 rc; + + /* Disable the queue in HW */ + for (;;) { + rc = opal_xive_set_queue_info(vp_id, prio, 0, 0, 0); + break; + msleep(1); + } + if (rc) + pr_err("Error %lld disabling queue for prio %d\n", rc, prio); +} + +void xive_native_disable_queue(u32 vp_id, struct xive_q *q, u8 prio) +{ + __xive_native_disable_queue(vp_id, q, prio); +} + +static int xive_native_setup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + struct page *pages; + __be32 *qpage; + + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? + (xive_queue_shift - PAGE_SHIFT) : 0; + pages = alloc_pages_node(cpu_to_node(cpu), GFP_KERNEL, alloc_order); + if (!pages) + return -ENOMEM; + qpage = (__be32 *)page_address(pages); + memset(qpage, 0, 1 << xive_queue_shift); + return xive_native_configure_queue(get_hard_smp_processor_id(cpu), + q, prio, qpage, xive_queue_shift, false); +} + +static void xive_native_cleanup_queue(unsigned int cpu, struct xive_cpu *xc, u8 prio) +{ + struct xive_q *q = &xc->queue[prio]; + unsigned int alloc_order; + + /* + * We use the variant with no iounmap as this is called on exec + * from an IPI and iounmap isn't safe + */ + __xive_native_disable_queue(get_hard_smp_processor_id(cpu), q, prio); + alloc_order = (xive_queue_shift > PAGE_SHIFT) ? + (xive_queue_shift - PAGE_SHIFT) : 0; + free_pages((unsigned long)q->qpage, alloc_order); + q->qpage = NULL; +} + +static bool xive_native_match(struct device_node *node) +{ + return of_device_is_compatible(node, "ibm,opal-xive-vc"); +} + +#ifdef CONFIG_SMP +static int xive_native_get_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + struct device_node *np; + unsigned int chip_id; + s64 irq; + + /* Find the chip ID */ + np = of_get_cpu_node(cpu, NULL); + if (np) { + if (of_property_read_u32(np, "ibm,chip-id", &chip_id) < 0) + chip_id = 0; + } + + /* Allocate an IPI and populate info about it */ + for (;;) { + irq = opal_xive_allocate_irq(chip_id); + if (irq == OPAL_BUSY) { + msleep(1); + continue; + } + if (irq < 0) { + pr_err("Failed to allocate IPI on CPU %d\n", cpu); + return -ENXIO; + } + xc->hw_ipi = irq; + break; + } + return 0; +} + +u32 xive_native_alloc_irq(void) +{ + s64 rc; + + for (;;) { + rc = opal_xive_allocate_irq(OPAL_XIVE_ANY_CHIP); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc < 0) + return 0; + return rc; +} + +void xive_native_free_irq(u32 irq) +{ + for (;;) { + s64 rc = opal_xive_free_irq(irq); + if (rc != OPAL_BUSY) + break; + msleep(1); + } +} + +static void xive_native_put_ipi(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + + /* Free the IPI */ + if (!xc->hw_ipi) + return; + for (;;) { + rc = opal_xive_free_irq(xc->hw_ipi); + if (rc == OPAL_BUSY) { + msleep(1); + continue; + } + xc->hw_ipi = 0; + break; + } +} +#endif /* CONFIG_SMP */ + +static void xive_native_shutdown(void) +{ + /* Switch the XIVE to emulation mode */ + opal_xive_reset(OPAL_XIVE_MODE_EMU); +} + +/* + * Perform an "ack" cycle on the current thread, thus + * grabbing the pending active priorities and updating + * the CPPR to the most favored one. + */ +static void xive_native_update_pending(struct xive_cpu *xc) +{ + u8 he, cppr; + u16 ack; + + /* Perform the acknowledge hypervisor to register cycle */ + ack = be16_to_cpu(__raw_readw(xive_tima + TM_SPC_ACK_HV_REG)); + + /* Synchronize subsequent queue accesses */ + mb(); + + /* + * Grab the CPPR and the "HE" field which indicates the source + * of the hypervisor interrupt (if any) + */ + cppr = ack & 0xff; + he = GETFIELD(TM_QW3_NSR_HE, (ack >> 8)); + switch(he) { + case TM_QW3_NSR_HE_NONE: /* Nothing to see here */ + break; + case TM_QW3_NSR_HE_PHYS: /* Physical thread interrupt */ + if (cppr == 0xff) + return; + /* Mark the priority pending */ + xc->pending_prio |= 1 << cppr; + + /* + * A new interrupt should never have a CPPR less favored + * than our current one. + */ + if (cppr >= xc->cppr) + pr_err("CPU %d odd ack CPPR, got %d at %d\n", + smp_processor_id(), cppr, xc->cppr); + + /* Update our idea of what the CPPR is */ + xc->cppr = cppr; + break; + case TM_QW3_NSR_HE_POOL: /* HV Pool interrupt (unused) */ + case TM_QW3_NSR_HE_LSI: /* Legacy FW LSI (unused) */ + pr_err("CPU %d got unexpected interrupt type HE=%d\n", + smp_processor_id(), he); + return; + } +} + +static void xive_native_eoi(u32 hw_irq) +{ + /* + * Not normally used except if specific interrupts need + * a workaround on EOI. + */ + opal_int_eoi(hw_irq); +} + +static void xive_native_setup_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + __be64 vp_cam_be; + u64 vp_cam; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Enable the pool VP */ + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + pr_debug("CPU %d setting up pool VP 0x%x\n", cpu, vp); + for (;;) { + rc = opal_xive_set_vp_info(vp, OPAL_XIVE_VP_ENABLED, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } + if (rc) { + pr_err("Failed to enable pool VP on CPU %d\n", cpu); + return; + } + + /* Grab it's CAM value */ + rc = opal_xive_get_vp_info(vp, NULL, &vp_cam_be, NULL, NULL); + if (rc) { + pr_err("Failed to get pool VP info CPU %d\n", cpu); + return; + } + vp_cam = be64_to_cpu(vp_cam_be); + + pr_debug("VP CAM = %llx\n", vp_cam); + + /* Push it on the CPU (set LSMFB to 0xff to skip backlog scan) */ + pr_debug("(Old HW value: %08x)\n", + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD0, 0xff); + out_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2, + TM_QW2W2_VP | vp_cam); + pr_debug("(New HW value: %08x)\n", + in_be32(xive_tima + TM_QW2_HV_POOL + TM_WORD2)); +} + +static void xive_native_teardown_cpu(unsigned int cpu, struct xive_cpu *xc) +{ + s64 rc; + u32 vp; + + if (xive_pool_vps == XIVE_INVALID_VP) + return; + + /* Pull the pool VP from the CPU */ + in_be64(xive_tima + TM_SPC_PULL_POOL_CTX); + + /* Disable it */ + vp = xive_pool_vps + get_hard_smp_processor_id(cpu); + for (;;) { + rc = opal_xive_set_vp_info(vp, 0, 0); + if (rc != OPAL_BUSY) + break; + msleep(1); + } +} + +static void xive_native_sync_source(u32 hw_irq) +{ + opal_xive_sync(XIVE_SYNC_EAS, hw_irq); +} + +static const struct xive_ops xive_native_ops = { + .populate_irq_data = xive_native_populate_irq_data, + .configure_irq = xive_native_configure_irq, + .setup_queue = xive_native_setup_queue, + .cleanup_queue = xive_native_cleanup_queue, + .match = xive_native_match, + .shutdown = xive_native_shutdown, + .update_pending = xive_native_update_pending, + .eoi = xive_native_eoi, + .setup_cpu = xive_native_setup_cpu, + .teardown_cpu = xive_native_teardown_cpu, + .sync_source = xive_native_sync_source, +#ifdef CONFIG_SMP + .get_ipi = xive_native_get_ipi, + .put_ipi = xive_native_put_ipi, +#endif /* CONFIG_SMP */ + .name = "native", +}; + +static bool xive_parse_provisioning(struct device_node *np) +{ + int rc; + + if (of_property_read_u32(np, "ibm,xive-provision-page-size", + &xive_provision_size) < 0) + return true; + rc = of_property_count_elems_of_size(np, "ibm,xive-provision-chips", 4); + if (rc < 0) { + pr_err("Error %d getting provision chips array\n", rc); + return false; + } + xive_provision_chip_count = rc; + if (rc == 0) + return true; + + xive_provision_chips = kzalloc(4 * xive_provision_chip_count, + GFP_KERNEL); + if (WARN_ON(!xive_provision_chips)) + return false; + + rc = of_property_read_u32_array(np, "ibm,xive-provision-chips", + xive_provision_chips, + xive_provision_chip_count); + if (rc < 0) { + pr_err("Error %d reading provision chips array\n", rc); + return false; + } + + xive_provision_cache = kmem_cache_create("xive-provision", + xive_provision_size, + xive_provision_size, + 0, NULL); + if (!xive_provision_cache) { + pr_err("Failed to allocate provision cache\n"); + return false; + } + return true; +} + +u32 xive_native_default_eq_shift(void) +{ + return xive_queue_shift; +} + +bool xive_native_init(void) +{ + struct device_node *np; + struct resource r; + void __iomem *tima; + struct property *prop; + u8 max_prio = 7; + const __be32 *p; + u32 val; + s64 rc; + + if (xive_cmdline_disabled) + return false; + + pr_devel("xive_native_init()\n"); + np = of_find_compatible_node(NULL, NULL, "ibm,opal-xive-pe"); + if (!np) { + pr_devel("not found !\n"); + return false; + } + pr_devel("Found %s\n", np->full_name); + + /* Resource 1 is HV window */ + if (of_address_to_resource(np, 1, &r)) { + pr_err("Failed to get thread mgmnt area resource\n"); + return false; + } + tima = ioremap(r.start, resource_size(&r)); + if (!tima) { + pr_err("Failed to map thread mgmnt area\n"); + return false; + } + + /* Read number of priorities */ + if (of_property_read_u32(np, "ibm,xive-#priorities", &val) == 0) + max_prio = val - 1; + + /* Iterate the EQ sizes and pick one */ + of_property_for_each_u32(np, "ibm,xive-eq-sizes", prop, p, val) { + xive_queue_shift = val; + if (val == PAGE_SHIFT) + break; + } + + /* Grab size of provisioning pages */ + xive_parse_provisioning(np); + + /* Switch the XIVE to exploitation mode */ + rc = opal_xive_reset(OPAL_XIVE_MODE_EXPL); + if (rc) { + pr_err("Switch to exploitation mode failed with error %lld\n", rc); + return false; + } + + /* Initialize XIVE core with our backend */ + if (!xive_core_init(&xive_native_ops, tima, TM_QW3_HV_PHYS, + max_prio)) { + opal_xive_reset(OPAL_XIVE_MODE_EMU); + return false; + } + pr_info("Using %dkB queues\n", 1 << (xive_queue_shift - 10)); + return true; +} + +static bool xive_native_provision_pages(void) +{ + u32 i; + void *p; + + for (i = 0; i < xive_provision_chip_count; i++) { + u32 chip = xive_provision_chips[i]; + + /* + * XXX TODO: Try to make the allocation local to the node where + * the chip resides. + */ + p = kmem_cache_alloc(xive_provision_cache, GFP_KERNEL); + if (!p) { + pr_err("Failed to allocate provisioning page\n"); + return false; + } + opal_xive_donate_page(chip, __pa(p)); + } + return true; +} + +u32 xive_native_alloc_vp_block(u32 max_vcpus) +{ + s64 rc; + u32 order; + + order = fls(max_vcpus) - 1; + if (max_vcpus > (1 << order)) + order++; + + pr_info("VP block alloc, for max VCPUs %d use order %d\n", + max_vcpus, order); + + for (;;) { + rc = opal_xive_alloc_vp_block(order); + switch (rc) { + case OPAL_BUSY: + msleep(1); + break; + case OPAL_XIVE_PROVISIONING: + if (!xive_native_provision_pages()) + return XIVE_INVALID_VP; + break; + default: + if (rc < 0) { + pr_err("OPAL failed to allocate VCPUs order %d, err %lld\n", + order, rc); + return XIVE_INVALID_VP; + } + return rc; + } + } +} +EXPORT_SYMBOL_GPL(xive_native_alloc_vp_block); + +void xive_native_free_vp_block(u32 vp_base) +{ + s64 rc; + + if (vp_base == XIVE_INVALID_VP) + return; + + rc = opal_xive_free_vp_block(vp_base); + if (rc < 0) + pr_warn("OPAL error %lld freeing VP block\n", rc); +} +EXPORT_SYMBOL_GPL(xive_native_free_vp_block); --- linux-azure-4.11.0.orig/arch/powerpc/sysdev/xive/xive-internal.h +++ linux-azure-4.11.0/arch/powerpc/sysdev/xive/xive-internal.h @@ -0,0 +1,62 @@ +/* + * Copyright 2016,2017 IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#ifndef __XIVE_INTERNAL_H +#define __XIVE_INTERNAL_H + +/* Each CPU carry one of these with various per-CPU state */ +struct xive_cpu { +#ifdef CONFIG_SMP + /* HW irq number and data of IPI */ + u32 hw_ipi; + struct xive_irq_data ipi_data; +#endif /* CONFIG_SMP */ + + int chip_id; + + /* Queue datas. Only one is populated */ +#define XIVE_MAX_QUEUES 8 + struct xive_q queue[XIVE_MAX_QUEUES]; + + /* + * Pending mask. Each bit corresponds to a priority that + * potentially has pending interrupts. + */ + u8 pending_prio; + + /* Cache of HW CPPR */ + u8 cppr; +}; + +/* Backend ops */ +struct xive_ops { + int (*populate_irq_data)(u32 hw_irq, struct xive_irq_data *data); + int (*configure_irq)(u32 hw_irq, u32 target, u8 prio, u32 sw_irq); + int (*setup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*cleanup_queue)(unsigned int cpu, struct xive_cpu *xc, u8 prio); + void (*setup_cpu)(unsigned int cpu, struct xive_cpu *xc); + void (*teardown_cpu)(unsigned int cpu, struct xive_cpu *xc); + bool (*match)(struct device_node *np); + void (*shutdown)(void); + + void (*update_pending)(struct xive_cpu *xc); + void (*eoi)(u32 hw_irq); + void (*sync_source)(u32 hw_irq); +#ifdef CONFIG_SMP + int (*get_ipi)(unsigned int cpu, struct xive_cpu *xc); + void (*put_ipi)(unsigned int cpu, struct xive_cpu *xc); +#endif + const char *name; +}; + +bool xive_core_init(const struct xive_ops *ops, void __iomem *area, u32 offset, + u8 max_prio); + +extern bool xive_cmdline_disabled; + +#endif /* __XIVE_INTERNAL_H */ --- linux-azure-4.11.0.orig/arch/powerpc/xmon/xmon.c +++ linux-azure-4.11.0/arch/powerpc/xmon/xmon.c @@ -30,6 +30,7 @@ #include #include +#include #include #include #include @@ -48,7 +49,7 @@ #include #include #include - +#include #include #include @@ -232,7 +233,13 @@ "\ dr dump stream of raw bytes\n\ dt dump the tracing buffers (uses printk)\n\ - e print exception information\n\ +" +#ifdef CONFIG_PPC_POWERNV +" dx# dump xive on CPU #\n\ + dxi# dump xive irq state #\n\ + dxa dump xive on all CPUs\n" +#endif +" e print exception information\n\ f flush cache\n\ la lookup symbol+offset of specified address\n\ ls lookup address of specified symbol\n\ @@ -2338,6 +2345,81 @@ } #endif +#ifdef CONFIG_PPC_POWERNV +static void dump_one_xive(int cpu) +{ + unsigned int hwid = get_hard_smp_processor_id(cpu); + + opal_xive_dump(XIVE_DUMP_TM_HYP, hwid); + opal_xive_dump(XIVE_DUMP_TM_POOL, hwid); + opal_xive_dump(XIVE_DUMP_TM_OS, hwid); + opal_xive_dump(XIVE_DUMP_TM_USER, hwid); + opal_xive_dump(XIVE_DUMP_VP, hwid); + opal_xive_dump(XIVE_DUMP_EMU_STATE, hwid); + + if (setjmp(bus_error_jmp) != 0) { + catch_memory_errors = 0; + printf("*** Error dumping xive on cpu %d\n", cpu); + return; + } + + catch_memory_errors = 1; + sync(); + xmon_xive_do_dump(cpu); + sync(); + __delay(200); + catch_memory_errors = 0; +} + +static void dump_all_xives(void) +{ + int cpu; + + if (num_possible_cpus() == 0) { + printf("No possible cpus, use 'dx #' to dump individual cpus\n"); + return; + } + + for_each_possible_cpu(cpu) + dump_one_xive(cpu); +} + +static void dump_one_xive_irq(u32 num) +{ + s64 rc; + __be64 vp; + u8 prio; + __be32 lirq; + + rc = opal_xive_get_irq_config(num, &vp, &prio, &lirq); + xmon_printf("IRQ 0x%x config: vp=0x%llx prio=%d lirq=0x%x (rc=%lld)\n", + num, be64_to_cpu(vp), prio, be32_to_cpu(lirq), rc); +} + +static void dump_xives(void) +{ + unsigned long num; + int c; + + c = inchar(); + if (c == 'a') { + dump_all_xives(); + return; + } else if (c == 'i') { + if (scanhex(&num)) + dump_one_xive_irq(num); + return; + } + + termch = c; /* Put c back, it wasn't 'a' */ + + if (scanhex(&num)) + dump_one_xive(num); + else + dump_one_xive(xmon_owner); +} +#endif /* CONFIG_PPC_POWERNV */ + static void dump_by_size(unsigned long addr, long count, int size) { unsigned char temp[16]; @@ -2385,6 +2467,14 @@ xmon_end_pagination(); return; } +#endif +#ifdef CONFIG_PPC_POWERNV + if (c == 'x') { + xmon_start_pagination(); + dump_xives(); + xmon_end_pagination(); + return; + } #endif if (c == '\n') --- linux-azure-4.11.0.orig/arch/s390/Kconfig +++ linux-azure-4.11.0/arch/s390/Kconfig @@ -893,3 +893,11 @@ need this. endmenu + +config KMSG_IDS + def_bool y + prompt "Kernel message numbers" + help + Select this option if you want to include a message number to the + prefix for kernel messages issued by the s390 architecture and + driver code. See "Documentation/s390/kmsg.txt" for more details. --- linux-azure-4.11.0.orig/arch/s390/include/asm/elf.h +++ linux-azure-4.11.0/arch/s390/include/asm/elf.h @@ -160,14 +160,13 @@ #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. 64-bit - tasks are aligned to 4GB. */ -#define ELF_ET_DYN_BASE (is_compat_task() ? \ - (STACK_TOP / 3 * 2) : \ - (STACK_TOP / 3 * 2) & ~((1UL << 32) - 1)) +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (is_compat_task() ? 0x000400000UL : \ + 0x100000000UL) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. */ --- linux-azure-4.11.0.orig/arch/s390/kernel/Makefile +++ linux-azure-4.11.0/arch/s390/kernel/Makefile @@ -85,3 +85,6 @@ # vdso obj-y += vdso64/ obj-$(CONFIG_COMPAT) += vdso32/ + +# kernel message catalog +obj-$(CONFIG_KMSG_IDS) += kmsg.o --- linux-azure-4.11.0.orig/arch/s390/kernel/crash_dump.c +++ linux-azure-4.11.0/arch/s390/kernel/crash_dump.c @@ -429,6 +429,20 @@ } /* + * Initialize final note (needed for /proc/vmcore code) + */ +static void *nt_final(void *ptr) +{ + Elf64_Nhdr *note; + + note = (Elf64_Nhdr *) ptr; + note->n_namesz = 0; + note->n_descsz = 0; + note->n_type = 0; + return PTR_ADD(ptr, sizeof(Elf64_Nhdr)); +} + +/* * Initialize ELF header (new kernel) */ static void *ehdr_init(Elf64_Ehdr *ehdr, int mem_chunk_cnt) @@ -515,6 +529,7 @@ if (sa->prefix != 0) ptr = fill_cpu_elf_notes(ptr, cpu++, sa); ptr = nt_vmcoreinfo(ptr); + ptr = nt_final(ptr); memset(phdr, 0, sizeof(*phdr)); phdr->p_type = PT_NOTE; phdr->p_offset = notes_offset; --- linux-azure-4.11.0.orig/arch/s390/kernel/entry.S +++ linux-azure-4.11.0/arch/s390/kernel/entry.S @@ -233,12 +233,17 @@ lctlg %c1,%c1,__LC_USER_ASCE # load primary asce .Lsie_done: # some program checks are suppressing. C code (e.g. do_protection_exception) -# will rewind the PSW by the ILC, which is 4 bytes in case of SIE. Other -# instructions between sie64a and .Lsie_done should not cause program -# interrupts. So lets use a nop (47 00 00 00) as a landing pad. +# will rewind the PSW by the ILC, which is often 4 bytes in case of SIE. There +# are some corner cases (e.g. runtime instrumentation) where ILC is unpredictable. +# Other instructions between sie64a and .Lsie_done should not cause program +# interrupts. So lets use 3 nops as a landing pad for all possible rewinds. # See also .Lcleanup_sie -.Lrewind_pad: - nop 0 +.Lrewind_pad6: + nopr 7 +.Lrewind_pad4: + nopr 7 +.Lrewind_pad2: + nopr 7 .globl sie_exit sie_exit: lg %r14,__SF_EMPTY+8(%r15) # load guest register save area @@ -251,7 +256,9 @@ stg %r14,__SF_EMPTY+16(%r15) # set exit reason code j sie_exit - EX_TABLE(.Lrewind_pad,.Lsie_fault) + EX_TABLE(.Lrewind_pad6,.Lsie_fault) + EX_TABLE(.Lrewind_pad4,.Lsie_fault) + EX_TABLE(.Lrewind_pad2,.Lsie_fault) EX_TABLE(sie_exit,.Lsie_fault) EXPORT_SYMBOL(sie64a) EXPORT_SYMBOL(sie_exit) @@ -314,6 +321,7 @@ lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) +.Lsysc_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r11,%r15,__PT_R11(%r11) @@ -601,6 +609,7 @@ lg %r14,__LC_VDSO_PER_CPU lmg %r0,%r10,__PT_R0(%r11) mvc __LC_RETURN_PSW(16),__PT_PSW(%r11) +.Lio_exit_timer: stpt __LC_EXIT_TIMER mvc __VDSO_ECTG_BASE(16,%r14),__LC_EXIT_TIMER lmg %r11,%r15,__PT_R11(%r11) @@ -1124,15 +1133,23 @@ br %r14 .Lcleanup_sysc_restore: + # check if stpt has been executed clg %r9,BASED(.Lcleanup_sysc_restore_insn) + jh 0f + mvc __LC_EXIT_TIMER(8),__LC_ASYNC_ENTER_TIMER + cghi %r11,__LC_SAVE_AREA_ASYNC je 0f + mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER +0: clg %r9,BASED(.Lcleanup_sysc_restore_insn+8) + je 1f lg %r9,24(%r11) # get saved pointer to pt_regs mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) -0: lmg %r8,%r9,__LC_RETURN_PSW +1: lmg %r8,%r9,__LC_RETURN_PSW br %r14 .Lcleanup_sysc_restore_insn: + .quad .Lsysc_exit_timer .quad .Lsysc_done - 4 .Lcleanup_io_tif: @@ -1140,15 +1157,20 @@ br %r14 .Lcleanup_io_restore: + # check if stpt has been executed clg %r9,BASED(.Lcleanup_io_restore_insn) - je 0f + jh 0f + mvc __LC_EXIT_TIMER(8),__LC_MCCK_ENTER_TIMER +0: clg %r9,BASED(.Lcleanup_io_restore_insn+8) + je 1f lg %r9,24(%r11) # get saved r11 pointer to pt_regs mvc __LC_RETURN_PSW(16),__PT_PSW(%r9) mvc 0(64,%r11),__PT_R8(%r9) lmg %r0,%r7,__PT_R0(%r9) -0: lmg %r8,%r9,__LC_RETURN_PSW +1: lmg %r8,%r9,__LC_RETURN_PSW br %r14 .Lcleanup_io_restore_insn: + .quad .Lio_exit_timer .quad .Lio_done - 4 .Lcleanup_idle: --- linux-azure-4.11.0.orig/arch/s390/kernel/kmsg.c +++ linux-azure-4.11.0/arch/s390/kernel/kmsg.c @@ -0,0 +1,114 @@ +/* + * Message printing with message catalog prefixes. + * + * Copyright IBM Corp. 2012 + */ + +#include +#include +#include +#include +#include + +static inline u32 __printk_jhash(const void *key, u32 length) +{ + u32 a, b, c, len; + const u8 *k; + u8 zk[12]; + + a = b = 0x9e3779b9; + c = 0; + for (len = length + 12, k = key; len >= 12; len -= 12, k += 12) { + if (len >= 24) { + a += k[0] | k[1] << 8 | k[2] << 16 | k[3] << 24; + b += k[4] | k[5] << 8 | k[6] << 16 | k[7] << 24; + c += k[8] | k[9] << 8 | k[10] << 16 | k[11] << 24; + } else { + memset(zk, 0, 12); + memcpy(zk, k, len - 12); + a += zk[0] | zk[1] << 8 | zk[2] << 16 | zk[3] << 24; + b += zk[4] | zk[5] << 8 | zk[6] << 16 | zk[7] << 24; + c += (u32) zk[8] << 8; + c += (u32) zk[9] << 16; + c += (u32) zk[10] << 24; + c += length; + } + a -= b + c; a ^= (c>>13); + b -= a + c; b ^= (a<<8); + c -= a + b; c ^= (b>>13); + a -= b + c; a ^= (c>>12); + b -= a + c; b ^= (a<<16); + c -= a + b; c ^= (b>>5); + a -= b + c; a ^= (c>>3); + b -= a + c; b ^= (a<<10); + c -= a + b; c ^= (b>>15); + } + return c; +} + +/** + * __jhash_string - calculate the six digit jhash of a string + * @str: string to calculate the jhash + */ +unsigned long long __jhash_string(const char *str) +{ + return __printk_jhash(str, strlen(str)) & 0xffffff; +} +EXPORT_SYMBOL(__jhash_string); + +static int __dev_printk_hash(const char *level, const struct device *dev, + struct va_format *vaf) +{ + if (!dev) + return printk("%s(NULL device *): %pV", level, vaf); + + return printk("%s%s.%06x: %pV", level, dev_driver_string(dev), + __printk_jhash(vaf->fmt, strlen(vaf->fmt)) & 0xffffff, + vaf); +} + +int dev_printk_hash(const char *level, const struct device *dev, + const char *fmt, ...) +{ + struct va_format vaf; + va_list args; + int r; + + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + r = __dev_printk_hash(level, dev, &vaf); + va_end(args); + + return r; +} +EXPORT_SYMBOL(dev_printk_hash); + +#define define_dev_printk_hash_level(func, kern_level) \ +int func(const struct device *dev, const char *fmt, ...) \ +{ \ + struct va_format vaf; \ + va_list args; \ + int r; \ + \ + va_start(args, fmt); \ + \ + vaf.fmt = fmt; \ + vaf.va = &args; \ + \ + r = __dev_printk_hash(kern_level, dev, &vaf); \ + va_end(args); \ + \ + return r; \ +} \ +EXPORT_SYMBOL(func); + +define_dev_printk_hash_level(dev_emerg_hash, KERN_EMERG); +define_dev_printk_hash_level(dev_alert_hash, KERN_ALERT); +define_dev_printk_hash_level(dev_crit_hash, KERN_CRIT); +define_dev_printk_hash_level(dev_err_hash, KERN_ERR); +define_dev_printk_hash_level(dev_warn_hash, KERN_WARNING); +define_dev_printk_hash_level(dev_notice_hash, KERN_NOTICE); +define_dev_printk_hash_level(_dev_info_hash, KERN_INFO); --- linux-azure-4.11.0.orig/arch/s390/kvm/gaccess.c +++ linux-azure-4.11.0/arch/s390/kvm/gaccess.c @@ -977,11 +977,12 @@ ptr = asce.origin * 4096; if (asce.r) { *fake = 1; + ptr = 0; asce.dt = ASCE_TYPE_REGION1; } switch (asce.dt) { case ASCE_TYPE_REGION1: - if (vaddr.rfx01 > asce.tl && !asce.r) + if (vaddr.rfx01 > asce.tl && !*fake) return PGM_REGION_FIRST_TRANS; break; case ASCE_TYPE_REGION2: @@ -1009,8 +1010,7 @@ union region1_table_entry rfte; if (*fake) { - /* offset in 16EB guest memory block */ - ptr = ptr + ((unsigned long) vaddr.rsx << 53UL); + ptr += (unsigned long) vaddr.rfx << 53; rfte.val = ptr; goto shadow_r2t; } @@ -1036,8 +1036,7 @@ union region2_table_entry rste; if (*fake) { - /* offset in 8PB guest memory block */ - ptr = ptr + ((unsigned long) vaddr.rtx << 42UL); + ptr += (unsigned long) vaddr.rsx << 42; rste.val = ptr; goto shadow_r3t; } @@ -1064,8 +1063,7 @@ union region3_table_entry rtte; if (*fake) { - /* offset in 4TB guest memory block */ - ptr = ptr + ((unsigned long) vaddr.sx << 31UL); + ptr += (unsigned long) vaddr.rtx << 31; rtte.val = ptr; goto shadow_sgt; } @@ -1101,8 +1099,7 @@ union segment_table_entry ste; if (*fake) { - /* offset in 2G guest memory block */ - ptr = ptr + ((unsigned long) vaddr.sx << 20UL); + ptr += (unsigned long) vaddr.sx << 20; ste.val = ptr; goto shadow_pgt; } --- linux-azure-4.11.0.orig/arch/s390/mm/mmap.c +++ linux-azure-4.11.0/arch/s390/mm/mmap.c @@ -100,7 +100,7 @@ addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -138,7 +138,7 @@ addr = PAGE_ALIGN(addr); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && addr >= mmap_min_addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/sh/mm/mmap.c +++ linux-azure-4.11.0/arch/sh/mm/mmap.c @@ -64,7 +64,7 @@ vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -114,7 +114,7 @@ vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/sparc/Kconfig +++ linux-azure-4.11.0/arch/sparc/Kconfig @@ -192,9 +192,9 @@ int "Maximum number of CPUs" depends on SMP range 2 32 if SPARC32 - range 2 1024 if SPARC64 + range 2 4096 if SPARC64 default 32 if SPARC32 - default 64 if SPARC64 + default 4096 if SPARC64 source kernel/Kconfig.hz --- linux-azure-4.11.0.orig/arch/sparc/include/asm/asm-prototypes.h +++ linux-azure-4.11.0/arch/sparc/include/asm/asm-prototypes.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2017 Oracle and/or its affiliates. All rights reserved. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +void *__memscan_zero(void *, size_t); +void *__memscan_generic(void *, int, size_t); +void *__bzero(void *, size_t); +void VISenter(void); /* Dummy prototype to supress warning */ +#undef memcpy +#undef memset +void *memcpy(void *dest, const void *src, size_t n); +void *memset(void *s, int c, size_t n); +typedef int TItype __attribute__((mode(TI))); +TItype __multi3(TItype a, TItype b); --- linux-azure-4.11.0.orig/arch/sparc/include/asm/hugetlb.h +++ linux-azure-4.11.0/arch/sparc/include/asm/hugetlb.h @@ -24,9 +24,11 @@ static inline int prepare_hugepage_range(struct file *file, unsigned long addr, unsigned long len) { - if (len & ~HPAGE_MASK) + struct hstate *h = hstate_file(file); + + if (len & ~huge_page_mask(h)) return -EINVAL; - if (addr & ~HPAGE_MASK) + if (addr & ~huge_page_mask(h)) return -EINVAL; return 0; } --- linux-azure-4.11.0.orig/arch/sparc/include/asm/mmu_64.h +++ linux-azure-4.11.0/arch/sparc/include/asm/mmu_64.h @@ -52,7 +52,7 @@ #define CTX_NR_MASK TAG_CONTEXT_BITS #define CTX_HW_MASK (CTX_NR_MASK | CTX_PGSZ_MASK) -#define CTX_FIRST_VERSION ((_AC(1,UL) << CTX_VERSION_SHIFT) + _AC(1,UL)) +#define CTX_FIRST_VERSION BIT(CTX_VERSION_SHIFT) #define CTX_VALID(__ctx) \ (!(((__ctx.sparc64_ctx_val) ^ tlb_context_cache) & CTX_VERSION_MASK)) #define CTX_HWBITS(__ctx) ((__ctx.sparc64_ctx_val) & CTX_HW_MASK) --- linux-azure-4.11.0.orig/arch/sparc/include/asm/mmu_context_64.h +++ linux-azure-4.11.0/arch/sparc/include/asm/mmu_context_64.h @@ -19,13 +19,8 @@ extern unsigned long tlb_context_cache; extern unsigned long mmu_context_bmap[]; +DECLARE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm); void get_new_mmu_context(struct mm_struct *mm); -#ifdef CONFIG_SMP -void smp_new_mmu_context_version(void); -#else -#define smp_new_mmu_context_version() do { } while (0) -#endif - int init_new_context(struct task_struct *tsk, struct mm_struct *mm); void destroy_context(struct mm_struct *mm); @@ -76,8 +71,9 @@ static inline void switch_mm(struct mm_struct *old_mm, struct mm_struct *mm, struct task_struct *tsk) { unsigned long ctx_valid, flags; - int cpu; + int cpu = smp_processor_id(); + per_cpu(per_cpu_secondary_mm, cpu) = mm; if (unlikely(mm == &init_mm)) return; @@ -123,7 +119,6 @@ * for the first time, we must flush that context out of the * local TLB. */ - cpu = smp_processor_id(); if (!ctx_valid || !cpumask_test_cpu(cpu, mm_cpumask(mm))) { cpumask_set_cpu(cpu, mm_cpumask(mm)); __flush_tlb_mm(CTX_HWBITS(mm->context), @@ -133,26 +128,7 @@ } #define deactivate_mm(tsk,mm) do { } while (0) - -/* Activate a new MM instance for the current task. */ -static inline void activate_mm(struct mm_struct *active_mm, struct mm_struct *mm) -{ - unsigned long flags; - int cpu; - - spin_lock_irqsave(&mm->context.lock, flags); - if (!CTX_VALID(mm->context)) - get_new_mmu_context(mm); - cpu = smp_processor_id(); - if (!cpumask_test_cpu(cpu, mm_cpumask(mm))) - cpumask_set_cpu(cpu, mm_cpumask(mm)); - - load_secondary_context(mm); - __flush_tlb_mm(CTX_HWBITS(mm->context), SECONDARY_CONTEXT); - tsb_context_switch(mm); - spin_unlock_irqrestore(&mm->context.lock, flags); -} - +#define activate_mm(active_mm, mm) switch_mm(active_mm, mm, NULL) #endif /* !(__ASSEMBLY__) */ #endif /* !(__SPARC64_MMU_CONTEXT_H) */ --- linux-azure-4.11.0.orig/arch/sparc/include/asm/pgtable_32.h +++ linux-azure-4.11.0/arch/sparc/include/asm/pgtable_32.h @@ -91,9 +91,9 @@ * ZERO_PAGE is a global shared page that is always zero: used * for zero-mapped memory areas etc.. */ -extern unsigned long empty_zero_page; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; -#define ZERO_PAGE(vaddr) (virt_to_page(&empty_zero_page)) +#define ZERO_PAGE(vaddr) (virt_to_page(empty_zero_page)) /* * In general all page table modifications should use the V8 atomic --- linux-azure-4.11.0.orig/arch/sparc/include/asm/pil.h +++ linux-azure-4.11.0/arch/sparc/include/asm/pil.h @@ -20,7 +20,6 @@ #define PIL_SMP_CALL_FUNC 1 #define PIL_SMP_RECEIVE_SIGNAL 2 #define PIL_SMP_CAPTURE 3 -#define PIL_SMP_CTX_NEW_VERSION 4 #define PIL_DEVICE_IRQ 5 #define PIL_SMP_CALL_FUNC_SNGL 6 #define PIL_DEFERRED_PCR_WORK 7 --- linux-azure-4.11.0.orig/arch/sparc/include/asm/setup.h +++ linux-azure-4.11.0/arch/sparc/include/asm/setup.h @@ -16,7 +16,7 @@ */ extern unsigned char boot_cpu_id; -extern unsigned long empty_zero_page; +extern unsigned long empty_zero_page[PAGE_SIZE / sizeof(unsigned long)]; extern int serial_console; static inline int con_is_present(void) --- linux-azure-4.11.0.orig/arch/sparc/include/asm/vio.h +++ linux-azure-4.11.0/arch/sparc/include/asm/vio.h @@ -327,6 +327,7 @@ int compat_len; u64 dev_no; + u64 id; unsigned long channel_id; --- linux-azure-4.11.0.orig/arch/sparc/kernel/ftrace.c +++ linux-azure-4.11.0/arch/sparc/kernel/ftrace.c @@ -130,17 +130,16 @@ if (unlikely(atomic_read(¤t->tracing_graph_pause))) return parent + 8UL; - if (ftrace_push_return_trace(parent, self_addr, &trace.depth, - frame_pointer, NULL) == -EBUSY) - return parent + 8UL; - trace.func = self_addr; + trace.depth = current->curr_ret_stack + 1; /* Only trace if the calling function expects to */ - if (!ftrace_graph_entry(&trace)) { - current->curr_ret_stack--; + if (!ftrace_graph_entry(&trace)) + return parent + 8UL; + + if (ftrace_push_return_trace(parent, self_addr, &trace.depth, + frame_pointer, NULL) == -EBUSY) return parent + 8UL; - } return return_hooker; } --- linux-azure-4.11.0.orig/arch/sparc/kernel/head_64.S +++ linux-azure-4.11.0/arch/sparc/kernel/head_64.S @@ -939,3 +939,9 @@ retl mov %o1, %o0 ENDPROC(__retl_o1) + +ENTRY(__retl_o1_asi) + wr %o5, 0x0, %asi + retl + mov %o1, %o0 +ENDPROC(__retl_o1_asi) --- linux-azure-4.11.0.orig/arch/sparc/kernel/irq_64.c +++ linux-azure-4.11.0/arch/sparc/kernel/irq_64.c @@ -1034,17 +1034,26 @@ { #ifdef CONFIG_SMP unsigned long page; + void *mondo, *p; - BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > (PAGE_SIZE - 64)); + BUILD_BUG_ON((NR_CPUS * sizeof(u16)) > PAGE_SIZE); + + /* Make sure mondo block is 64byte aligned */ + p = kzalloc(127, GFP_KERNEL); + if (!p) { + prom_printf("SUN4V: Error, cannot allocate mondo block.\n"); + prom_halt(); + } + mondo = (void *)(((unsigned long)p + 63) & ~0x3f); + tb->cpu_mondo_block_pa = __pa(mondo); page = get_zeroed_page(GFP_KERNEL); if (!page) { - prom_printf("SUN4V: Error, cannot allocate cpu mondo page.\n"); + prom_printf("SUN4V: Error, cannot allocate cpu list page.\n"); prom_halt(); } - tb->cpu_mondo_block_pa = __pa(page); - tb->cpu_list_pa = __pa(page + 64); + tb->cpu_list_pa = __pa(page); #endif } --- linux-azure-4.11.0.orig/arch/sparc/kernel/kernel.h +++ linux-azure-4.11.0/arch/sparc/kernel/kernel.h @@ -37,7 +37,6 @@ /* smp_64.c */ void __irq_entry smp_call_function_client(int irq, struct pt_regs *regs); void __irq_entry smp_call_function_single_client(int irq, struct pt_regs *regs); -void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *regs); void __irq_entry smp_penguin_jailcell(int irq, struct pt_regs *regs); void __irq_entry smp_receive_signal_client(int irq, struct pt_regs *regs); --- linux-azure-4.11.0.orig/arch/sparc/kernel/smp_64.c +++ linux-azure-4.11.0/arch/sparc/kernel/smp_64.c @@ -964,37 +964,6 @@ preempt_enable(); } -void __irq_entry smp_new_mmu_context_version_client(int irq, struct pt_regs *regs) -{ - struct mm_struct *mm; - unsigned long flags; - - clear_softint(1 << irq); - - /* See if we need to allocate a new TLB context because - * the version of the one we are using is now out of date. - */ - mm = current->active_mm; - if (unlikely(!mm || (mm == &init_mm))) - return; - - spin_lock_irqsave(&mm->context.lock, flags); - - if (unlikely(!CTX_VALID(mm->context))) - get_new_mmu_context(mm); - - spin_unlock_irqrestore(&mm->context.lock, flags); - - load_secondary_context(mm); - __flush_tlb_mm(CTX_HWBITS(mm->context), - SECONDARY_CONTEXT); -} - -void smp_new_mmu_context_version(void) -{ - smp_cross_call(&xcall_new_mmu_context_version, 0, 0, 0); -} - #ifdef CONFIG_KGDB void kgdb_roundup_cpus(unsigned long flags) { --- linux-azure-4.11.0.orig/arch/sparc/kernel/sys_sparc_64.c +++ linux-azure-4.11.0/arch/sparc/kernel/sys_sparc_64.c @@ -120,7 +120,7 @@ vma = find_vma(mm, addr); if (task_size - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } @@ -183,7 +183,7 @@ vma = find_vma(mm, addr); if (task_size - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } --- linux-azure-4.11.0.orig/arch/sparc/kernel/tsb.S +++ linux-azure-4.11.0/arch/sparc/kernel/tsb.S @@ -455,13 +455,16 @@ .type copy_tsb,#function copy_tsb: /* %o0=old_tsb_base, %o1=old_tsb_size * %o2=new_tsb_base, %o3=new_tsb_size + * %o4=page_size_shift */ sethi %uhi(TSB_PASS_BITS), %g7 srlx %o3, 4, %o3 - add %o0, %o1, %g1 /* end of old tsb */ + add %o0, %o1, %o1 /* end of old tsb */ sllx %g7, 32, %g7 sub %o3, 1, %o3 /* %o3 == new tsb hash mask */ + mov %o4, %g1 /* page_size_shift */ + 661: prefetcha [%o0] ASI_N, #one_read .section .tsb_phys_patch, "ax" .word 661b @@ -486,9 +489,9 @@ /* This can definitely be computed faster... */ srlx %o0, 4, %o5 /* Build index */ and %o5, 511, %o5 /* Mask index */ - sllx %o5, PAGE_SHIFT, %o5 /* Put into vaddr position */ + sllx %o5, %g1, %o5 /* Put into vaddr position */ or %o4, %o5, %o4 /* Full VADDR. */ - srlx %o4, PAGE_SHIFT, %o4 /* Shift down to create index */ + srlx %o4, %g1, %o4 /* Shift down to create index */ and %o4, %o3, %o4 /* Mask with new_tsb_nents-1 */ sllx %o4, 4, %o4 /* Shift back up into tsb ent offset */ TSB_STORE(%o2 + %o4, %g2) /* Store TAG */ @@ -496,7 +499,7 @@ TSB_STORE(%o2 + %o4, %g3) /* Store TTE */ 80: add %o0, 16, %o0 - cmp %o0, %g1 + cmp %o0, %o1 bne,pt %xcc, 90b nop --- linux-azure-4.11.0.orig/arch/sparc/kernel/ttable_64.S +++ linux-azure-4.11.0/arch/sparc/kernel/ttable_64.S @@ -50,7 +50,7 @@ tl0_irq1: TRAP_IRQ(smp_call_function_client, 1) tl0_irq2: TRAP_IRQ(smp_receive_signal_client, 2) tl0_irq3: TRAP_IRQ(smp_penguin_jailcell, 3) -tl0_irq4: TRAP_IRQ(smp_new_mmu_context_version_client, 4) +tl0_irq4: BTRAP(0x44) #else tl0_irq1: BTRAP(0x41) tl0_irq2: BTRAP(0x42) --- linux-azure-4.11.0.orig/arch/sparc/kernel/vio.c +++ linux-azure-4.11.0/arch/sparc/kernel/vio.c @@ -302,13 +302,16 @@ if (!id) { dev_set_name(&vdev->dev, "%s", bus_id_name); vdev->dev_no = ~(u64)0; + vdev->id = ~(u64)0; } else if (!cfg_handle) { dev_set_name(&vdev->dev, "%s-%llu", bus_id_name, *id); vdev->dev_no = *id; + vdev->id = ~(u64)0; } else { dev_set_name(&vdev->dev, "%s-%llu-%llu", bus_id_name, *cfg_handle, *id); vdev->dev_no = *cfg_handle; + vdev->id = *id; } vdev->dev.parent = parent; @@ -351,27 +354,84 @@ (void) vio_create_one(hp, node, &root_vdev->dev); } +struct vio_md_node_query { + const char *type; + u64 dev_no; + u64 id; +}; + static int vio_md_node_match(struct device *dev, void *arg) { + struct vio_md_node_query *query = (struct vio_md_node_query *) arg; struct vio_dev *vdev = to_vio_dev(dev); - if (vdev->mp == (u64) arg) - return 1; + if (vdev->dev_no != query->dev_no) + return 0; + if (vdev->id != query->id) + return 0; + if (strcmp(vdev->type, query->type)) + return 0; - return 0; + return 1; } static void vio_remove(struct mdesc_handle *hp, u64 node) { + const char *type; + const u64 *id, *cfg_handle; + u64 a; + struct vio_md_node_query query; struct device *dev; - dev = device_find_child(&root_vdev->dev, (void *) node, + type = mdesc_get_property(hp, node, "device-type", NULL); + if (!type) { + type = mdesc_get_property(hp, node, "name", NULL); + if (!type) + type = mdesc_node_name(hp, node); + } + + query.type = type; + + id = mdesc_get_property(hp, node, "id", NULL); + cfg_handle = NULL; + mdesc_for_each_arc(a, hp, node, MDESC_ARC_TYPE_BACK) { + u64 target; + + target = mdesc_arc_target(hp, a); + cfg_handle = mdesc_get_property(hp, target, + "cfg-handle", NULL); + if (cfg_handle) + break; + } + + if (!id) { + query.dev_no = ~(u64)0; + query.id = ~(u64)0; + } else if (!cfg_handle) { + query.dev_no = *id; + query.id = ~(u64)0; + } else { + query.dev_no = *cfg_handle; + query.id = *id; + } + + dev = device_find_child(&root_vdev->dev, &query, vio_md_node_match); if (dev) { printk(KERN_INFO "VIO: Removing device %s\n", dev_name(dev)); device_unregister(dev); put_device(dev); + } else { + if (!id) + printk(KERN_ERR "VIO: Removed unknown %s node.\n", + type); + else if (!cfg_handle) + printk(KERN_ERR "VIO: Removed unknown %s node %llu.\n", + type, *id); + else + printk(KERN_ERR "VIO: Removed unknown %s node %llu-%llu.\n", + type, *cfg_handle, *id); } } --- linux-azure-4.11.0.orig/arch/sparc/lib/GENbzero.S +++ linux-azure-4.11.0/arch/sparc/lib/GENbzero.S @@ -8,7 +8,7 @@ 98: x,y; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, __retl_o1; \ + .word 98b, __retl_o1_asi;\ .text; \ .align 4; --- linux-azure-4.11.0.orig/arch/sparc/lib/Makefile +++ linux-azure-4.11.0/arch/sparc/lib/Makefile @@ -15,6 +15,7 @@ lib-$(CONFIG_SPARC64) += atomic_64.o lib-$(CONFIG_SPARC32) += lshrdi3.o ashldi3.o lib-$(CONFIG_SPARC32) += muldi3.o bitext.o cmpdi2.o +lib-$(CONFIG_SPARC64) += multi3.o lib-$(CONFIG_SPARC64) += copy_page.o clear_page.o bzero.o lib-$(CONFIG_SPARC64) += csum_copy.o csum_copy_from_user.o csum_copy_to_user.o --- linux-azure-4.11.0.orig/arch/sparc/lib/NGbzero.S +++ linux-azure-4.11.0/arch/sparc/lib/NGbzero.S @@ -8,7 +8,7 @@ 98: x,y; \ .section __ex_table,"a";\ .align 4; \ - .word 98b, __retl_o1; \ + .word 98b, __retl_o1_asi;\ .text; \ .align 4; --- linux-azure-4.11.0.orig/arch/sparc/lib/atomic_64.S +++ linux-azure-4.11.0/arch/sparc/lib/atomic_64.S @@ -62,19 +62,23 @@ ENDPROC(atomic_fetch_##op); \ EXPORT_SYMBOL(atomic_fetch_##op); -#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_OP_RETURN(op) ATOMIC_FETCH_OP(op) +ATOMIC_OP(add) +ATOMIC_OP_RETURN(add) +ATOMIC_FETCH_OP(add) + +ATOMIC_OP(sub) +ATOMIC_OP_RETURN(sub) +ATOMIC_FETCH_OP(sub) -ATOMIC_OPS(add) -ATOMIC_OPS(sub) +ATOMIC_OP(and) +ATOMIC_FETCH_OP(and) -#undef ATOMIC_OPS -#define ATOMIC_OPS(op) ATOMIC_OP(op) ATOMIC_FETCH_OP(op) +ATOMIC_OP(or) +ATOMIC_FETCH_OP(or) -ATOMIC_OPS(and) -ATOMIC_OPS(or) -ATOMIC_OPS(xor) +ATOMIC_OP(xor) +ATOMIC_FETCH_OP(xor) -#undef ATOMIC_OPS #undef ATOMIC_FETCH_OP #undef ATOMIC_OP_RETURN #undef ATOMIC_OP @@ -124,19 +128,23 @@ ENDPROC(atomic64_fetch_##op); \ EXPORT_SYMBOL(atomic64_fetch_##op); -#define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_OP_RETURN(op) ATOMIC64_FETCH_OP(op) +ATOMIC64_OP(add) +ATOMIC64_OP_RETURN(add) +ATOMIC64_FETCH_OP(add) + +ATOMIC64_OP(sub) +ATOMIC64_OP_RETURN(sub) +ATOMIC64_FETCH_OP(sub) -ATOMIC64_OPS(add) -ATOMIC64_OPS(sub) +ATOMIC64_OP(and) +ATOMIC64_FETCH_OP(and) -#undef ATOMIC64_OPS -#define ATOMIC64_OPS(op) ATOMIC64_OP(op) ATOMIC64_FETCH_OP(op) +ATOMIC64_OP(or) +ATOMIC64_FETCH_OP(or) -ATOMIC64_OPS(and) -ATOMIC64_OPS(or) -ATOMIC64_OPS(xor) +ATOMIC64_OP(xor) +ATOMIC64_FETCH_OP(xor) -#undef ATOMIC64_OPS #undef ATOMIC64_FETCH_OP #undef ATOMIC64_OP_RETURN #undef ATOMIC64_OP --- linux-azure-4.11.0.orig/arch/sparc/lib/checksum_64.S +++ linux-azure-4.11.0/arch/sparc/lib/checksum_64.S @@ -38,6 +38,7 @@ .align 32 .globl csum_partial + .type csum_partial,#function EXPORT_SYMBOL(csum_partial) csum_partial: /* %o0=buff, %o1=len, %o2=sum */ prefetch [%o0 + 0x000], #n_reads --- linux-azure-4.11.0.orig/arch/sparc/lib/csum_copy.S +++ linux-azure-4.11.0/arch/sparc/lib/csum_copy.S @@ -65,6 +65,7 @@ add %o5, %o4, %o4 .globl FUNC_NAME + .type FUNC_NAME,#function EXPORT_SYMBOL(FUNC_NAME) FUNC_NAME: /* %o0=src, %o1=dst, %o2=len, %o3=sum */ LOAD(prefetch, %o0 + 0x000, #n_reads) --- linux-azure-4.11.0.orig/arch/sparc/lib/memscan_64.S +++ linux-azure-4.11.0/arch/sparc/lib/memscan_64.S @@ -14,6 +14,8 @@ .text .align 32 .globl __memscan_zero, __memscan_generic + .type __memscan_zero,#function + .type __memscan_generic,#function .globl memscan EXPORT_SYMBOL(__memscan_zero) EXPORT_SYMBOL(__memscan_generic) --- linux-azure-4.11.0.orig/arch/sparc/lib/memset.S +++ linux-azure-4.11.0/arch/sparc/lib/memset.S @@ -63,6 +63,7 @@ __bzero_begin: .globl __bzero + .type __bzero,#function .globl memset EXPORT_SYMBOL(__bzero) EXPORT_SYMBOL(memset) --- linux-azure-4.11.0.orig/arch/sparc/lib/multi3.S +++ linux-azure-4.11.0/arch/sparc/lib/multi3.S @@ -0,0 +1,35 @@ +#include +#include + + .text + .align 4 +ENTRY(__multi3) /* %o0 = u, %o1 = v */ + mov %o1, %g1 + srl %o3, 0, %g4 + mulx %g4, %g1, %o1 + srlx %g1, 0x20, %g3 + mulx %g3, %g4, %g5 + sllx %g5, 0x20, %o5 + srl %g1, 0, %g4 + sub %o1, %o5, %o5 + srlx %o5, 0x20, %o5 + addcc %g5, %o5, %g5 + srlx %o3, 0x20, %o5 + mulx %g4, %o5, %g4 + mulx %g3, %o5, %o5 + sethi %hi(0x80000000), %g3 + addcc %g5, %g4, %g5 + srlx %g5, 0x20, %g5 + add %g3, %g3, %g3 + movcc %xcc, %g0, %g3 + addcc %o5, %g5, %o5 + sllx %g4, 0x20, %g4 + add %o1, %g4, %o1 + add %o5, %g3, %g2 + mulx %g1, %o2, %g1 + add %g1, %g2, %g1 + mulx %o0, %o3, %o0 + retl + add %g1, %o0, %o0 +ENDPROC(__multi3) +EXPORT_SYMBOL(__multi3) --- linux-azure-4.11.0.orig/arch/sparc/mm/gup.c +++ linux-azure-4.11.0/arch/sparc/mm/gup.c @@ -78,8 +78,8 @@ return 0; refs = 0; - head = pmd_page(pmd); - page = head + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + page = pmd_page(pmd) + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + head = compound_head(page); do { VM_BUG_ON(compound_head(page) != head); pages[*nr] = page; --- linux-azure-4.11.0.orig/arch/sparc/mm/hugetlbpage.c +++ linux-azure-4.11.0/arch/sparc/mm/hugetlbpage.c @@ -120,7 +120,7 @@ addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (task_size - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } if (mm->get_unmapped_area == arch_get_unmapped_area) --- linux-azure-4.11.0.orig/arch/sparc/mm/init_32.c +++ linux-azure-4.11.0/arch/sparc/mm/init_32.c @@ -290,7 +290,7 @@ /* Saves us work later. */ - memset((void *)&empty_zero_page, 0, PAGE_SIZE); + memset((void *)empty_zero_page, 0, PAGE_SIZE); i = last_valid_pfn >> ((20 - PAGE_SHIFT) + 5); i += 1; --- linux-azure-4.11.0.orig/arch/sparc/mm/init_64.c +++ linux-azure-4.11.0/arch/sparc/mm/init_64.c @@ -358,7 +358,8 @@ } if ((hv_pgsz_mask & cpu_pgsz_mask) == 0U) { - pr_warn("hugepagesz=%llu not supported by MMU.\n", + hugetlb_bad_size(); + pr_err("hugepagesz=%llu not supported by MMU.\n", hugepage_size); goto out; } @@ -706,10 +707,58 @@ /* get_new_mmu_context() uses "cache + 1". */ DEFINE_SPINLOCK(ctx_alloc_lock); -unsigned long tlb_context_cache = CTX_FIRST_VERSION - 1; +unsigned long tlb_context_cache = CTX_FIRST_VERSION; #define MAX_CTX_NR (1UL << CTX_NR_BITS) #define CTX_BMAP_SLOTS BITS_TO_LONGS(MAX_CTX_NR) DECLARE_BITMAP(mmu_context_bmap, MAX_CTX_NR); +DEFINE_PER_CPU(struct mm_struct *, per_cpu_secondary_mm) = {0}; + +static void mmu_context_wrap(void) +{ + unsigned long old_ver = tlb_context_cache & CTX_VERSION_MASK; + unsigned long new_ver, new_ctx, old_ctx; + struct mm_struct *mm; + int cpu; + + bitmap_zero(mmu_context_bmap, 1 << CTX_NR_BITS); + + /* Reserve kernel context */ + set_bit(0, mmu_context_bmap); + + new_ver = (tlb_context_cache & CTX_VERSION_MASK) + CTX_FIRST_VERSION; + if (unlikely(new_ver == 0)) + new_ver = CTX_FIRST_VERSION; + tlb_context_cache = new_ver; + + /* + * Make sure that any new mm that are added into per_cpu_secondary_mm, + * are going to go through get_new_mmu_context() path. + */ + mb(); + + /* + * Updated versions to current on those CPUs that had valid secondary + * contexts + */ + for_each_online_cpu(cpu) { + /* + * If a new mm is stored after we took this mm from the array, + * it will go into get_new_mmu_context() path, because we + * already bumped the version in tlb_context_cache. + */ + mm = per_cpu(per_cpu_secondary_mm, cpu); + + if (unlikely(!mm || mm == &init_mm)) + continue; + + old_ctx = mm->context.sparc64_ctx_val; + if (likely((old_ctx & CTX_VERSION_MASK) == old_ver)) { + new_ctx = (old_ctx & ~CTX_VERSION_MASK) | new_ver; + set_bit(new_ctx & CTX_NR_MASK, mmu_context_bmap); + mm->context.sparc64_ctx_val = new_ctx; + } + } +} /* Caller does TLB context flushing on local CPU if necessary. * The caller also ensures that CTX_VALID(mm->context) is false. @@ -725,48 +774,30 @@ { unsigned long ctx, new_ctx; unsigned long orig_pgsz_bits; - int new_version; spin_lock(&ctx_alloc_lock); +retry: + /* wrap might have happened, test again if our context became valid */ + if (unlikely(CTX_VALID(mm->context))) + goto out; orig_pgsz_bits = (mm->context.sparc64_ctx_val & CTX_PGSZ_MASK); ctx = (tlb_context_cache + 1) & CTX_NR_MASK; new_ctx = find_next_zero_bit(mmu_context_bmap, 1 << CTX_NR_BITS, ctx); - new_version = 0; if (new_ctx >= (1 << CTX_NR_BITS)) { new_ctx = find_next_zero_bit(mmu_context_bmap, ctx, 1); if (new_ctx >= ctx) { - int i; - new_ctx = (tlb_context_cache & CTX_VERSION_MASK) + - CTX_FIRST_VERSION; - if (new_ctx == 1) - new_ctx = CTX_FIRST_VERSION; - - /* Don't call memset, for 16 entries that's just - * plain silly... - */ - mmu_context_bmap[0] = 3; - mmu_context_bmap[1] = 0; - mmu_context_bmap[2] = 0; - mmu_context_bmap[3] = 0; - for (i = 4; i < CTX_BMAP_SLOTS; i += 4) { - mmu_context_bmap[i + 0] = 0; - mmu_context_bmap[i + 1] = 0; - mmu_context_bmap[i + 2] = 0; - mmu_context_bmap[i + 3] = 0; - } - new_version = 1; - goto out; + mmu_context_wrap(); + goto retry; } } + if (mm->context.sparc64_ctx_val) + cpumask_clear(mm_cpumask(mm)); mmu_context_bmap[new_ctx>>6] |= (1UL << (new_ctx & 63)); new_ctx |= (tlb_context_cache & CTX_VERSION_MASK); -out: tlb_context_cache = new_ctx; mm->context.sparc64_ctx_val = new_ctx | orig_pgsz_bits; +out: spin_unlock(&ctx_alloc_lock); - - if (unlikely(new_version)) - smp_new_mmu_context_version(); } static int numa_enabled = 1; --- linux-azure-4.11.0.orig/arch/sparc/mm/tsb.c +++ linux-azure-4.11.0/arch/sparc/mm/tsb.c @@ -496,7 +496,8 @@ extern void copy_tsb(unsigned long old_tsb_base, unsigned long old_tsb_size, unsigned long new_tsb_base, - unsigned long new_tsb_size); + unsigned long new_tsb_size, + unsigned long page_size_shift); unsigned long old_tsb_base = (unsigned long) old_tsb; unsigned long new_tsb_base = (unsigned long) new_tsb; @@ -504,7 +505,9 @@ old_tsb_base = __pa(old_tsb_base); new_tsb_base = __pa(new_tsb_base); } - copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size); + copy_tsb(old_tsb_base, old_size, new_tsb_base, new_size, + tsb_index == MM_TSB_BASE ? + PAGE_SHIFT : REAL_HPAGE_SHIFT); } mm->context.tsb_block[tsb_index].tsb = new_tsb; --- linux-azure-4.11.0.orig/arch/sparc/mm/ultra.S +++ linux-azure-4.11.0/arch/sparc/mm/ultra.S @@ -971,11 +971,6 @@ wr %g0, (1 << PIL_SMP_CAPTURE), %set_softint retry - .globl xcall_new_mmu_context_version -xcall_new_mmu_context_version: - wr %g0, (1 << PIL_SMP_CTX_NEW_VERSION), %set_softint - retry - #ifdef CONFIG_KGDB .globl xcall_kgdb_capture xcall_kgdb_capture: --- linux-azure-4.11.0.orig/arch/tile/mm/hugetlbpage.c +++ linux-azure-4.11.0/arch/tile/mm/hugetlbpage.c @@ -233,7 +233,7 @@ addr = ALIGN(addr, huge_page_size(h)); vma = find_vma(mm, addr); if (TASK_SIZE - len >= addr && - (!vma || addr + len <= vma->vm_start)) + (!vma || addr + len <= vm_start_gap(vma))) return addr; } if (current->mm->get_unmapped_area == arch_get_unmapped_area) --- linux-azure-4.11.0.orig/arch/um/kernel/initrd.c +++ linux-azure-4.11.0/arch/um/kernel/initrd.c @@ -14,7 +14,7 @@ static char *initrd __initdata = NULL; static int load_initrd(char *filename, void *buf, int size); -static int __init read_initrd(void) +int __init read_initrd(void) { void *area; long long size; @@ -46,8 +46,6 @@ return 0; } -__uml_postsetup(read_initrd); - static int __init uml_initrd_setup(char *line, int *add) { initrd = line; --- linux-azure-4.11.0.orig/arch/um/kernel/um_arch.c +++ linux-azure-4.11.0/arch/um/kernel/um_arch.c @@ -338,11 +338,17 @@ return start_uml(); } +int __init __weak read_initrd(void) +{ + return 0; +} + void __init setup_arch(char **cmdline_p) { stack_protections((unsigned long) &init_thread_info); setup_physmem(uml_physmem, uml_reserved, physmem_size, highmem); mem_total_pages(physmem_size, iomem_size, highmem); + read_initrd(); paging_init(); strlcpy(boot_command_line, command_line, COMMAND_LINE_SIZE); --- linux-azure-4.11.0.orig/arch/x86/Kconfig +++ linux-azure-4.11.0/arch/x86/Kconfig @@ -1817,6 +1817,28 @@ If unsure, say N. +config EFI_SECURE_BOOT_LOCK_DOWN + def_bool n + depends on EFI + prompt "Lock down the kernel when UEFI Secure Boot is enabled" + ---help--- + UEFI Secure Boot provides a mechanism for ensuring that the firmware + will only load signed bootloaders and kernels. Certain use cases may + also require that all kernel modules also be signed and that + userspace is prevented from directly changing the running kernel + image. Say Y here to automatically lock down the kernel when a + system boots with UEFI Secure Boot enabled. + +config EFI_ALLOW_SECURE_BOOT_EXIT + def_bool n + depends on EFI_SECURE_BOOT_LOCK_DOWN && MAGIC_SYSRQ + select ALLOW_LOCKDOWN_LIFT + prompt "Allow secure boot mode to be exited with SysRq+x on a keyboard" + ---help--- + Allow secure boot mode to be exited and the kernel lockdown lifted by + typing SysRq+x on a keyboard attached to the system (not permitted + through procfs). + config SECCOMP def_bool y prompt "Enable seccomp to safely compute untrusted bytecode" @@ -2791,6 +2813,8 @@ source "drivers/Kconfig" +source "ubuntu/Kconfig" + source "drivers/firmware/Kconfig" source "fs/Kconfig" --- linux-azure-4.11.0.orig/arch/x86/boot/boot.h +++ linux-azure-4.11.0/arch/x86/boot/boot.h @@ -16,7 +16,7 @@ #ifndef BOOT_BOOT_H #define BOOT_BOOT_H -#define STACK_SIZE 512 /* Minimum number of bytes for stack */ +#define STACK_SIZE 1024 /* Minimum number of bytes for stack */ #ifndef __ASSEMBLY__ --- linux-azure-4.11.0.orig/arch/x86/boot/compressed/Makefile +++ linux-azure-4.11.0/arch/x86/boot/compressed/Makefile @@ -94,7 +94,7 @@ quiet_cmd_check_data_rel = DATAREL $@ define cmd_check_data_rel for obj in $(filter %.o,$^); do \ - readelf -S $$obj | grep -qF .rel.local && { \ + ${CROSS_COMPILE}readelf -S $$obj | grep -qF .rel.local && { \ echo "error: $$obj has data relocations!" >&2; \ exit 1; \ } || true; \ --- linux-azure-4.11.0.orig/arch/x86/boot/compressed/eboot.c +++ linux-azure-4.11.0/arch/x86/boot/compressed/eboot.c @@ -12,6 +12,7 @@ #include #include #include +#include #include "../string.h" #include "eboot.h" @@ -604,6 +605,31 @@ } } +#define MEMORY_ONLY_RESET_CONTROL_GUID \ + EFI_GUID (0xe20939be, 0x32d4, 0x41be, 0xa1, 0x50, 0x89, 0x7f, 0x85, 0xd4, 0x98, 0x29) + +static void enable_reset_attack_mitigation(void) +{ + static const efi_guid_t var_guid = MEMORY_ONLY_RESET_CONTROL_GUID; + static const efi_char16_t MemoryOverwriteRequestControl_name[] = { + 'M', 'e', 'm', 'o', 'r', 'y', + 'O', 'v', 'e', 'r', 'w', 'r', 'i', 't', 'e', + 'R', 'e', 'q', 'u', 'e', 's', 't', + 'C', 'o', 'n', 't', 'r', 'o', 'l', + 0 + }; + u8 val = 1; + + /* Ignore the return value here - there's not really a lot we can do */ + efi_call_runtime(set_variable, + (efi_char16_t *)MemoryOverwriteRequestControl_name, + (efi_guid_t *)&var_guid, + EFI_VARIABLE_NON_VOLATILE | + EFI_VARIABLE_BOOTSERVICE_ACCESS | + EFI_VARIABLE_RUNTIME_ACCESS, + sizeof(val), val); +} + /* * Because the x86 boot code expects to be passed a boot_params we * need to create one ourselves (usually the bootloader would create @@ -988,6 +1014,11 @@ else setup_boot_services32(efi_early); + /* Ask the firmware to clear memory if we don't have a clean shutdown */ + enable_reset_attack_mitigation(); + + sanitize_boot_params(boot_params); + /* * If the boot loader gave us a value for secure_boot then we use that, * otherwise we ask the BIOS. --- linux-azure-4.11.0.orig/arch/x86/boot/compressed/kaslr.c +++ linux-azure-4.11.0/arch/x86/boot/compressed/kaslr.c @@ -564,9 +564,6 @@ { unsigned long random_addr, min_addr; - /* By default, keep output position unchanged. */ - *virt_addr = *output; - if (cmdline_find_option_bool("nokaslr")) { warn("KASLR disabled: 'nokaslr' on cmdline."); return; @@ -597,10 +594,17 @@ add_identity_map(random_addr, output_size); *output = random_addr; } + + /* + * This loads the identity mapping page table. + * This should only be done if a new physical address + * is found for the kernel, otherwise we should keep + * the old page table to make it be like the "nokaslr" + * case. + */ + finalize_identity_maps(); } - /* This actually loads the identity pagetable on x86_64. */ - finalize_identity_maps(); /* Pick random virtual address starting from LOAD_PHYSICAL_ADDR. */ if (IS_ENABLED(CONFIG_X86_64)) --- linux-azure-4.11.0.orig/arch/x86/boot/compressed/misc.c +++ linux-azure-4.11.0/arch/x86/boot/compressed/misc.c @@ -338,7 +338,7 @@ unsigned long output_len) { const unsigned long kernel_total_size = VO__end - VO__text; - unsigned long virt_addr = (unsigned long)output; + unsigned long virt_addr = LOAD_PHYSICAL_ADDR; /* Retain x86 boot parameters pointer passed from startup_32/64. */ boot_params = rmode; @@ -397,7 +397,7 @@ #ifndef CONFIG_RELOCATABLE if ((unsigned long)output != LOAD_PHYSICAL_ADDR) error("Destination address does not match LOAD_PHYSICAL_ADDR"); - if ((unsigned long)output != virt_addr) + if (virt_addr != LOAD_PHYSICAL_ADDR) error("Destination virtual address changed when not relocatable"); #endif --- linux-azure-4.11.0.orig/arch/x86/boot/compressed/misc.h +++ linux-azure-4.11.0/arch/x86/boot/compressed/misc.h @@ -81,8 +81,6 @@ unsigned long output_size, unsigned long *virt_addr) { - /* No change from existing output location. */ - *virt_addr = *output; } #endif --- linux-azure-4.11.0.orig/arch/x86/crypto/sha1_ssse3_glue.c +++ linux-azure-4.11.0/arch/x86/crypto/sha1_ssse3_glue.c @@ -201,7 +201,7 @@ static bool avx2_usable(void) { - if (avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) + if (false && avx_usable() && boot_cpu_has(X86_FEATURE_AVX2) && boot_cpu_has(X86_FEATURE_BMI1) && boot_cpu_has(X86_FEATURE_BMI2)) return true; --- linux-azure-4.11.0.orig/arch/x86/entry/entry_32.S +++ linux-azure-4.11.0/arch/x86/entry/entry_32.S @@ -255,6 +255,23 @@ END(__switch_to_asm) /* + * The unwinder expects the last frame on the stack to always be at the same + * offset from the end of the page, which allows it to validate the stack. + * Calling schedule_tail() directly would break that convention because its an + * asmlinkage function so its argument has to be pushed on the stack. This + * wrapper creates a proper "end of stack" frame header before the call. + */ +ENTRY(schedule_tail_wrapper) + FRAME_BEGIN + + pushl %eax + call schedule_tail + popl %eax + + FRAME_END + ret +ENDPROC(schedule_tail_wrapper) +/* * A newly forked process directly context switches into this address. * * eax: prev task we switched from @@ -262,24 +279,15 @@ * edi: kernel thread arg */ ENTRY(ret_from_fork) - FRAME_BEGIN /* help unwinder find end of stack */ - - /* - * schedule_tail() is asmlinkage so we have to put its 'prev' argument - * on the stack. - */ - pushl %eax - call schedule_tail - popl %eax + call schedule_tail_wrapper testl %ebx, %ebx jnz 1f /* kernel threads are uncommon */ 2: /* When we fork, we trace the syscall return in the child, too. */ - leal FRAME_OFFSET(%esp), %eax + movl %esp, %eax call syscall_return_slowpath - FRAME_END jmp restore_all /* kernel thread */ --- linux-azure-4.11.0.orig/arch/x86/entry/entry_64.S +++ linux-azure-4.11.0/arch/x86/entry/entry_64.S @@ -36,7 +36,6 @@ #include #include #include -#include #include .code64 @@ -409,19 +408,17 @@ * r12: kernel thread arg */ ENTRY(ret_from_fork) - FRAME_BEGIN /* help unwinder find end of stack */ movq %rax, %rdi - call schedule_tail /* rdi: 'prev' task parameter */ + call schedule_tail /* rdi: 'prev' task parameter */ - testq %rbx, %rbx /* from kernel_thread? */ - jnz 1f /* kernel threads are uncommon */ + testq %rbx, %rbx /* from kernel_thread? */ + jnz 1f /* kernel threads are uncommon */ 2: - leaq FRAME_OFFSET(%rsp),%rdi /* pt_regs pointer */ + movq %rsp, %rdi call syscall_return_slowpath /* returns with IRQs disabled */ TRACE_IRQS_ON /* user mode is traced as IRQS on */ SWAPGS - FRAME_END jmp restore_regs_and_iret 1: --- linux-azure-4.11.0.orig/arch/x86/entry/vdso/vclock_gettime.c +++ linux-azure-4.11.0/arch/x86/entry/vdso/vclock_gettime.c @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include @@ -32,6 +33,11 @@ __attribute__((visibility("hidden"))); #endif +#ifdef CONFIG_HYPERV_TSCPAGE +extern u8 hvclock_page + __attribute__((visibility("hidden"))); +#endif + #ifndef BUILD_VDSO32 notrace static long vdso_fallback_gettime(long clock, struct timespec *ts) @@ -141,6 +147,20 @@ return last; } #endif +#ifdef CONFIG_HYPERV_TSCPAGE +static notrace u64 vread_hvclock(int *mode) +{ + const struct ms_hyperv_tsc_page *tsc_pg = + (const struct ms_hyperv_tsc_page *)&hvclock_page; + u64 current_tick = hv_read_tsc_page(tsc_pg); + + if (current_tick != U64_MAX) + return current_tick; + + *mode = VCLOCK_NONE; + return 0; +} +#endif notrace static u64 vread_tsc(void) { @@ -173,6 +193,10 @@ else if (gtod->vclock_mode == VCLOCK_PVCLOCK) cycles = vread_pvclock(mode); #endif +#ifdef CONFIG_HYPERV_TSCPAGE + else if (gtod->vclock_mode == VCLOCK_HVCLOCK) + cycles = vread_hvclock(mode); +#endif else return 0; v = (cycles - gtod->cycle_last) & gtod->mask; --- linux-azure-4.11.0.orig/arch/x86/entry/vdso/vdso-layout.lds.S +++ linux-azure-4.11.0/arch/x86/entry/vdso/vdso-layout.lds.S @@ -25,7 +25,7 @@ * segment. */ - vvar_start = . - 2 * PAGE_SIZE; + vvar_start = . - 3 * PAGE_SIZE; vvar_page = vvar_start; /* Place all vvars at the offsets in asm/vvar.h. */ @@ -36,6 +36,7 @@ #undef EMIT_VVAR pvclock_page = vvar_start + PAGE_SIZE; + hvclock_page = vvar_start + 2 * PAGE_SIZE; . = SIZEOF_HEADERS; --- linux-azure-4.11.0.orig/arch/x86/entry/vdso/vdso2c.c +++ linux-azure-4.11.0/arch/x86/entry/vdso/vdso2c.c @@ -74,6 +74,7 @@ sym_vvar_page, sym_hpet_page, sym_pvclock_page, + sym_hvclock_page, sym_VDSO_FAKE_SECTION_TABLE_START, sym_VDSO_FAKE_SECTION_TABLE_END, }; @@ -82,6 +83,7 @@ sym_vvar_page, sym_hpet_page, sym_pvclock_page, + sym_hvclock_page, }; struct vdso_sym { @@ -94,6 +96,7 @@ [sym_vvar_page] = {"vvar_page", true}, [sym_hpet_page] = {"hpet_page", true}, [sym_pvclock_page] = {"pvclock_page", true}, + [sym_hvclock_page] = {"hvclock_page", true}, [sym_VDSO_FAKE_SECTION_TABLE_START] = { "VDSO_FAKE_SECTION_TABLE_START", false }, --- linux-azure-4.11.0.orig/arch/x86/entry/vdso/vma.c +++ linux-azure-4.11.0/arch/x86/entry/vdso/vma.c @@ -22,6 +22,7 @@ #include #include #include +#include #if defined(CONFIG_X86_64) unsigned int __read_mostly vdso64_enabled = 1; @@ -121,6 +122,12 @@ vmf->address, __pa(pvti) >> PAGE_SHIFT); } + } else if (sym_offset == image->sym_hvclock_page) { + struct ms_hyperv_tsc_page *tsc_pg = hv_get_tsc_page(); + + if (tsc_pg && vclock_was_used(VCLOCK_HVCLOCK)) + ret = vm_insert_pfn(vma, vmf->address, + vmalloc_to_pfn(tsc_pg)); } if (ret == 0 || ret == -EBUSY) --- linux-azure-4.11.0.orig/arch/x86/events/intel/core.c +++ linux-azure-4.11.0/arch/x86/events/intel/core.c @@ -431,11 +431,11 @@ [ C(DTLB) ] = { [ C(OP_READ) ] = { [ C(RESULT_ACCESS) ] = 0x81d0, /* MEM_INST_RETIRED.ALL_LOADS */ - [ C(RESULT_MISS) ] = 0x608, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ + [ C(RESULT_MISS) ] = 0xe08, /* DTLB_LOAD_MISSES.WALK_COMPLETED */ }, [ C(OP_WRITE) ] = { [ C(RESULT_ACCESS) ] = 0x82d0, /* MEM_INST_RETIRED.ALL_STORES */ - [ C(RESULT_MISS) ] = 0x649, /* DTLB_STORE_MISSES.WALK_COMPLETED */ + [ C(RESULT_MISS) ] = 0xe49, /* DTLB_STORE_MISSES.WALK_COMPLETED */ }, [ C(OP_PREFETCH) ] = { [ C(RESULT_ACCESS) ] = 0x0, @@ -2130,7 +2130,7 @@ * counters from the GLOBAL_STATUS mask and we always process PEBS * events via drain_pebs(). */ - status &= ~cpuc->pebs_enabled; + status &= ~(cpuc->pebs_enabled & PEBS_COUNTER_MASK); /* * PEBS overflow sets bit 62 in the global status register --- linux-azure-4.11.0.orig/arch/x86/events/intel/ds.c +++ linux-azure-4.11.0/arch/x86/events/intel/ds.c @@ -1222,7 +1222,7 @@ /* clear non-PEBS bit and re-check */ pebs_status = p->status & cpuc->pebs_enabled; - pebs_status &= (1ULL << MAX_PEBS_EVENTS) - 1; + pebs_status &= PEBS_COUNTER_MASK; if (pebs_status == (1 << bit)) return at; } --- linux-azure-4.11.0.orig/arch/x86/events/intel/rapl.c +++ linux-azure-4.11.0/arch/x86/events/intel/rapl.c @@ -761,7 +761,7 @@ X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_CORE, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_GT3E, hsw_rapl_init), - X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsw_rapl_init), + X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_X, hsx_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_BROADWELL_XEON_D, hsw_rapl_init), X86_RAPL_MODEL_MATCH(INTEL_FAM6_XEON_PHI_KNL, knl_rapl_init), --- linux-azure-4.11.0.orig/arch/x86/events/intel/uncore.c +++ linux-azure-4.11.0/arch/x86/events/intel/uncore.c @@ -1170,7 +1170,7 @@ pmu = type->pmus; for (i = 0; i < type->num_boxes; i++, pmu++) { box = pmu->boxes[pkg]; - if (!box && atomic_inc_return(&box->refcnt) == 1) + if (box && atomic_inc_return(&box->refcnt) == 1) uncore_box_init(box); } } --- linux-azure-4.11.0.orig/arch/x86/events/perf_event.h +++ linux-azure-4.11.0/arch/x86/events/perf_event.h @@ -79,6 +79,7 @@ /* The maximal number of PEBS events: */ #define MAX_PEBS_EVENTS 8 +#define PEBS_COUNTER_MASK ((1ULL << MAX_PEBS_EVENTS) - 1) /* * Flags PEBS can handle without an PMI. --- linux-azure-4.11.0.orig/arch/x86/hyperv/Makefile +++ linux-azure-4.11.0/arch/x86/hyperv/Makefile @@ -1 +1 @@ -obj-y := hv_init.o +obj-y := hv_init.o mmu.o --- linux-azure-4.11.0.orig/arch/x86/hyperv/hv_init.c +++ linux-azure-4.11.0/arch/x86/hyperv/hv_init.c @@ -24,48 +24,35 @@ #include #include #include +#include #include +#include +#include +#ifndef PKG_ABI +/* + * Preserve the ability to 'make deb-pkg' since PKG_ABI is provided + * by the Ubuntu build rules. + */ +#define PKG_ABI 0 +#endif -#ifdef CONFIG_X86_64 +#ifdef CONFIG_HYPERV_TSCPAGE static struct ms_hyperv_tsc_page *tsc_pg; +struct ms_hyperv_tsc_page *hv_get_tsc_page(void) +{ + return tsc_pg; +} + static u64 read_hv_clock_tsc(struct clocksource *arg) { - u64 current_tick; + u64 current_tick = hv_read_tsc_page(tsc_pg); + + if (current_tick == U64_MAX) + rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); - if (tsc_pg->tsc_sequence != 0) { - /* - * Use the tsc page to compute the value. - */ - - while (1) { - u64 tmp; - u32 sequence = tsc_pg->tsc_sequence; - u64 cur_tsc; - u64 scale = tsc_pg->tsc_scale; - s64 offset = tsc_pg->tsc_offset; - - rdtscll(cur_tsc); - /* current_tick = ((cur_tsc *scale) >> 64) + offset */ - asm("mulq %3" - : "=d" (current_tick), "=a" (tmp) - : "a" (cur_tsc), "r" (scale)); - - current_tick += offset; - if (tsc_pg->tsc_sequence == sequence) - return current_tick; - - if (tsc_pg->tsc_sequence != 0) - continue; - /* - * Fallback using MSR method. - */ - break; - } - } - rdmsrl(HV_X64_MSR_TIME_REF_COUNT, current_tick); return current_tick; } @@ -98,10 +85,25 @@ .flags = CLOCK_SOURCE_IS_CONTINUOUS, }; -static void *hypercall_pg; +void *hv_hypercall_pg; +EXPORT_SYMBOL_GPL(hv_hypercall_pg); struct clocksource *hyperv_cs; EXPORT_SYMBOL_GPL(hyperv_cs); +u32 *hv_vp_index; +EXPORT_SYMBOL_GPL(hv_vp_index); + +static int hv_cpu_init(unsigned int cpu) +{ + u64 msr_vp_index; + + hv_get_vp_index(msr_vp_index); + + hv_vp_index[smp_processor_id()] = (u32)msr_vp_index; + + return 0; +} + /* * This function is to be invoked early in the boot sequence after the * hypervisor has been detected. @@ -117,29 +119,41 @@ if (x86_hyper != &x86_hyper_ms_hyperv) return; + /* Allocate percpu VP index */ + hv_vp_index = kcalloc(num_possible_cpus(), sizeof(*hv_vp_index), + GFP_KERNEL); + if (!hv_vp_index) + return; + + if (cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "x86/hyperv_init:online", + hv_cpu_init, NULL) < 0) + goto free_vp_index; + /* * Setup the hypercall page and enable hypercalls. * 1. Register the guest ID * 2. Enable the hypercall and register the hypercall page */ - guest_id = generate_guest_id(0, LINUX_VERSION_CODE, 0); + guest_id = generate_guest_id(0x80 /*Canonical*/, LINUX_VERSION_CODE, PKG_ABI); wrmsrl(HV_X64_MSR_GUEST_OS_ID, guest_id); - hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); - if (hypercall_pg == NULL) { + hv_hypercall_pg = __vmalloc(PAGE_SIZE, GFP_KERNEL, PAGE_KERNEL_RX); + if (hv_hypercall_pg == NULL) { wrmsrl(HV_X64_MSR_GUEST_OS_ID, 0); - return; + goto free_vp_index; } rdmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); hypercall_msr.enable = 1; - hypercall_msr.guest_physical_address = vmalloc_to_pfn(hypercall_pg); + hypercall_msr.guest_physical_address = vmalloc_to_pfn(hv_hypercall_pg); wrmsrl(HV_X64_MSR_HYPERCALL, hypercall_msr.as_uint64); + hyper_alloc_mmu(); + /* * Register Hyper-V specific clocksource. */ -#ifdef CONFIG_X86_64 +#ifdef CONFIG_HYPERV_TSCPAGE if (ms_hyperv.features & HV_X64_MSR_REFERENCE_TSC_AVAILABLE) { union hv_x64_msr_hypercall_contents tsc_msr; @@ -155,6 +169,9 @@ tsc_msr.guest_physical_address = vmalloc_to_pfn(tsc_pg); wrmsrl(HV_X64_MSR_REFERENCE_TSC, tsc_msr.as_uint64); + + hyperv_cs_tsc.archdata.vclock_mode = VCLOCK_HVCLOCK; + clocksource_register_hz(&hyperv_cs_tsc, NSEC_PER_SEC/100); return; } @@ -168,6 +185,12 @@ hyperv_cs = &hyperv_cs_msr; if (ms_hyperv.features & HV_X64_MSR_TIME_REF_COUNT_AVAILABLE) clocksource_register_hz(&hyperv_cs_msr, NSEC_PER_SEC/100); + + return; + +free_vp_index: + kfree(hv_vp_index); + hv_vp_index = NULL; } /* @@ -190,51 +213,6 @@ } EXPORT_SYMBOL_GPL(hyperv_cleanup); -/* - * hv_do_hypercall- Invoke the specified hypercall - */ -u64 hv_do_hypercall(u64 control, void *input, void *output) -{ - u64 input_address = (input) ? virt_to_phys(input) : 0; - u64 output_address = (output) ? virt_to_phys(output) : 0; -#ifdef CONFIG_X86_64 - u64 hv_status = 0; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__("mov %0, %%r8" : : "r" (output_address) : "r8"); - __asm__ __volatile__("call *%3" : "=a" (hv_status) : - "c" (control), "d" (input_address), - "m" (hypercall_pg)); - - return hv_status; - -#else - - u32 control_hi = control >> 32; - u32 control_lo = control & 0xFFFFFFFF; - u32 hv_status_hi = 1; - u32 hv_status_lo = 1; - u32 input_address_hi = input_address >> 32; - u32 input_address_lo = input_address & 0xFFFFFFFF; - u32 output_address_hi = output_address >> 32; - u32 output_address_lo = output_address & 0xFFFFFFFF; - - if (!hypercall_pg) - return (u64)ULLONG_MAX; - - __asm__ __volatile__ ("call *%8" : "=d"(hv_status_hi), - "=a"(hv_status_lo) : "d" (control_hi), - "a" (control_lo), "b" (input_address_hi), - "c" (input_address_lo), "D"(output_address_hi), - "S"(output_address_lo), "m" (hypercall_pg)); - - return hv_status_lo | ((u64)hv_status_hi << 32); -#endif /* !x86_64 */ -} -EXPORT_SYMBOL_GPL(hv_do_hypercall); - void hyperv_report_panic(struct pt_regs *regs) { static bool panic_reported; --- linux-azure-4.11.0.orig/arch/x86/hyperv/mmu.c +++ linux-azure-4.11.0/arch/x86/hyperv/mmu.c @@ -0,0 +1,270 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define CREATE_TRACE_POINTS +DEFINE_TRACE(hyperv_mmu_flush_tlb_others); + +/* HvFlushVirtualAddressSpace, HvFlushVirtualAddressList hypercalls */ +struct hv_flush_pcpu { + __u64 address_space; + __u64 flags; + __u64 processor_mask; + __u64 gva_list[]; +}; + +/* HvFlushVirtualAddressSpaceEx, HvFlushVirtualAddressListEx hypercalls */ +struct hv_flush_pcpu_ex { + __u64 address_space; + __u64 flags; + struct { + __u64 format; + __u64 valid_bank_mask; + __u64 bank_contents[]; + } hv_vp_set; + __u64 gva_list[]; +}; + +static struct hv_flush_pcpu __percpu *pcpu_flush; + +static struct hv_flush_pcpu_ex __percpu *pcpu_flush_ex; + +static inline int cpumask_to_vp_set(struct hv_flush_pcpu_ex *flush, + const struct cpumask *cpus) +{ + int cur_bank, cpu, vcpu, nr_bank = 0; + bool has_cpus; + + /* + * We can't be sure that translated vcpu numbers will always be + * in ascending order, so iterate over all possible banks and + * check all vcpus in it instead. + */ + for (cur_bank = 0; cur_bank < ms_hyperv.max_vp_index/64; cur_bank++) { + has_cpus = false; + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu/64 != cur_bank) + continue; + if (!has_cpus) { + flush->hv_vp_set.valid_bank_mask |= + 1 << vcpu / 64; + flush->hv_vp_set.bank_contents[nr_bank] = + 1 << vcpu % 64; + has_cpus = true; + } else { + flush->hv_vp_set.bank_contents[nr_bank] |= + 1 << vcpu % 64; + } + } + if (has_cpus) + nr_bank++; + } + + return nr_bank; +} + +static void hyperv_flush_tlb_others(const struct cpumask *cpus, + struct mm_struct *mm, unsigned long start, + unsigned long end) +{ + struct hv_flush_pcpu *flush; + unsigned long cur, flags; + u64 status = -1ULL; + int cpu, vcpu, gva_n, max_gvas; + + trace_hyperv_mmu_flush_tlb_others(cpus, mm, start, end); + + if (!pcpu_flush || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush = this_cpu_ptr(pcpu_flush); + + if (mm) { + flush->address_space = virt_to_phys(mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->processor_mask = 0; + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + for_each_cpu(cpu, cpus) { + vcpu = hv_cpu_number_to_vp_number(cpu); + if (vcpu != -1 && vcpu < 64) + flush->processor_mask |= 1 << vcpu; + else + goto do_native; + } + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = (PAGE_SIZE - sizeof(*flush)) / 8; + + if (end == TLB_FLUSH_ALL || + (end && ((end - start)/(PAGE_SIZE*PAGE_SIZE)) > max_gvas)) { + if (end == TLB_FLUSH_ALL) + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + status = hv_do_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE, + flush, NULL); + } else { + cur = start; + gva_n = 0; + do { + flush->gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). + */ + if (end >= cur + PAGE_SIZE * PAGE_SIZE) + flush->gva_list[gva_n] |= ~PAGE_MASK; + else if (end > cur) + flush->gva_list[gva_n] |= + (end - cur - 1) >> PAGE_SHIFT; + + cur += PAGE_SIZE * PAGE_SIZE; + ++gva_n; + + } while (cur < end); + + status = hv_do_rep_hypercall(HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST, + gva_n, 0, flush, NULL); + + } + + local_irq_restore(flags); + + if (!(status & 0xffff)) + return; +do_native: + native_flush_tlb_others(cpus, mm, start, end); +} + +static void hyperv_flush_tlb_others_ex(const struct cpumask *cpus, + struct mm_struct *mm, + unsigned long start, + unsigned long end) +{ + struct hv_flush_pcpu_ex *flush; + unsigned long cur, flags; + u64 status = -1ULL; + int nr_bank = 0, max_gvas, gva_n; + + trace_hyperv_mmu_flush_tlb_others(cpus, mm, start, end); + + if (!pcpu_flush_ex || !hv_hypercall_pg) + goto do_native; + + if (cpumask_empty(cpus)) + return; + + local_irq_save(flags); + + flush = this_cpu_ptr(pcpu_flush_ex); + + if (mm) { + flush->address_space = virt_to_phys(mm->pgd); + flush->flags = 0; + } else { + flush->address_space = 0; + flush->flags = HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES; + } + + flush->hv_vp_set.valid_bank_mask = 0; + + if (cpumask_equal(cpus, cpu_present_mask)) { + flush->hv_vp_set.format = HV_GENERIC_SET_ALL; + flush->flags |= HV_FLUSH_ALL_PROCESSORS; + } else { + flush->hv_vp_set.format = HV_GENERIC_SET_SPARCE_4K; + nr_bank = cpumask_to_vp_set(flush, cpus); + } + + /* + * We can flush not more than max_gvas with one hypercall. Flush the + * whole address space if we were asked to do more. + */ + max_gvas = (PAGE_SIZE - sizeof(*flush) - nr_bank*8) / 8; + + if (end == TLB_FLUSH_ALL || + (end && ((end - start)/(PAGE_SIZE*PAGE_SIZE)) > max_gvas)) { + if (end == TLB_FLUSH_ALL) + flush->flags |= HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY; + + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX, + 0, nr_bank + 2, flush, NULL); + } else { + cur = start; + gva_n = nr_bank; + do { + flush->gva_list[gva_n] = cur & PAGE_MASK; + /* + * Lower 12 bits encode the number of additional + * pages to flush (in addition to the 'cur' page). + */ + if (end >= cur + PAGE_SIZE * PAGE_SIZE) + flush->gva_list[gva_n] |= ~PAGE_MASK; + else if (end > cur) + flush->gva_list[gva_n] |= + (end - cur - 1) >> PAGE_SHIFT; + + cur += PAGE_SIZE * PAGE_SIZE; + ++gva_n; + + } while (cur < end); + + status = hv_do_rep_hypercall( + HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX, + gva_n, nr_bank + 2, flush, NULL); + } + + local_irq_restore(flags); + + if (!(status & 0xffff)) + return; +do_native: + native_flush_tlb_others(cpus, mm, start, end); +} + +void hyperv_setup_mmu_ops(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) { + pr_info("Hyper-V: Using hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others; + } else { + pr_info("Hyper-V: Using ext hypercall for remote TLB flush\n"); + pv_mmu_ops.flush_tlb_others = hyperv_flush_tlb_others_ex; + } +} + +void hyper_alloc_mmu(void) +{ + if (!(ms_hyperv.hints & HV_X64_REMOTE_TLB_FLUSH_RECOMMENDED)) + return; + + if (!(ms_hyperv.hints & HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED)) + pcpu_flush = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); + else + pcpu_flush_ex = __alloc_percpu(PAGE_SIZE, PAGE_SIZE); +} --- linux-azure-4.11.0.orig/arch/x86/include/asm/clocksource.h +++ linux-azure-4.11.0/arch/x86/include/asm/clocksource.h @@ -6,7 +6,8 @@ #define VCLOCK_NONE 0 /* No vDSO clock available. */ #define VCLOCK_TSC 1 /* vDSO should use vread_tsc. */ #define VCLOCK_PVCLOCK 2 /* vDSO should use vread_pvclock. */ -#define VCLOCK_MAX 2 +#define VCLOCK_HVCLOCK 3 /* vDSO should use vread_hvclock. */ +#define VCLOCK_MAX 3 struct arch_clocksource_data { int vclock_mode; --- linux-azure-4.11.0.orig/arch/x86/include/asm/elf.h +++ linux-azure-4.11.0/arch/x86/include/asm/elf.h @@ -245,12 +245,13 @@ #define CORE_DUMP_USE_REGSET #define ELF_EXEC_PAGESIZE 4096 -/* This is the location that an ET_DYN program is loaded if exec'ed. Typical - use of this is to invoke "./ld.so someprog" to test out a new version of - the loader. We need to make sure that it is out of the way of the program - that it will "exec", and that there is sufficient room for the brk. */ - -#define ELF_ET_DYN_BASE (TASK_SIZE / 3 * 2) +/* + * This is the base location for PIE (ET_DYN with INTERP) loads. On + * 64-bit, this is raised to 4GB to leave the entire 32-bit address + * space open for things that want to use the area for 32-bit pointers. + */ +#define ELF_ET_DYN_BASE (mmap_is_ia32() ? 0x000400000UL : \ + 0x100000000UL) /* This yields a mask that user programs can use to figure out what instruction set this CPU supports. This could be done in user space, --- linux-azure-4.11.0.orig/arch/x86/include/asm/kvm_emulate.h +++ linux-azure-4.11.0/arch/x86/include/asm/kvm_emulate.h @@ -221,6 +221,9 @@ void (*get_cpuid)(struct x86_emulate_ctxt *ctxt, u32 *eax, u32 *ebx, u32 *ecx, u32 *edx); void (*set_nmi_mask)(struct x86_emulate_ctxt *ctxt, bool masked); + + unsigned (*get_hflags)(struct x86_emulate_ctxt *ctxt); + void (*set_hflags)(struct x86_emulate_ctxt *ctxt, unsigned hflags); }; typedef u32 __attribute__((vector_size(16))) sse128_t; @@ -290,10 +293,10 @@ /* interruptibility state, as a result of execution of STI or MOV SS */ int interruptibility; - int emul_flags; bool perm_ok; /* do not check permissions if true */ bool ud; /* inject an #UD if host doesn't support insn */ + bool tf; /* TF value before instruction (after for syscall/sysret) */ bool have_exception; struct x86_exception exception; --- linux-azure-4.11.0.orig/arch/x86/include/asm/mce.h +++ linux-azure-4.11.0/arch/x86/include/asm/mce.h @@ -264,6 +264,7 @@ #endif int mce_available(struct cpuinfo_x86 *c); +bool mce_is_memory_error(struct mce *m); DECLARE_PER_CPU(unsigned, mce_exception_count); DECLARE_PER_CPU(unsigned, mce_poll_count); --- linux-azure-4.11.0.orig/arch/x86/include/asm/mshyperv.h +++ linux-azure-4.11.0/arch/x86/include/asm/mshyperv.h @@ -2,8 +2,8 @@ #define _ASM_X86_MSHYPER_H #include -#include -#include +#include +#include #include /* @@ -29,6 +29,8 @@ u32 features; u32 misc_features; u32 hints; + u32 max_vp_index; + u32 max_lp_index; }; extern struct ms_hyperv_info ms_hyperv; @@ -170,10 +172,199 @@ #if IS_ENABLED(CONFIG_HYPERV) extern struct clocksource *hyperv_cs; +extern void *hv_hypercall_pg; + +static inline u64 hv_do_hypercall(u64 control, void *input, void *output) +{ + u64 input_address = (input) ? virt_to_phys(input) : 0; + u64 output_address = (output) ? virt_to_phys(output) : 0; +#ifdef CONFIG_X86_64 + u64 hv_status; + + if (!hv_hypercall_pg) + return (u64)ULLONG_MAX; + + __asm__ __volatile__("mov %3, %%r8\n" + "call *%4" + : "=a" (hv_status), + "+c" (control), "+d" (input_address) + : "r" (output_address), "m" (hv_hypercall_pg) + : "cc", "memory", "r8", "r9", "r10", "r11"); + + return hv_status; + +#else + u32 control_hi = control >> 32; + u32 control_lo = control & 0xFFFFFFFF; + u32 input_address_hi = input_address >> 32; + u32 input_address_lo = input_address & 0xFFFFFFFF; + u32 output_address_hi = output_address >> 32; + u32 output_address_lo = output_address & 0xFFFFFFFF; + + if (!hv_hypercall_pg) + return (u64)ULLONG_MAX; + + __asm__ __volatile__("call *%6" + : "+a" (control_lo), "+d" (control_hi), + "+c" (input_address_lo) + : "b" (input_address_hi), + "D"(output_address_hi), "S"(output_address_lo), + "m" (hv_hypercall_pg) + : "cc", "memory"); + + return control_lo | ((u64)control_hi << 32); +#endif /* !x86_64 */ +} + +/* Fast hypercall with 8 bytes of input and no output */ +static inline u64 hv_do_fast_hypercall8(u16 code, u64 input1) +{ + union hv_hypercall_input control = {0}; + + control.code = code; + control.fast = 1; +#ifdef CONFIG_X86_64 + { + u64 hv_status; + + __asm__ __volatile__("call *%3" + : "=a" (hv_status), + "+c" (control.as_uint64), "+d" (input1) + : "m" (hv_hypercall_pg) + : "cc", "r8", "r9", "r10", "r11"); + return hv_status; + } +#else + { + u32 hv_status_hi, hv_status_lo; + u32 input1_hi = (u32)(input1 >> 32); + u32 input1_lo = (u32)input1; + + __asm__ __volatile__ ("call *%6" + : "=d"(hv_status_hi), + "=a"(hv_status_lo), + "+c"(input1_lo) + : "d" (control.as_uint32_hi), + "a" (control.as_uint32_lo), + "b" (input1_hi), + "m" (hv_hypercall_pg) + : "cc", "edi", "esi"); + + return hv_status_lo | ((u64)hv_status_hi << 32); + } +#endif +} + +/* + * Rep hypercalls. Callers of this functions are supposed to ensure that + * rep_count and vahead_size comply with union hv_hypercall_input definition. + */ +static inline u64 hv_do_rep_hypercall(u16 code, u16 rep_count, u16 varhead_size, + void *input, void *output) +{ + union hv_hypercall_input hc_input = { .code = code, + .varhead_size = varhead_size, + .rep_count = rep_count}; + u64 status; + + do { + status = hv_do_hypercall(hc_input.as_uint64, input, output); + if ((status & 0xffff) != HV_STATUS_SUCCESS) + return status; + + hc_input.rep_start = (status >> 32) & 0xfff; + + touch_nmi_watchdog(); + } while (hc_input.rep_start < hc_input.rep_count); + + return status; +} + +/* + * Hypervisor's notion of virtual processor ID is different from + * Linux' notion of CPU ID. This information can only be retrieved + * in the context of the calling CPU. Setup a map for easy access + * to this information. + */ +extern u32 __percpu *hv_vp_index; + +/** + * hv_cpu_number_to_vp_number() - Map CPU to VP. + * @cpu_number: CPU number in Linux terms + * + * This function returns the mapping between the Linux processor + * number and the hypervisor's virtual processor number, useful + * in making hypercalls and such that talk about specific + * processors. + * + * Return: Virtual processor number in Hyper-V terms + */ +static inline int hv_cpu_number_to_vp_number(int cpu_number) +{ + WARN_ON(hv_vp_index[cpu_number] == -1); + + return hv_vp_index[cpu_number]; +} void hyperv_init(void); +void hyperv_setup_mmu_ops(void); +void hyper_alloc_mmu(void); void hyperv_report_panic(struct pt_regs *regs); bool hv_is_hypercall_page_setup(void); void hyperv_cleanup(void); #endif +#ifdef CONFIG_HYPERV_TSCPAGE +struct ms_hyperv_tsc_page *hv_get_tsc_page(void); +static inline u64 hv_read_tsc_page(const struct ms_hyperv_tsc_page *tsc_pg) +{ + u64 scale, offset, cur_tsc; + u32 sequence; + + /* + * The protocol for reading Hyper-V TSC page is specified in Hypervisor + * Top-Level Functional Specification ver. 3.0 and above. To get the + * reference time we must do the following: + * - READ ReferenceTscSequence + * A special '0' value indicates the time source is unreliable and we + * need to use something else. The currently published specification + * versions (up to 4.0b) contain a mistake and wrongly claim '-1' + * instead of '0' as the special value, see commit c35b82ef0294. + * - ReferenceTime = + * ((RDTSC() * ReferenceTscScale) >> 64) + ReferenceTscOffset + * - READ ReferenceTscSequence again. In case its value has changed + * since our first reading we need to discard ReferenceTime and repeat + * the whole sequence as the hypervisor was updating the page in + * between. + */ + do { + sequence = READ_ONCE(tsc_pg->tsc_sequence); + if (!sequence) + return U64_MAX; + /* + * Make sure we read sequence before we read other values from + * TSC page. + */ + smp_rmb(); + + scale = READ_ONCE(tsc_pg->tsc_scale); + offset = READ_ONCE(tsc_pg->tsc_offset); + cur_tsc = rdtsc_ordered(); + + /* + * Make sure we read sequence after we read all other values + * from TSC page. + */ + smp_rmb(); + + } while (READ_ONCE(tsc_pg->tsc_sequence) != sequence); + + return mul_u64_u64_shr(cur_tsc, scale, 64) + offset; +} + +#else +static inline struct ms_hyperv_tsc_page *hv_get_tsc_page(void) +{ + return NULL; +} +#endif #endif --- linux-azure-4.11.0.orig/arch/x86/include/asm/msr-index.h +++ linux-azure-4.11.0/arch/x86/include/asm/msr-index.h @@ -417,6 +417,8 @@ #define MSR_IA32_TSC_ADJUST 0x0000003b #define MSR_IA32_BNDCFGS 0x00000d90 +#define MSR_IA32_BNDCFGS_RSVD 0x00000ffc + #define MSR_IA32_XSS 0x00000da0 #define FEATURE_CONTROL_LOCKED (1<<0) --- linux-azure-4.11.0.orig/arch/x86/include/asm/pat.h +++ linux-azure-4.11.0/arch/x86/include/asm/pat.h @@ -7,6 +7,7 @@ bool pat_enabled(void); void pat_disable(const char *reason); extern void pat_init(void); +extern void init_cache_modes(void); extern int reserve_memtype(u64 start, u64 end, enum page_cache_mode req_pcm, enum page_cache_mode *ret_pcm); --- linux-azure-4.11.0.orig/arch/x86/include/asm/pmem.h +++ linux-azure-4.11.0/arch/x86/include/asm/pmem.h @@ -103,7 +103,7 @@ if (bytes < 8) { if (!IS_ALIGNED(dest, 4) || (bytes != 4)) - arch_wb_cache_pmem(addr, 1); + arch_wb_cache_pmem(addr, bytes); } else { if (!IS_ALIGNED(dest, 8)) { dest = ALIGN(dest, boot_cpu_data.x86_clflush_size); --- linux-azure-4.11.0.orig/arch/x86/include/asm/trace/hyperv.h +++ linux-azure-4.11.0/arch/x86/include/asm/trace/hyperv.h @@ -0,0 +1,34 @@ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM hyperv + +#if !defined(_TRACE_HYPERV_H) || defined(TRACE_HEADER_MULTI_READ) +#define _TRACE_HYPERV_H + +#include + +#if IS_ENABLED(CONFIG_HYPERV) + +TRACE_EVENT(hyperv_mmu_flush_tlb_others, + TP_PROTO(const struct cpumask *cpus, struct mm_struct *mm, + unsigned long addr, unsigned long end), + TP_ARGS(cpus, mm, addr, end), + TP_STRUCT__entry( + __field(unsigned int, ncpus) + __field(struct mm_struct *, mm) + __field(unsigned long, addr) + __field(unsigned long, end) + ), + TP_fast_assign(__entry->ncpus = cpumask_weight(cpus); + __entry->mm = mm; + __entry->addr = addr, + __entry->end = end), + TP_printk("ncpus %d mm %p addr %lx, end %lx", + __entry->ncpus, __entry->mm, __entry->addr, __entry->end) + ); + +#endif /* CONFIG_HYPERV */ + +#endif /* _TRACE_HYPERV_H */ + +/* This part must be outside protection */ +#include --- linux-azure-4.11.0.orig/arch/x86/include/asm/uaccess.h +++ linux-azure-4.11.0/arch/x86/include/asm/uaccess.h @@ -324,10 +324,10 @@ #define __get_user_asm_u64(x, ptr, retval, errret) \ ({ \ __typeof__(ptr) __ptr = (ptr); \ - asm volatile(ASM_STAC "\n" \ + asm volatile("\n" \ "1: movl %2,%%eax\n" \ "2: movl %3,%%edx\n" \ - "3: " ASM_CLAC "\n" \ + "3:\n" \ ".section .fixup,\"ax\"\n" \ "4: mov %4,%0\n" \ " xorl %%eax,%%eax\n" \ @@ -336,7 +336,7 @@ ".previous\n" \ _ASM_EXTABLE(1b, 4b) \ _ASM_EXTABLE(2b, 4b) \ - : "=r" (retval), "=A"(x) \ + : "=r" (retval), "=&A"(x) \ : "m" (__m(__ptr)), "m" __m(((u32 *)(__ptr)) + 1), \ "i" (errret), "0" (retval)); \ }) --- linux-azure-4.11.0.orig/arch/x86/include/asm/vdso.h +++ linux-azure-4.11.0/arch/x86/include/asm/vdso.h @@ -20,6 +20,7 @@ long sym_vvar_page; long sym_hpet_page; long sym_pvclock_page; + long sym_hvclock_page; long sym_VDSO32_NOTE_MASK; long sym___kernel_sigreturn; long sym___kernel_rt_sigreturn; --- linux-azure-4.11.0.orig/arch/x86/include/asm/xen/events.h +++ linux-azure-4.11.0/arch/x86/include/asm/xen/events.h @@ -20,4 +20,15 @@ /* No need for a barrier -- XCHG is a barrier on x86. */ #define xchg_xen_ulong(ptr, val) xchg((ptr), (val)) +extern int xen_have_vector_callback; + +/* + * Events delivered via platform PCI interrupts are always + * routed to vcpu 0 and hence cannot be rebound. + */ +static inline bool xen_support_evtchn_rebind(void) +{ + return (!xen_hvm_domain() || xen_have_vector_callback); +} + #endif /* _ASM_X86_XEN_EVENTS_H */ --- linux-azure-4.11.0.orig/arch/x86/include/uapi/asm/hyperv.h +++ linux-azure-4.11.0/arch/x86/include/uapi/asm/hyperv.h @@ -34,16 +34,10 @@ #define HV_X64_MSR_REFERENCE_TSC 0x40000021 /* - * There is a single feature flag that signifies the presence of the MSR - * that can be used to retrieve both the local APIC Timer frequency as - * well as the TSC frequency. + * There is a single feature flag that signifies if the partition has access + * to MSRs with local APIC and TSC frequencies. */ - -/* Local APIC timer frequency MSR (HV_X64_MSR_APIC_FREQUENCY) is available */ -#define HV_X64_MSR_APIC_FREQUENCY_AVAILABLE (1 << 11) - -/* TSC frequency MSR (HV_X64_MSR_TSC_FREQUENCY) is available */ -#define HV_X64_MSR_TSC_FREQUENCY_AVAILABLE (1 << 11) +#define HV_X64_ACCESS_FREQUENCY_MSRS (1 << 11) /* * Basic SynIC MSRs (HV_X64_MSR_SCONTROL through HV_X64_MSR_EOM @@ -73,6 +67,9 @@ */ #define HV_X64_MSR_STAT_PAGES_AVAILABLE (1 << 8) +/* Frequency MSRs available */ +#define HV_FEATURE_FREQUENCY_MSRS_AVAILABLE (1 << 8) + /* Crash MSR available */ #define HV_FEATURE_GUEST_CRASH_MSR_AVAILABLE (1 << 10) @@ -124,7 +121,7 @@ * Recommend using hypercall for address space switches rather * than MOV to CR3 instruction */ -#define HV_X64_MWAIT_RECOMMENDED (1 << 0) +#define HV_X64_AS_SWITCH_RECOMMENDED (1 << 0) /* Recommend using hypercall for local TLB flushes rather * than INVLPG or MOV to CR3 instructions */ #define HV_X64_LOCAL_TLB_FLUSH_RECOMMENDED (1 << 1) @@ -148,6 +145,20 @@ #define HV_X64_RELAXED_TIMING_RECOMMENDED (1 << 5) /* + * Virtual APIC support + */ +#define HV_X64_DEPRECATING_AEOI_RECOMMENDED (1 << 9) + +/* Recommend using the newer ExProcessorMasks interface */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + +/* + * HV_VP_SET available + */ +#define HV_X64_EX_PROCESSOR_MASKS_RECOMMENDED (1 << 11) + + +/* * Crash notification flag. */ #define HV_CRASH_CTL_CRASH_NOTIFY (1ULL << 63) @@ -234,7 +245,11 @@ (~((1ull << HV_X64_MSR_HYPERCALL_PAGE_ADDRESS_SHIFT) - 1)) /* Declare the various hypercall operations. */ +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE 0x0002 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST 0x0003 #define HVCALL_NOTIFY_LONG_SPIN_WAIT 0x0008 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_SPACE_EX 0x0013 +#define HVCALL_FLUSH_VIRTUAL_ADDRESS_LIST_EX 0x0014 #define HVCALL_POST_MESSAGE 0x005c #define HVCALL_SIGNAL_EVENT 0x005d @@ -251,6 +266,35 @@ #define HV_PROCESSOR_POWER_STATE_C2 2 #define HV_PROCESSOR_POWER_STATE_C3 3 +#define HV_FLUSH_ALL_PROCESSORS 0x00000001 +#define HV_FLUSH_ALL_VIRTUAL_ADDRESS_SPACES 0x00000002 +#define HV_FLUSH_NON_GLOBAL_MAPPINGS_ONLY 0x00000004 +#define HV_FLUSH_USE_EXTENDED_RANGE_FORMAT 0x00000008 + +enum HV_GENERIC_SET_FORMAT { + HV_GENERIC_SET_SPARCE_4K, + HV_GENERIC_SET_ALL, +}; + +/* Hypercall interface */ +union hv_hypercall_input { + u64 as_uint64; + struct { + __u32 as_uint32_lo; + __u32 as_uint32_hi; + }; + struct { + __u64 code:16; + __u64 fast:1; + __u64 varhead_size:10; + __u64 reserved1:5; + __u64 rep_count:12; + __u64 reserved2:4; + __u64 rep_start:12; + __u64 reserved3:4; + }; +}; + /* hypercall status code */ #define HV_STATUS_SUCCESS 0 #define HV_STATUS_INVALID_HYPERCALL_CODE 2 --- linux-azure-4.11.0.orig/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c +++ linux-azure-4.11.0/arch/x86/kernel/cpu/intel_rdt_rdtgroup.c @@ -767,11 +767,13 @@ dentry = kernfs_mount(fs_type, flags, rdt_root, RDTGROUP_SUPER_MAGIC, NULL); if (IS_ERR(dentry)) - goto out_cdp; + goto out_destroy; static_branch_enable(&rdt_enable_key); goto out; +out_destroy: + kernfs_remove(kn_info); out_cdp: cdp_disable(); out: --- linux-azure-4.11.0.orig/arch/x86/kernel/cpu/mcheck/mce.c +++ linux-azure-4.11.0/arch/x86/kernel/cpu/mcheck/mce.c @@ -643,16 +643,14 @@ } } -static bool memory_error(struct mce *m) +bool mce_is_memory_error(struct mce *m) { - struct cpuinfo_x86 *c = &boot_cpu_data; - - if (c->x86_vendor == X86_VENDOR_AMD) { + if (m->cpuvendor == X86_VENDOR_AMD) { /* ErrCodeExt[20:16] */ u8 xec = (m->status >> 16) & 0x1f; return (xec == 0x0 || xec == 0x8); - } else if (c->x86_vendor == X86_VENDOR_INTEL) { + } else if (m->cpuvendor == X86_VENDOR_INTEL) { /* * Intel SDM Volume 3B - 15.9.2 Compound Error Codes * @@ -673,6 +671,7 @@ return false; } +EXPORT_SYMBOL_GPL(mce_is_memory_error); DEFINE_PER_CPU(unsigned, mce_poll_count); @@ -734,7 +733,7 @@ severity = mce_severity(&m, mca_cfg.tolerant, NULL, false); - if (severity == MCE_DEFERRED_SEVERITY && memory_error(&m)) + if (severity == MCE_DEFERRED_SEVERITY && mce_is_memory_error(&m)) if (m.status & MCI_STATUS_ADDRV) m.severity = severity; --- linux-azure-4.11.0.orig/arch/x86/kernel/cpu/microcode/intel.c +++ linux-azure-4.11.0/arch/x86/kernel/cpu/microcode/intel.c @@ -619,6 +619,9 @@ show_saved_mc(); + /* initrd is going away, clear patch ptr. */ + intel_ucode_patch = NULL; + return 0; } --- linux-azure-4.11.0.orig/arch/x86/kernel/cpu/mshyperv.c +++ linux-azure-4.11.0/arch/x86/kernel/cpu/mshyperv.c @@ -49,6 +49,9 @@ if (vmbus_handler) vmbus_handler(); + if (ms_hyperv.hints & HV_X64_DEPRECATING_AEOI_RECOMMENDED) + ack_APIC_irq(); + exiting_irq(); set_irq_regs(old_regs); } @@ -158,6 +161,15 @@ } #endif +static unsigned long hv_get_tsc_khz(void) +{ + unsigned long freq; + + rdmsrl(HV_X64_MSR_TSC_FREQUENCY, freq); + + return freq / 1000; +} + static void __init ms_hyperv_init_platform(void) { int hv_host_info_eax; @@ -172,9 +184,15 @@ ms_hyperv.misc_features = cpuid_edx(HYPERV_CPUID_FEATURES); ms_hyperv.hints = cpuid_eax(HYPERV_CPUID_ENLIGHTMENT_INFO); - pr_info("HyperV: features 0x%x, hints 0x%x\n", + pr_info("Hyper-V: features 0x%x, hints 0x%x\n", ms_hyperv.features, ms_hyperv.hints); + ms_hyperv.max_vp_index = cpuid_eax(HVCPUID_IMPLEMENTATION_LIMITS); + ms_hyperv.max_lp_index = cpuid_ebx(HVCPUID_IMPLEMENTATION_LIMITS); + + pr_debug("Hyper-V: max %u virtual processors, %u logical processors\n", + ms_hyperv.max_vp_index, ms_hyperv.max_lp_index); + /* * Extract host information. */ @@ -190,8 +208,15 @@ hv_host_info_edx >> 24, hv_host_info_edx & 0xFFFFFF); } + if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { + x86_platform.calibrate_tsc = hv_get_tsc_khz; + x86_platform.calibrate_cpu = hv_get_tsc_khz; + } + #ifdef CONFIG_X86_LOCAL_APIC - if (ms_hyperv.features & HV_X64_MSR_APIC_FREQUENCY_AVAILABLE) { + if (ms_hyperv.features & HV_X64_ACCESS_FREQUENCY_MSRS && + ms_hyperv.misc_features & HV_FEATURE_FREQUENCY_MSRS_AVAILABLE) { /* * Get the APIC frequency. */ @@ -200,7 +225,7 @@ rdmsrl(HV_X64_MSR_APIC_FREQUENCY, hv_lapic_frequency); hv_lapic_frequency = div_u64(hv_lapic_frequency, HZ); lapic_timer_frequency = hv_lapic_frequency; - pr_info("HyperV: LAPIC Timer Frequency: %#x\n", + pr_info("Hyper-V: LAPIC Timer Frequency: %#x\n", lapic_timer_frequency); } @@ -230,11 +255,12 @@ * Setup the hook to get control post apic initialization. */ x86_platform.apic_post_init = hyperv_init; + hyperv_setup_mmu_ops(); #endif } const __refconst struct hypervisor_x86 x86_hyper_ms_hyperv = { - .name = "Microsoft HyperV", + .name = "Microsoft Hyper-V", .detect = ms_hyperv_platform, .init_platform = ms_hyperv_init_platform, }; --- linux-azure-4.11.0.orig/arch/x86/kernel/fpu/init.c +++ linux-azure-4.11.0/arch/x86/kernel/fpu/init.c @@ -90,6 +90,7 @@ * Boot time FPU feature detection code: */ unsigned int mxcsr_feature_mask __read_mostly = 0xffffffffu; +EXPORT_SYMBOL_GPL(mxcsr_feature_mask); static void __init fpu__init_system_mxcsr(void) { --- linux-azure-4.11.0.orig/arch/x86/kernel/ioport.c +++ linux-azure-4.11.0/arch/x86/kernel/ioport.c @@ -30,7 +30,7 @@ if ((from + num <= from) || (from + num > IO_BITMAP_BITS)) return -EINVAL; - if (turn_on && !capable(CAP_SYS_RAWIO)) + if (turn_on && (!capable(CAP_SYS_RAWIO) || kernel_is_locked_down())) return -EPERM; /* @@ -120,7 +120,7 @@ return -EINVAL; /* Trying to gain more privileges? */ if (level > old) { - if (!capable(CAP_SYS_RAWIO)) + if (!capable(CAP_SYS_RAWIO) || kernel_is_locked_down()) return -EPERM; } regs->flags = (regs->flags & ~X86_EFLAGS_IOPL) | --- linux-azure-4.11.0.orig/arch/x86/kernel/kexec-bzimage64.c +++ linux-azure-4.11.0/arch/x86/kernel/kexec-bzimage64.c @@ -179,6 +179,7 @@ if (efi_enabled(EFI_OLD_MEMMAP)) return 0; + params->secure_boot = boot_params.secure_boot; ei->efi_loader_signature = current_ei->efi_loader_signature; ei->efi_systab = current_ei->efi_systab; ei->efi_systab_hi = current_ei->efi_systab_hi; --- linux-azure-4.11.0.orig/arch/x86/kernel/kvm.c +++ linux-azure-4.11.0/arch/x86/kernel/kvm.c @@ -161,8 +161,8 @@ */ rcu_irq_exit(); native_safe_halt(); - rcu_irq_enter(); local_irq_disable(); + rcu_irq_enter(); } } if (!n.halted) --- linux-azure-4.11.0.orig/arch/x86/kernel/msr.c +++ linux-azure-4.11.0/arch/x86/kernel/msr.c @@ -84,6 +84,9 @@ int err = 0; ssize_t bytes = 0; + if (kernel_is_locked_down()) + return -EPERM; + if (count % 8) return -EINVAL; /* Invalid chunk size */ @@ -131,6 +134,10 @@ err = -EBADF; break; } + if (kernel_is_locked_down()) { + err = -EPERM; + break; + } if (copy_from_user(®s, uregs, sizeof regs)) { err = -EFAULT; break; --- linux-azure-4.11.0.orig/arch/x86/kernel/reboot.c +++ linux-azure-4.11.0/arch/x86/kernel/reboot.c @@ -452,7 +452,46 @@ DMI_MATCH(DMI_PRODUCT_NAME, "VGN-Z540N"), }, }, - + { /* Handle problems with rebooting on the Latitude E6520. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6520", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6520"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 790. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 790", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 790"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 990. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 990", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 990"), + }, + }, + { /* Handle problems with rebooting on the Latitude E6220. */ + .callback = set_pci_reboot, + .ident = "Dell Latitude E6220", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "Latitude E6220"), + }, + }, + { /* Handle problems with rebooting on the OptiPlex 390. */ + .callback = set_pci_reboot, + .ident = "Dell OptiPlex 390", + .matches = { + DMI_MATCH(DMI_SYS_VENDOR, "Dell Inc."), + DMI_MATCH(DMI_PRODUCT_NAME, "OptiPlex 390"), + }, + }, { } }; --- linux-azure-4.11.0.orig/arch/x86/kernel/setup.c +++ linux-azure-4.11.0/arch/x86/kernel/setup.c @@ -69,6 +69,12 @@ #include #include #include +#include + +#include +#include +#include +#include #include